<?php
/* 

This sourcecode is licensed under the GPL v2.0.

The actual parser (readfromString()) was written by Serge, and slightly modified by Nico.  The rest of the code is by Nico.


http://bioinformatics.org/biophp - a "BioPHP" project
*/

/**
 * Record-by-record parser for Spidey (mRNA-to-genomic alignment) text output.
 *
 * A source may be an open file handle, a filename, an array of lines, or a
 * string holding the raw output.  Each result block in the input starts with
 * a "--SPIDEY version x.y--" header line; fetchNext() returns one parsed
 * result per call and false once the input is exhausted.
 *
 * Typical use:
 *
 *     $parser = new parse_spidey($filename_or_data);
 *     while ($record = $parser->fetchNext()) {
 *         // $record['exons'], $parser->mrna_features, ...
 *     }
 */
class parse_spidey
{
    /** Text found in the header line that opens every Spidey result. */
    const HEADER_MARKER = '--SPIDEY version';

    /** PCRE equivalent of the POSIX pattern historically used for "Exon N: ..." lines. */
    const EXON_PATTERN = '/^Exon (\d+)\(?-?\)?: (\d+)-(\d+) \(gen\) +(\d+)-(\d+) \(mRNA\) +id (\d+\.\d+%) mismatches (\d+) gaps (\d+) +splice site \(d  a\): +(.+)/';

    public $source;                     // open file handle (resource) when reading from a file, else null
    public $source_lines = array();     // lines to parse when the source is a string or an array
    public $source_fh_opened = false;   // true when the handle in $source was opened (and must be closed) here
    public $eoRecords = false;          // true once the input has been read to its end
    public $mrna_features;              // features of the most recently parsed record, see seqRnaFeatures()

    protected $source_pos = 0;          // index of the next unread entry of $source_lines

    //################class constructor###################

    /**
     * @param mixed $source optional file handle, filename, array of lines or
     *                      data string; see setSource().  Empty values are ignored.
     */
    function __construct($source = "")
    {
        if ($source !== null && $source !== false && $source !== "") {
            $this->setSource($source);
        }
    }

    /**
     * PHP 4 style constructor name, kept so that code calling it explicitly
     * (e.g. parent::parse_spidey($source) in a subclass) keeps working.
     * PHP 8 no longer treats this method as a constructor, hence __construct().
     *
     * @param mixed $source see __construct()
     */
    function parse_spidey($source = "")
    {
        $this->__construct($source);
    }

    //###############Get/Set functions##################

    /**
     * Return the next record.
     *
     * @return array|false the parsed record (see readfromString() for its
     *                     keys), or false when there are no more records
     */
    function fetchNext()
    {
        if ($this->eoRecords) {
            return false;
        }
        return $this->readRecord();
    }

    /**
     * Declare or change the source data.  All reading state is reset, so the
     * parser can be reused for a second source.
     *
     * @param mixed $source one of:
     *                      - a file handle (resource): read as is, never closed here;
     *                      - an array: taken as already-split lines;
     *                      - the name of an existing file: opened here and closed
     *                        when its end is reached or the source is replaced;
     *                      - anything else: treated as a string containing the data.
     */
    function setSource($source)
    {
        // Drop whatever a previous source left behind; without this a parser
        // that had read a file would keep reading the old handle.
        $this->closeOwnedHandle();
        $this->source       = null;
        $this->source_lines = array();
        $this->source_pos   = 0;
        $this->eoRecords    = false;

        if (is_resource($source)) {
            $this->source = $source;
        } elseif (is_array($source)) {
            // re-index so the lines can be walked with a plain integer cursor
            $this->source_lines = array_values($source);
        } elseif (is_string($source)
            && strpos($source, "\0") === false
            // "@": probing a long data string as if it were a path may emit a warning
            && @file_exists($source)
        ) {
            $handle = fopen($source, "r");
            if ($handle) {
                $this->source           = $handle;
                $this->source_fh_opened = true;
            }
        } else {
            // assume source is a string containing data
            $this->source_lines = preg_split('/\r\n|\r|\n/', (string) $source);
        }
    }

    //################"Internal" functions#############

    /**
     * Collect the lines of the next result from the current source (file
     * handle or line array) and parse them.
     *
     * A header line is skipped while no content has been collected yet (it is
     * then the header of the record being read).  A header line met after
     * content marks the start of the following record; it is consumed here,
     * which is harmless because header lines carry no parsed data.
     *
     * @return array|false the parsed record, or false if only blank/header
     *                     lines (or nothing at all) were left in the input
     */
    function readRecord()
    {
        $lines       = array();
        $has_content = false;
        $exhausted   = true;

        while (($line = $this->nextLine()) !== false) {
            if ($this->isHeaderLine($line)) {
                if (!$has_content) {
                    continue;
                }
                $exhausted = false;
                break;
            }
            if (trim($line) !== '') {
                $has_content = true;
            }
            $lines[] = $line;
        }

        if ($exhausted) {
            $this->eoRecords = true;
            $this->closeOwnedHandle();
        }
        if (!$has_content) {
            return false;
        }
        return $this->readfromString($lines);
    }

    /**
     * Tell whether $line starts a further result, i.e. ends the current one.
     *
     * @param string $line     the line to test
     * @param int    $line_cpt 1-based position of $line within the lines read
     *                         for the current record; the header on position 1
     *                         belongs to the current record itself
     * @return bool
     */
    function isEor($line, $line_cpt)
    {
        return $line_cpt > 1 && $this->isHeaderLine($line);
    }

    /**
     * Read and parse the next record.  Kept for backward compatibility;
     * readRecord() now handles file handles and line arrays alike.
     *
     * @return array|false see readRecord()
     */
    function readfromFile()
    {
        return $this->readRecord();
    }

    /**
     * Parse the lines of ONE Spidey result into a record and refresh
     * $this->mrna_features from it.
     *
     * Trailing whitespace (including line ends) is stripped from every line
     * before parsing, so file and string sources yield identical values.
     *
     * @param array $sourcelines lines of a single result (passed by reference
     *                           for historical reasons; it is not modified)
     * @return array with keys genomic, mrna, strand, nbexons, exons (per-exon
     *               details keyed by exon number), exons_location (list of
     *               "start-end" genomic ranges), nbsplicesites, mrnacoverage,
     *               overallpercentid, missingmrnaends, polyatailslength.
     *               Scalar entries are strings, or null when the line is absent.
     */
    function readfromString(&$sourcelines)
    {
        // line label => (record key, offset at which the value starts)
        $fields = array(
            'Genomic'                           => array('genomic', 9),
            // NOTE(review): the value offset is 10 although "mRNA: " is only 6
            // characters long, so the first 4 characters of the description
            // are dropped (presumably an "lcl|" prefix -- confirm).  Kept as is.
            'mRNA'                              => array('mrna', 10),
            'Strand'                            => array('strand', 8),
            'Number of exons'                   => array('nbexons', 17),
            'Number of splice sites'            => array('nbsplicesites', 24),
            'mRNA coverage'                     => array('mrnacoverage', 15),
            'overall percent identity'          => array('overallpercentid', 26),
            'Missing mRNA ends'                 => array('missingmrnaends', 19),
            'Non-aligning poly(A)+ tail length' => array('polyatailslength', 35),
        );

        $record = array(
            'genomic'          => null,
            'mrna'             => null,
            'strand'           => null,
            'nbexons'          => null,
            'exons'            => array(),
            'exons_location'   => array(),
            'nbsplicesites'    => null,
            'mrnacoverage'     => null,
            'overallpercentid' => null,
            'missingmrnaends'  => null,
            'polyatailslength' => null,
        );

        $lines = is_array($sourcelines) ? $sourcelines : array();

        foreach ($lines as $raw) {
            $linestr = rtrim((string) $raw);

            if (preg_match(self::EXON_PATTERN, $linestr, $infos)) {
                $record['exons'][$infos[1]] = array(
                    'genomic_start'              => $infos[2],
                    'genomic_end'                => $infos[3],
                    'mrna_start'                 => $infos[4],
                    'mrna_end'                   => $infos[5],
                    'id'                         => $infos[6],
                    'mismatch'                   => $infos[7],
                    'gaps'                       => $infos[8],
                    'splice_site_donor_acceptor' => $infos[9],
                );
                $record['exons_location'][] = $infos[2] . "-" . $infos[3];
                continue;
            }

            foreach ($fields as $label => $spec) {
                // The colon is part of the test: matching the bare label let the
                // "mRNA coverage: ..." line overwrite the "mRNA: ..." description.
                $prefix = $label . ':';
                if (strncmp($linestr, $prefix, strlen($prefix)) === 0) {
                    // the cast turns substr()'s false (line shorter than the offset) into ""
                    $record[$spec[0]] = (string) substr($linestr, $spec[1]);
                    break;
                }
            }
        }

        // build the mRNA feature summary for this record
        $this->seqRnaFeatures($record);

        return $record;
    }

    /**
     * Derive $this->mrna_features from a parsed record:
     *   'acc'      identifier captured by the LAST pattern below that matched
     *              the mRNA description, or null when none matched;
     *   'location' the record's exons_location list;
     *   'strand'   the record's strand;
     *   'db_xref'  list of "Database=identifier" strings, one per matching pattern.
     *
     * @param array $record a record as returned by readfromString()
     */
    function seqRnaFeatures($record)
    {
        $mrna = isset($record['mrna']) ? (string) $record['mrna'] : '';

        // pattern => database label, tried in this order
        $patterns = array(
            array('/gi\|(\d+)\|/', 'GI'),
            array('/ref\|([[:alpha:]]{2}_\d+\.\d+)\|/', 'Refseq'),
            array('/(NM_\d+) /', 'Refseq'),
            array('/(ENST\d+) /', 'Ensembl'),
            array('/(.+_at)/', 'Affymetrix'),
        );

        $db_xref = array();
        $acc     = null;
        foreach ($patterns as $pattern) {
            if (preg_match($pattern[0], $mrna, $tab)) {
                $db_xref[] = $pattern[1] . '=' . $tab[1];
                $acc       = $tab[1];
            }
        }

        $this->mrna_features = array(
            'acc'      => $acc,
            'location' => isset($record['exons_location']) ? $record['exons_location'] : array(),
            'strand'   => isset($record['strand']) ? $record['strand'] : null,
            'db_xref'  => $db_xref,
        );
    }

    /** True when $line contains the marker that opens a Spidey result. */
    private function isHeaderLine($line)
    {
        return is_string($line) && strpos($line, self::HEADER_MARKER) !== false;
    }

    /**
     * Next raw line of the current source, or false when there is none left.
     * Reads from the file handle if one is set, else from $source_lines.
     */
    private function nextLine()
    {
        if (is_resource($this->source)) {
            return fgets($this->source);
        }
        if (array_key_exists($this->source_pos, $this->source_lines)) {
            return (string) $this->source_lines[$this->source_pos++];
        }
        return false;
    }

    /** Close the file handle if (and only if) it was opened by setSource(). */
    private function closeOwnedHandle()
    {
        if ($this->source_fh_opened && is_resource($this->source)) {
            fclose($this->source);
            $this->source = null;
        }
        $this->source_fh_opened = false;
    }

}
?>