setSource($source); } }

    //############### Get/Set functions ##################

    /**
     * Return the next Spidey record.
     *
     * @return array|false the parsed record (see readfromString() for its keys),
     *                     or false once the end-of-records flag has been set
     */
    function fetchNext()
    {
        if ($this->eoRecords) {
            return false;
        }
        return $this->readRecord();
    }

    /**
     * Declare or change the source data.
     *
     * Accepted inputs, tested in this order:
     *  - an open stream resource: read line by line;
     *  - an array: assumed to be an already-split list of lines;
     *  - the name of an existing file: opened here for reading (and closed by
     *    this class once it has been read to the end);
     *  - anything else: treated as a string of Spidey output and split on
     *    CR / LF characters.
     *
     * @param resource|array|string $source
     */
    function setSource($source)
    {
        // When the source is being changed, release a handle that an earlier
        // call opened itself; handles supplied by the caller are left alone.
        if (!empty($this->source_fh_opened) && isset($this->source) && is_resource($this->source)) {
            fclose($this->source);
        }

        // Start from a clean state so that nothing from a previous source
        // (handle, lines, end-of-records flag) leaks into the new one.
        $this->source = null;
        $this->source_lines = array();
        $this->source_fh_opened = false;
        $this->eoRecords = false;

        if (is_resource($source)) {
            $this->source = $source;
        } elseif (is_array($source)) {
            $this->source_lines = $source;
        } elseif (@file_exists($source)) {
            // '@' kept from the original: $source may be a large block of data
            // rather than a path, and the probe must stay silent in that case.
            $handle = fopen($source, "r");
            if ($handle) {
                $this->source = $handle;
                $this->source_fh_opened = true;
            }
        } else {
            $this->source_lines = preg_split("/[\r\n]/", (string) $source);
        }
    }

    //################ "Internal" functions #############

    /**
     * Dispatch to the stream reader or the in-memory parser.
     *
     * @return array parsed record
     */
    function readRecord()
    {
        if (is_resource($this->source)) {
            return $this->readfromFile();
        }

        // In-memory input is parsed in one pass as a single record, so there is
        // nothing left afterwards. Without this flag fetchNext() would never
        // return false for string/array sources.
        $record = $this->readfromString($this->source_lines);
        $this->eoRecords = true;
        return $record;
    }

    /**
     * Test whether a line is the banner of another Spidey result
     * ("--SPIDEY version 1.35--").
     *
     * The banner is the FIRST line of a result, so it only marks the end of the
     * current record when it is not the first line read for that record; hence
     * the line-counter check (see readfromFile()).
     *
     * @param string|false|null $line     last line read
     * @param int               $line_cpt number of lines read so far for the current record
     * @return bool
     */
    function isEor($line, $line_cpt)
    {
        return $line_cpt > 1 && strpos((string) $line, "--SPIDEY version") !== false;
    }

    /**
     * Read lines from the stream up to the next result banner or the end of the
     * file, and parse them with readfromString().
     *
     * The banner that terminates a record has already been consumed when it is
     * detected; it stays in the current record's lines, where no parsing rule
     * matches it.
     *
     * @return array parsed record
     */
    function readfromFile()
    {
        $lines = array();
        $line = '';
        $line_cpt = 0;
        $read_failed = false;

        while (!feof($this->source) && !$this->isEor($line, $line_cpt)) {
            $line = fgets($this->source, 2048);
            if ($line === false) {
                // Nothing more can be read (EOF hit by this call, or a read
                // error); stop instead of storing false or looping forever.
                $read_failed = true;
                break;
            }
            $line_cpt++;
            $lines[] = $line;
        }

        if ($read_failed || feof($this->source)) {
            $this->eoRecords = true;
            if ($this->source_fh_opened) {
                fclose($this->source);
            }
        }

        return $this->readfromString($lines);
    }

    /*
    NOTE(review): the comment and the commented-out function below describe a
    Swissprot parser and look like a leftover from another parser class; nothing
    in this section calls process_ft().

    parse_swissprot() parses the Feature Table lines (those that begin with FT)
    in a Swissprot data file, extracts the feature key name, from endpoint, to
    endpoint, and description, and stores them in a (simple) array.

    process_ft() then pushes this array into a larger associative array, called
    $swiss, which is also an attribute of the Seq object. It is assigned a key
    of the form: FT_. Examples are: FT_PEPTIDE, FT_DISULFID.

    function process_ft(&$swiss, $ft_r)
    {
        foreach($ft_r as $element)
        {
            $index = "FT_" . $element[0];
            array_shift($element);
            if (count($swiss[$index]) == 0)
            {
                $swiss[$index] = array();
                array_push($swiss[$index], $element);
            }
            else array_push($swiss[$index], $element);
        }
    }
    */

    /**
     * Parse the lines of one Spidey result.
     *
     * Returned keys: genomic, mrna, strand, nbexons, exons, exons_location,
     * nbsplicesites, mrnacoverage, overallpercentid, missingmrnaends,
     * polyatailslength. Scalar fields are null when the corresponding line is
     * absent. Apart from 'strand' (right-trimmed), values keep whatever follows
     * the label on the line, including a trailing newline when present.
     *
     * 'exons' is keyed by exon number; 'exons_location' is a list of
     * "genomicStart-genomicEnd" strings in order of appearance.
     *
     * As a side effect, $this->mrna_features is refreshed via seqRnaFeatures().
     *
     * @param array $sourcelines lines to parse (taken by reference for backward
     *                           compatibility; the array is not modified)
     * @return array
     */
    function readfromString(&$sourcelines)
    {
        // record key => array(line prefix, offset at which the value starts, right-trim?)
        // Only the first matching rule is applied to a line, and "mRNA coverage"
        // is listed before "mRNA": the short prefix also matches the coverage
        // line and previously overwrote the mRNA description with part of it.
        // NOTE(review): the mRNA offset (10) is larger than strlen("mRNA: ");
        // kept as in the original, confirm against real Spidey output.
        $rules = array(
            'genomic'          => array("Genomic", 9, false),
            'mrnacoverage'     => array("mRNA coverage", 15, false),
            'mrna'             => array("mRNA", 10, false),
            'strand'           => array("Strand", 8, true),
            'nbexons'          => array("Number of exons", 17, false),
            'nbsplicesites'    => array("Number of splice sites", 24, false),
            'overallpercentid' => array("overall percent identity", 26, false),
            'missingmrnaends'  => array("Missing mRNA ends", 19, false),
            'polyatailslength' => array("Non-aligning poly(A)+ tail length", 35, false),
        );

        $record = array(
            'genomic'          => null,
            'mrna'             => null,
            'strand'           => null,
            'nbexons'          => null,
            'exons'            => array(),
            'exons_location'   => array(),
            'nbsplicesites'    => null,
            'mrnacoverage'     => null,
            'overallpercentid' => null,
            'missingmrnaends'  => null,
            'polyatailslength' => null,
        );

        // PCRE translation of the original POSIX pattern. The 's' modifier lets
        // '.' match a trailing newline, as it did with ereg().
        $exon_pattern = '/^Exon ([[:digit:]]+)\(?-?\)?: ([[:digit:]]+)-([[:digit:]]+) \(gen\) +'
            . '([[:digit:]]+)-([[:digit:]]+) \(mRNA\) +id ([[:digit:]]+\.[[:digit:]]+%) '
            . 'mismatches ([[:digit:]]+) gaps ([[:digit:]]+) +splice site \(d a\): +(.+)/s';

        $lines = is_array($sourcelines) ? $sourcelines : array();

        foreach ($lines as $linestr) {
            $linestr = (string) $linestr;

            foreach ($rules as $key => $rule) {
                // left() is a helper defined outside this section; presumably it
                // returns the leading N characters of the string (TODO confirm).
                if (left($linestr, strlen($rule[0])) === $rule[0]) {
                    $value = substr($linestr, $rule[1]);
                    $record[$key] = $rule[2] ? rtrim((string) $value) : $value;
                    break;
                }
            }

            if (preg_match($exon_pattern, $linestr, $infos)) {
                $record['exons'][$infos[1]] = array(
                    'genomic_start' => $infos[2],
                    'genomic_end' => $infos[3],
                    'mrna_start' => $infos[4],
                    'mrna_end' => $infos[5],
                    'id' => $infos[6],
                    'mismatch' => $infos[7],
                    'gaps' => $infos[8],
                    'splice_site_donor_acceptor' => $infos[9]
                );
                $record['exons_location'][] = $infos[2] . "-" . $infos[3];
            }
        }

        // Build the mRNA feature summary for the genomic sequence.
        $this->seqRnaFeatures($record);

        return $record;
    }

    /**
     * Store a summary of the mRNA aligned in $record in $this->mrna_features:
     * 'acc' (accession found in the mRNA description, or null), 'location'
     * (the record's exons_location list) and 'strand'.
     *
     * The description is tested against each pattern below, in order; when
     * several match, the capture of the LAST matching pattern is used as 'acc'.
     *
     * @param array $record record as built by readfromString()
     */
    function seqRnaFeatures($record)
    {
        $mrna = isset($record['mrna']) ? (string) $record['mrna'] : '';

        $patterns = array(
            '/gi\|([[:digit:]]+)\|/',                                // GI number
            '/ref\|([[:alpha:]]{2}_[[:digit:]]+\.[[:digit:]]+)\|/',  // RefSeq, versioned
            '/(NM_[[:digit:]]+) /',                                  // RefSeq NM_ accession
            '/(ENST[[:digit:]]+) /',                                 // Ensembl transcript
            '/(.+_at)/s',                                            // Affymetrix probe set
        );

        // ereg() left its match array untouched on failure, so the original
        // ended up with the last successful capture; preg_match() resets the
        // array instead, so that value is tracked explicitly here.
        $acc = null;
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $mrna, $found)) {
                $acc = $found[1];
            }
        }

        $this->mrna_features = array(
            'acc' => $acc,
            'location' => isset($record['exons_location']) ? $record['exons_location'] : null,
            'strand' => isset($record['strand']) ? $record['strand'] : null
        );
    }
}
?>