ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gfflt
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 15938 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 gfflt [-c <mincov>] [-p <minpid>] [-P] [-l <fsize_file>] [-s<strand>][{-C|-E}] \
8 [-D|-T] [-f <ftstr>] [-t <track>] [-r <start>-<end>] [-n<ID_suffix>] \
9 [-i <maxintron>] [-u dupIDs.tab] [-w <map_out_file>] [-o <outfile>]
10
11 Note: requires segment subfeatures (CDS\/exons) to be strictly
12 grouped with (i.e. follow) their parent mRNA feature
13
14 -l : file with extra mapping information, 3 tab-delimited columns
15 are expected in each line:
16 <qry_ID> <qry_len> <qry_description>
17 -P : only for PMAP output files, take "Target" coordinates
18 as nucleotide-based (divide by 3 to get the actual query protein
19 coordinates)
20
21 Filtering options:
22 -r : only show gff records fully included in between the given coordinates
23 -R : like -r but also show mappings that intersect the given range
24 -s : only show gff records on the given strand
25 -u : duplicate IDs should be discarded according to the given ID table
26 -c : discard mappings with coverage lower than <mincov>;
27 unless qreg or qlen attributes are found in the mRNA line,
28 this option requires the -l option to provide query lengths
29 -p : discard mappings with percent identity lower than <minpid>;
30 requires the score column or the PID attribute for the 'mRNA'
31 feature
32 -x : discard exon mappings with percent identity lower than <xminpid>;
33 (coverage and percent identity will be adjusted accordingly)
34 -C|-E : read only the CDS (-C) or only the exon segments (-E)
35 from the input file (default: read both);
36 Output modifiers:
37
38 -f : replace the output segment feature name ("CDS" or "exon")
39 with this string (useful for converting "CDS" into "exon" or
40 the other way around); requires -C or -E option
41 -n : add the given alphanumeric suffix followed by a counter
42 for each mapping of a query sequence (starting at 1)
43 (only the first 4 letters of <ID_suffix> will be used)
44 -N : for -n option, the whole ID is replaced with the given string
45 -M : make the "Name" attribute to have the same value with "ID" attribute
46 -t : change the track name (2nd GFF column) into given label
47 -D : add mapping descriptions ("descr=" attribute) as found in the
48 3rd column of the <fsize_file> (so it requires -l option)
49 -V : add "cov" and "pid" attributes to the mRNA lines if possible
50 and if not there already
51 -S : strip the output of any other attributes besides ID, Parent and Name
52 -T : write GTF format instead (this is incompatible with -D and -V)
53
54 /;
55 umask 0002;
56 my @inputfiles;
57 while ($ARGV[0] && $ARGV[0]!~m/^\-/) {
58 push(@inputfiles, shift(@ARGV));
59 }
60 getopts('i:t:r:R:l:s:c:x:p:w:f:u:n:o:PMSNCEDTV') || die($usage."\n");
61 unshift(@ARGV, @inputfiles) if @inputfiles>0;
62 #-- options
63 my $outfile=$Getopt::Std::opt_o;
64 my $dupfile=$Getopt::Std::opt_u;
65 my $maxintron=$Getopt::Std::opt_i || 450000;
66 my $stripAttrs=$Getopt::Std::opt_S;
67 my $fullRename=$Getopt::Std::opt_N;
68 my $resetName=$Getopt::Std::opt_M;
69 my $globalCounter=0;
70 my $inFeat;
71 $inFeat='exon' if $Getopt::Std::opt_E;
72 $inFeat='CDS' if $Getopt::Std::opt_C;
73 my $pmap=$Getopt::Std::opt_P;
74 my %dups; #duplicate IDs are kept here -- $dups{$id} => $mainid
75 my %uniq; #key is ctgid_qryid_startcoord_endcoord_covscore
76 #my $startOnly=$Getopt::Std::opt_S;
77 my ($minpid, $mincov, $addCounter, $addDescr, $outFeat, $GTF)=
78 ($Getopt::Std::opt_p, $Getopt::Std::opt_c, $Getopt::Std::opt_n,
79 $Getopt::Std::opt_D, $Getopt::Std::opt_f, $Getopt::Std::opt_T);
80 my $g_range = $Getopt::Std::opt_r || $Getopt::Std::opt_R;
81 my $xrange = 1 if ($g_range && $Getopt::Std::opt_R);
82 my ($mincoord, $maxcoord);
83 if ($g_range) {
84 ($mincoord, $maxcoord)=($g_range=~m/^(\d*)[\.\-_]+(\d*)$/);
85 $mincoord=1 unless $mincoord>0;
86 $maxcoord=4000000000 unless $maxcoord>0;
87 #print STDERR "range filter: $mincoord .. $maxcoord\n";
88 }
89 my $fltstrand=$Getopt::Std::opt_s;
90 my $xminpid = $Getopt::Std::opt_x;
91 my $ntrack=$Getopt::Std::opt_t;
92 $mincov=~tr/%//d;
93 $mincov=1 unless $mincov;
94 $minpid=20 unless $minpid;
95 $minpid=20 unless $minpid;
96 $xminpid=12 unless $xminpid;
97 my $fmapout=$Getopt::Std::opt_w;
98 if ($fmapout) {
99 open(FMAP, '>'.$fmapout) || die("Error creating file $fmapout\n");
100 }
101 my $fsizefile=$Getopt::Std::opt_l;
102 my %f;
103 my %n;
104 if ($fsizefile) {
105 open(FSIZE, $fsizefile) || die("Error opening query info file $fsizefile!\n");
106 while(<FSIZE>) {
107 chomp;
108 my @t=split(/\s+/,$_,3);
109 next unless $t[1];
110 $f{$t[0]}=[$t[1], $t[2]]; # seqlen, description
111 }
112 close(FSIZE);
113 }
114 if ($dupfile) {
115 open(DUPFILE, $dupfile) || die ("Error opening duplicate table file $dupfile!\n");
116 while (<DUPFILE>) {
117 chomp;
118 my @t=split(/\t/);
119 foreach my $id (@t) {
120 $dups{$id}=$t[0];
121 }
122 }
123 close(DUPFILE);
124 }
125 my $curqry; #current target (query) name
126 my ($curqstart, $curqend, $curqlen, $curcov, $curpid, $use_tcoords); #length of the current query
127 my $curdescr; #defline of the current query
128 my $covlen; #coverage of current query
129 my $eScore; #cummulated exon score
130 my ($gffid, $gchr, $gtrack, $gname, $gstart, $gend, $gscore, $gstrand, $ginfo);
131 # 0 1 2 3 4 5 6
132 my @exd; # list of [exon_start, exon_end, phase, score, qstart, qend, exon_other_attrs]
133 my @cds; # list of [cds_start, cds_end, phase, score, qstart, qend, cds_other_attrs]
134 if ($outfile) {
135 open(OUTFILE, '>'.$outfile) || die ("Error creating file $outfile\n");
136 select(OUTFILE);
137 }
138 while (<>) {
139 next if m/^\s*#/;
140 chomp;
141 my ($chr, $binfo, $ftype, $fstart, $fend, $fscore,
142 $strand, $phase, $finfo)=split(/\t/);
143 next if $ftype eq 'gene'; # just discard the superfluous 'gene' entries;
144 if ($ftype eq 'mRNA') { #start of a new mapping
145 &flush_gbuf();
146 ($gchr, $gtrack, $gstart, $gend, $gscore, $gstrand, $ginfo)=
147 ($chr, $binfo, $fstart, $fend, $fscore, $strand, $finfo);
148 ($gffid)=($ginfo=~m/ID=([^;]+)/);
149 die("Error finding the current ID (in attribute field: $finfo)\n") unless $gffid;
150 $ginfo=~s/ID=[^;]+\s*;?//;
151 $ginfo=~s/Parent=[^;]+\s*;?//;
152 if ($ginfo=~s/Name=([^;]+)\s*;?//) {
153 $gname=$1;
154 }
155 if ($ginfo=~m/qlen=(\d+)/i) {
156 $curqlen=$1;
157 $ginfo=~s/qlen=\d+\s*;?//i;
158 }
159 if ($ginfo=~m/qreg=(\d+)\-(\d+)/i) {
160 ($curqstart, $curqend)=($1,$2);
161 $curqstart=1 if $curqstart==0;
162 if ($ginfo=~m/qreg=\d+\-\d+\|(\d+)/i) {
163 $curqlen=$1;
164 $ginfo=~s/qreg=\d+\-\d+\|\d+\s*;?//i;
165 }
166 else {
167 $ginfo=~s/qreg=\d+\-\d+\s*;?//i;
168 }
169 }
170 if ($ginfo=~m/cov=([\d\.]+)/i) {
171 $curcov=$1;
172 $ginfo=~s/cov=[\d\.]+\s*;?//i;
173 }
174 if ($ginfo=~m/\bpid=([\d\.]+)/i) {
175 $curpid=$1;
176 $ginfo=~s/\bpid=[\d\.]+\s*;?//i;
177 }
178 if ($fscore>50 && $fscore<=100 && !$curpid) {
179 $curpid=$fscore;
180 }
181 next;
182 } # mRNA line
183 # vvvvvvvvv exons or CDS segments vvvvvvvvvvv
184 $ftype=~s/\w+\-exon$/exon/i; #jigsaw
185 next unless $ftype eq 'exon' || $ftype eq 'CDS';
186 my $gtfgeneid;
187 my $id;
188 if ($finfo=~m/(?:Parent|transcript_id)[=\s]+"?([^;^\s^"]+)/) {
189 $id=$1;
190 unless ($finfo=~s/Parent=[^;]+\s*;?//) {
191 #must be GTF format -- discard anything else but the gene_id, if found;
192 ($gtfgeneid)=($finfo=~m/gene_id[=\s]+"?([^;^\s^"]+)/);
193 # we could parse all extra GTF attributes here and rebuild them in GFF3 format..
194 # .. nah
195 $finfo="";
196 }
197 }
198 else { # jigsaw and other over-simplified GFF formats
199 my ($geneid)=($finfo=~m/gene_id[=\s]+"?([^;^\s^"]+)/);
200 if ($geneid) {
201 $id=$geneid;
202 $finfo="";
203 }
204 else {
205 ($id)=($finfo=~m/^([^;^\s]+)/);
206 if ($binfo=~m/^jigsaw/) {
207 ($finfo)=($finfo=~m/(gene_score=[\-\d\.]+)/);
208 }
209 else { $finfo=""; }
210 }
211 }
212 die("Error getting the current ID from attribute field: $finfo\n") unless $id;
213 $finfo=~s/ID=[^;]+\s*;?//;
214 $finfo=~s/Name=([^;]+)\s*;?//;
215 if (!$gffid || ($gffid ne $id) || ($gchr && $gchr ne $chr)) {
216 # start of new mapping here (change in gff ID or base sequence)
217 #print STDERR "gffid=$gffid, id=$id (gchr=$gchr)\n";
218 &flush_gbuf();
219 ($gchr, $gtrack, $gstart, $gend, $gscore, $gstrand, $ginfo)=
220 ($chr, $binfo, $fstart, $fend, $fscore, $strand, $finfo);
221 $ginfo=~s/Target=[^;]+\s*;?//i;
222 $ginfo=~s/qreg=\d+\-\d+\s*;?//i;
223 $gname=$gtfgeneid;
224 }
225 $finfo=~s/gene_score=[\-\d\.]+\s*\;?//i;
226 $gffid=$id;
227 my ($target, $t1, $t2, $tstrand);
228 if ($finfo=~m/Target=([^;]+)/i) {
229 my $alninfo=$1;
230 ($target, $t1, $t2, $tstrand)=($alninfo=~m/^(\S+)\s+(\d+)\s+(\d+)\s+([\-+])/);
231 $finfo=~s/Target=[^;]+\s*;?//i;
232 $use_tcoords=1;
233 }
234 else {
235 if ($finfo=~m/qreg=(\d+)\-(\d+)/i) {
236 ($t1,$t2)=($1,$2);
237 $t1=1 if $t1==0;
238 $finfo=~s/qreg=\d+\-\d+\s*;?//i;
239 }
240 $target=$gffid;
241 $target=~s/\.[a-z]{1,5}\d+$//;
242 }
243 if ($curqry) {
244 die("Error: CDS/exon Target entry found out of place ($_)\n")
245 if ($target ne $curqry);
246 }
247 else {
248 $curqry=$target; #first time assignment of curqry
249 if ($fsizefile) {
250 # if ($mincov) {
251 my $qd=$f{$curqry};
252 die("Error getting data for $curqry from $fsizefile!\n") unless $qd;
253 ($curqlen, $curdescr)=@$qd;
254 #print STDERR "=====> descr for $curqry: $curdescr\n";
255 }
256 $covlen=0;
257 } # first time assignment of curqry
258 $phase='.' if $phase<0;
259 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
260 ($t1, $t2)=($t2, $t1) if ($t2<$t1);
261 if ($t2 && $pmap) {
262 ($t1, $t2)=(int($t1/3), int($t2/3));
263 $t1=1 if $t1<1;
264 };
265 if ($ftype eq 'exon') {
266 # 0 1 2 3 4 5 6
267 push(@exd, [$fstart, $fend, $phase, $fscore, $t1, $t2, $finfo]);
268 next;
269 }
270 else { # ftype must be CDS here
271 # --exon entries should come first ?
272 # die("Error: missing query heading for: $_\n") unless $curqry;
273 #next if ($minpid && $fscore<$minpid);
274 push(@cds, [$fstart, $fend, $phase, $fscore, $t1, $t2, $finfo]);
275 }
276 } # line parsing loop
277
278 flush_gbuf() if $curqry;
279 select(STDOUT);
280 close(OUTFILE) if ($outfile);
281 close(FMAP) if ($fmapout);
282 # END here
283
284 #================ SUBROUTINES ============
285
286 sub flush_gbuf {
287 return unless $gffid;
288 check_gff();
289 ($gffid, $curqry, $curqlen, $covlen, $curqstart, $curqend, $curcov, $curpid, $use_tcoords)=
290 (undef, undef, 0, 0, 0, 0, 0, 0, 0);
291 ($gchr, $gtrack, $gstart, $gend, $gscore, $gstrand, $gname, $ginfo)=(undef) x 8;
292 @exd=();
293 @cds=();
294 $eScore=0;
295 }
296
297 sub check_gff {
298 #die ("Error at calling flush_gbuf(): current length must NOT be zero!\n") unless $curqlen>0;
299 #$covlen=$covlen/3 if $pmap;
300 #my $cov=$curqlen ? ($covlen*100.00)/$curqlen : 0;
301 #return if ($mincov && $cov<$mincov); #failed coverage condition
302
303 my $hasCDSonly; # for pblat or some GTF file, we may have no exon segments
304 if (@exd==0) {
305 @exd=sort { $main::a->[0] <=> $main::b->[0] } @cds;
306 $hasCDSonly=1;
307 @cds=();
308 }
309 else {
310 @exd = sort { $main::a->[0] <=> $main::b->[0] } @exd;
311 @cds = sort { $main::a->[0] <=> $main::b->[0] } @cds;
312 }
313 my $prev_end=0;
314 #my $covscore=0;
315 my $i=0;
316 my $numexons=0;
317 my @fexd; #filtered set of exons
318 my $qcovlen=0; #total aligned length
319 my $gcovlen=0; #covered length (in nucleotides)
320 my $covscore=0;
321 foreach my $x (@exd) {
322 $i++;
323 my $xpid = ($$x[3]>=10 && $$x[3]<=100.0)? $$x[3] : 0;
324 next if ($xpid && $xpid<$xminpid && (($i>1 && $i<@exd) || @exd==1)); #skip internal exons with lower pid
325 if ($prev_end>0) {
326 my $intronlen=$$x[0]-$prev_end;
327 return if $intronlen>$maxintron;
328 }
329 $prev_end=$$x[1];
330 $covscore+=($$x[1]-$$x[0]+1)*$xpid;
331 $gcovlen+=($$x[1]-$$x[0]+1);
332 $qcovlen+=($$x[5]-$$x[4]+1) if $$x[5]>0;
333 push(@fexd, $x);
334 $numexons++;
335 }
336 return if @fexd==0;
337 if ($g_range) {
338 if ($xrange) {
339 #everything that intersects given range
340 return if ($gstart>$maxcoord || $gend<$mincoord);
341 }
342 else {
343 return unless ($gstart>=$mincoord && $gend<=$maxcoord);
344 }
345 }
346 return if ($fltstrand && $gstrand ne $fltstrand);
347 my $totpid=sprintf('%.2f',$covscore/$gcovlen);
348 my $qlen=$curqlen; #query length, in nucleotides
349 my $qcov = ($qcovlen>5 && $qlen>0) ? ($qcovlen*100.00)/$qlen : $curcov;
350 #print STDERR "=====> gffID: $gffid qcov=$qcov\n";
351 return if ($numexons<1 || ($totpid>10 && $totpid<$minpid) || ($qcov && $qcov<$mincov));
352 #my ($parent)=($ginfo=~m/\bID\s*=\s*([^;]+)/);
353 my $parent=$gffid;
354 if ($curqry) {
355 my $q=$dups{$curqry} || $curqry;
356 my $uid="$gchr|$q|$gstart|$gend|$qcov";
357 return if exists($uniq{$uid});
358 $uniq{$uid}=1;
359 }
360 if ($addCounter) {
361 if ($fullRename) {
362 $globalCounter++;
363 $parent=$addCounter.$globalCounter;
364 }
365 else {
366 die("Error: no curqry available for gff ID: $gffid!\n") unless $curqry;
367 my $mnum = ++$n{$curqry};
368 $parent=$curqry.'.'.$addCounter.$mnum;
369 }
370
371 }
372 my $gffattrs="ID=$parent";
373 my $gffname=$gname || $parent;
374 if ($stripAttrs || $resetName) {
375 $gffname=$parent;
376 }
377 $gffattrs.=";Name=$gffname"; #just use a name there so argo can show it
378
379 if ($use_tcoords) {
380 my @td=sort { $main::a->[4] <=> $main::b->[4] } @fexd;
381 $curqstart=$td[0]->[4];
382 $curqend=$td[-1]->[5];
383 }
384 if ($qcov>1) {
385 $qcov=100 if $qcov>100;
386 $gffattrs.=';cov='.sprintf('%.1f',$qcov) unless $stripAttrs;
387 }
388 unless ($totpid>0) {
389 $totpid=$gscore if $gscore>10 && $gscore<=100.0;
390 }
391 unless ($covscore) {
392 my $rqlen=$qlen;
393 $rqlen*=3 if $pmap;
394 $covscore=(($rqlen*$qcov)/100.00)*$totpid;
395 }
396 unless ($stripAttrs) {
397 $gffattrs.=';pid='.sprintf('%.2f',$totpid) if $totpid>2; # unless $ginfo=~m/Identity=/;
398 $gffattrs.=';covscore='.sprintf('%d',$covscore) if $covscore>10;
399 #$ginfo.=';score='.sprintf('%d',($eScore/100.00)) unless $ginfo=~m/score=/i;
400 if ($curqstart && $curqend) {
401 $gffattrs.=";qreg=$curqstart-$curqend";
402 $gffattrs.="|$curqlen" if $curqlen;
403 }
404 else {
405 $gffattrs.=';qlen='.$curqlen if $curqlen;
406 }
407 }
408 my ($gffdescr)=($ginfo=~m/\b(?:descr|info)=\"([^"]+)/);
409 unless ($stripAttrs) {
410 $ginfo=~s/;$//;
411 $ginfo=~s/^;//;
412 if ($curdescr) {
413 $curdescr =~ tr/"\t;=/' .:/s; #"
414 $curdescr=~s/[\.\;\, ]+$//;
415 $ginfo=~s/\b(?:descr|info)=\"[^"]+\"\s*;?//i;
416 if ($ginfo) {
417 $ginfo.=';descr="'.$curdescr.'"';
418 }
419 else { $ginfo=';descr="'.$curdescr.'"'; }
420 }
421 $gffattrs.=";$ginfo" if $ginfo;
422 }
423 $gffattrs=~tr/; /; /s;
424
425 my $gfftrack= $ntrack || $gtrack;
426 print join("\t", $gchr, $gfftrack, 'mRNA', $gstart, $gend, $gscore, $gstrand, '.', $gffattrs)."\n" unless $GTF;
427 # exons:
428 my $fonly=$outFeat;
429 if ($outFeat) {
430 $fonly=$outFeat;
431 }
432 else {
433 $fonly= $hasCDSonly ? 'CDS':'exon';
434 }
435 my $wroteExons=0;
436 if (!$inFeat || ($inFeat eq 'exon') || $hasCDSonly) {
437 $wroteExons=1;
438 foreach my $x (@fexd) {
439 my $xinfo="Parent=$parent";
440 unless ($stripAttrs) {
441 $xinfo.=";qreg=$$x[4]-$$x[5]" if ($$x[4]>0);
442 $$x[6]=~s/^;+//;
443 $xinfo.=";$$x[6]" if $$x[6];
444 }
445 print join("\t", $gchr, $gfftrack, $fonly, $$x[0], $$x[1], $$x[3], $gstrand, $$x[2],$xinfo)."\n";
446 }
447 }
448 if ($fmapout) {
449 my $descr=$curdescr || $gffdescr;
450 print FMAP join("\t", $gchr, $gstrand, $gstart, $gend, $curqry, sprintf('%d',$covscore),
451 sprintf('%.1f',$totpid), sprintf('%.1f',$qcov), "$curqstart-$curqend|$curqlen", $descr)."\n";
452 }
453 return if ($outFeat && $wroteExons);
454 $fonly=$outFeat || 'CDS';
455 if (@cds>0 && (!$inFeat || ($inFeat eq 'CDS'))) {
456 foreach my $x (@cds) {
457 my $xinfo="Parent=$parent";
458 unless ($stripAttrs) {
459 $xinfo.=";qreg=$$x[4]-$$x[5]" if ($$x[4]>0);
460 $$x[6]=~s/^;+//;
461 $xinfo.=";$$x[6]" if $$x[6];
462 }
463 print join("\t", $gchr, $gfftrack, $fonly, $$x[0], $$x[1], $$x[3], $gstrand, $$x[2], $xinfo)."\n";
464 }
465 }
466 }#sub flush_gbuf()

Properties

Name Value
svn:executable *