ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gffmanip
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 18716 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 gffmanip [-v] [-o outrecords.txt] [{-r|-c|-e} <rangedata>] [-C] [-t <track>]\
8 [-i [<attr>:]<ids.txt>] [-a <attr>=<value>[,...]] [-f <feature1>[,..]] \
9 [-k <attrlist>] [-l [{-T|-L|-l <attrlist>}] [<gff\/gtf_file>]
10
11 Filtering options:
12 -C output only those transcripts having CDS features
13 -f output only the features matching any of the strings in the given
14 comma-delimited list (e.g. -f 'mRNA,exon,CDS')
15 -v show records that do NOT match the given filtering options (like grep's -v)
16 -r show only records overlapping any of the intervals given in <rangedata>,
17 which has the format:
18 [<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]]
19 -c same as -r, but only shows records fully contained in the interval(s) given
20 -e outputs records whose exon\/CDS segments overlap the interval(s) given
21 -i only shows records whose ID match any of the entries in the file <ids.txt>;
22 unless another attribute is given as a prefix (<attr>:), the ID attribute
23 is used for GFF3 input, and transcript_id for GTF
24 -a only shows records having the <attr> attribute with the value <value>;
25 multiple attribute\/value pairs can be given (comma delimited) and a record
26 will be shown if there is at least one attribute match
27
28 Output options:
29 -K keep all attributes ; by default only the core GFF\/GTF attributes are
30 shown (transcript_id, gene_id, gene_name, ID, Parent, Name, gene_name)
31 -k keep only the specified non-core attributes in the output records
32 -T output GTF format (default is GFF3)
33 -L output only the list of transcript IDs found in the file
34 (and matching the filtering options)
35 -l output the value of all attributes in <attrlist> for each record
36 (tab delimited if more than one attribute is given)
37 -t replace the 2nd column with the given <track> text
38 /;
39 umask 0002;
40 getopts('vTKACLf:r:l:c:e:i:a:t:k:o:') || die($usage."\n");
41 die("$usage\n") unless @ARGV>0;
42 die("Only one input file is expected (or '-' for stdin)!\n") unless @ARGV==1;
43 my $input_gff=$ARGV[0];
44 my $outfile=$Getopt::Std::opt_o;
45 my %ignoredFeatures;
46 @ignoredFeatures{qw(intron cds_start cds_stop start_codon stop_codon start stop cdsstart cdsstop)}=();
47 my %oattrs; #hash with attrs to print
48 @oattrs{qw(ID transcript_id Parent gene_id gene gene_name)}=();
49 my %attrflt; # attribute=>value filter
50 my %idflt; # hash with IDs to keep
51 my %atab; # hash with attribute names whose values will be listed as tab delimited
52 my @atab_cols;
53 if ($outfile) {
54 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
55 select(OUTF);
56 }
57 # --
58 my ($fltinvert, $printGTF, $out_track)=
59 ($Getopt::Std::opt_v, $Getopt::Std::opt_T, $Getopt::Std::opt_t);
60 my $only_ifCDS=$Getopt::Std::opt_v;
61 my ($idfile, $avflt, $featlist, $oattrlist)=
62 ($Getopt::Std::opt_i, $Getopt::Std::opt_a, $Getopt::Std::opt_f, $Getopt::Std::opt_k);
63 my $keepAll=$Getopt::Std::opt_K || $Getopt::Std::opt_A || (lc($oattrlist) eq 'a' || lc($oattrlist) eq 'all');
64 my ($r_range, $c_range, $e_range)=
65 ($Getopt::Std::opt_r, $Getopt::Std::opt_c, $Getopt::Std::opt_e);
66 my $check_range;
67 if ($r_range) {
68 $check_range=$r_range;
69 die("Error: options -r, -c and -e are mutually exclusive!\n") if ($c_range || $e_range);
70 }
71 elsif ($c_range) {
72 $check_range=$c_range;
73 die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $e_range);
74 }
75 elsif ($e_range) {
76 $check_range=$e_range;
77 die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $c_range);
78 }
79 my ($flt_chr, $flt_strand);
80 my @flt_intv;
81 if ($check_range) {
82 ($flt_chr, my $ck_rlst)=split(/\:/,$check_range);
83 #die("$usage Incorrect format for the interval list!\n") unless $flt_chr && $ck_rlst;
84 my $flt_strand=substr($flt_chr,0,1);
85 if ($flt_strand eq '-' || $flt_strand eq '+') {
86 substr($flt_chr,0,1)='';
87 }
88 else {
89 $flt_strand=undef;
90 my $e=chop($flt_chr);
91 if ($e eq '-' || $e eq '+') {
92 $flt_strand=$e;
93 }
94 else { $flt_chr.=$e; }
95 } #no strand
96 my @flt_intv;
97 if ($ck_rlst) {
98 my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$ck_rlst));
99 foreach my $d (@rdata) {
100 ($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1];
101 }
102 @flt_intv = sort { $a->[0] <=> $b->[0] } @rdata;
103 }
104 }
105 #my $range=$flt_chr.':'.$ck_rex[0]->[0].'-'.$ex[-1]->[1];
106
107 $oattrlist='' if $keepAll;
108 my $tab_attrs=$Getopt::Std::opt_l;
109 $tab_attrs='ID' if $Getopt::Std::opt_L;
110 if ($tab_attrs) {
111 @atab_cols=split(/\,/, $tab_attrs);
112 @atab{@atab_cols}=();
113 }
114 my $idfileattr; # attribute to use for ID list filtering instead of ID/transcript_id
115 if ($idfile) {
116 unless (-f $idfile) {
117 my ($a,$f)=(split(/\:/,$idfile));
118 ($idfile, $idfileattr)=($f,$a) if $f;
119 }
120 my $idf;
121 if ($idfile eq '-') {
122 open($idf, "<&=STDIN") || die("Error: couldn't alias STDIN. $!\n");
123 }
124 else {
125 open($idf, $idfile) || die("Error: cannot open $idfile $!\n");
126 }
127 while (<$idf>) {
128 my ($id)=(m/(\S+)/);
129 $idflt{$id}=1 if $id;
130 }
131 close($idf);
132 }
133 if ($avflt) {
134 my @avl=split(/\,/, $avflt);
135 foreach my $avpair (@avl) {
136 my ($attr, $value)=split(/\s*=\s*/,$avpair,2);
137 $value=~s/[" ]+$//;$value=~s/^[" ]+//;
138 $attrflt{$attr}=$value;
139 }
140 }
141 my %featflt; # feature list filter -- if $featlist was given
142 if ($featlist) {
143 #add to the list of attributes to be kept
144 my @fl=split(/\,/, $featlist);
145 @featflt{@fl}=(); # this is actually an input filter
146 }
147 if ($oattrlist) {
148 #add to the list of attributes to be kept
149 my @al=split(/\,/, $oattrlist);
150 @oattrs{@al}=();
151 }
152 # ----
153 my %gffrecs; # recID => [ chr, strand, feat_type, \%attrs, fstart, fend, [@exons], [@cds], isgff3, rejected, track, subfeat, fscore ]
154 # 0 1 2 3 4 5 6 7 8 9 10 , 11, 12
155 # recID has the prefix '<chr>|' which should be removed before output
156 my $gffh;
157 if ($input_gff eq '-') {
158 open($gffh, "<&=STDIN") || die("Error: couldn't alias STDIN $!\n");
159 }
160 else {
161 open($gffh, $input_gff) || die("Error opening file $input_gff $!\n");
162 }
163
164 loadGff($gffh, \%gffrecs);
165 my @sorted_recs=sort sortByLoc keys(%gffrecs);
166 processGffRecs(\%gffrecs, \@sorted_recs);
167
168 # --
169 if ($outfile) {
170 select(STDOUT);
171 close(OUTF);
172 }
173
174 #************ Subroutines **************
175 sub sortByLoc {
176 my $da=$gffrecs{$a};
177 my $db=$gffrecs{$b};
178 if ($$da[0] eq $$db[0]) {
179 return ($$da[4]==$$db[4]) ? $$da[5] <=> $$db[5] : $$da[4] <=> $$db[4] ;
180 }
181 else { return $$da[0] cmp $$db[0] ; }
182 }
183
184 sub checkOvlExons {
185 my ($a, $b, $rx)=@_;
186 return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain
187 foreach my $x (@$rx) {
188 return 1 if ($a<=$$x[1] && $b>=$$x[0]);
189 return 0 if $b<$$x[0];
190 }
191 }
192
193 sub checkWithinExons {
194 my ($a, $b, $rx)=@_; #checks if interval $a-$b is contained in any @$rx interval
195 return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain
196 foreach my $x (@$rx) {
197 return 1 if ($a>=$$x[0] && $b<=$$x[1]);
198 return 0 if $b<$$x[0];
199 }
200 }
201
202
203 sub loadGff {
204 my ($gffhandle, $recs)=@_;
205 while (<$gffhandle>) {
206 next if m/^\s*#/;
207 chomp;
208 my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
209 next unless $fstart>1 && $lnum;
210 $track=$out_track if $out_track;
211 next if exists($ignoredFeatures{lc($f)});
212 $f='exon' if $f=~m/exon/i;
213 $f='CDS' if $f=~m/^cds$/i;
214 my $rejected;
215 $rejected=1 if ($featlist && not exists($featflt{$f}));
216 #next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
217 my $gff3_ID;
218 my $gff3_Parent;
219 my ($gname,$tdescr);
220 my %attrs;
221 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
222 #$lnum=~s/"([^"]+)\;([^"]+)"/"$1.$2"/g; #protect ; within text between quotes
223 my @av=split(/\s*\;\s*/,$lnum);
224 ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
225 ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
226 my $isGFF3 = ($gff3_ID || $gff3_Parent);
227 if ($isGFF3) { # GFF format
228 $gff3_ID=~tr/"//d; #"
229 $gff3_Parent=~tr/"//d; #"
230 $gff3_Parent='' if ($f =~m/RNA/); # we really don't care about parent for RNA features
231 if ($gff3_ID && !$gff3_Parent) { #top level feature
232 foreach my $a (@av) {
233 my ($attr, $value)=split(/\s*=\s*/,$a,2);
234 $value=~s/[" ]+$//;$value=~s/^[" ]+//;
235 $attrs{$attr}=$value;
236 }
237 if ($f=~m/RNA/i || $f=~/gene/) {
238 # try to parse the description, if any
239
240 if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
241 $tdescr=$1;
242 }
243 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
244 $tdescr=$1;
245 }
246 if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
247 $gname=$1;
248 }
249 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
250 $gname=$1;
251 }
252 $tdescr='' if ($tdescr eq $gname);
253 $gname='' if $gname eq $gff3_ID;
254 }
255 die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs->{"$chr|$gff3_ID"}));
256 my $recID="$chr|$gff3_ID";
257 $recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore];
258 next;
259 } # parent/top-level feature
260 } #GFF
261 else { #GTF format
262 if ($f eq 'transcript') { # GTF with parent 'transcript' feature
263 foreach my $a (@av) {
264 my ($attr, $value)=split(/\s+"/,$a,2); #"
265 $value=~s/[" ]+$//;
266 if ($attr eq 'transcript_id') {
267 $attr='ID';
268 }
269 elsif ($attr eq 'gene_id') {
270 $attr='Name';
271 }
272 $attrs{$attr}=$value;
273 }
274 my $recID=$attrs{'ID'};
275 die("Error: cannot find transcript_id for GTF 'transcript' line:\n$_\n") unless $recID;
276 die("Error: duplicate feature $recID on $chr\n") if (exists($recs->{"$chr|$recID"}));
277 $recID=$chr.'|'.$recID;
278 $recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore ];
279 next;
280 } # parent 'transcript' feature in GTF
281 }
282 # -------------- exon/CDS line here:
283 next if ($featlist && !exists($featflt{$f}));
284 my $recID;
285 ($gname, $tdescr)=();
286 if ($isGFF3) {
287 $recID=$gff3_Parent;
288 }
289 elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
290 $recID=$1;
291 $recID=~tr/"//d; #"
292 }
293 elsif ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
294 $recID=$chr.'.jsm.'.$lnum;
295 $gff3_Parent=$recID;
296 $isGFF3=1;
297 $f='CDS';
298 }
299 else {
300 die("Error: cannot parse locus/transcript name from input line:\n$_\n");
301 }
302 if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
303 $gname=$1;
304 $gname=~tr/"//d; #"
305 }
306 $tdescr='' if index($recID, $tdescr)>=0;
307 $gname='' if index($recID, $gname)>=0;
308 $recID=$chr.'|'.$recID;
309 my $ld = $recs->{$recID};
310 if ($ld) { #existing entry
311 my $i=($f eq 'CDS') ? 7 : 6;
312 my ($lstart, $lend)=($$ld[4], $$ld[5]);
313 $$ld[4]=$fstart if $fstart<$lstart;
314 $$ld[5]=$fend if $fend>$lend;
315 push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $frame]);
316 if ($f ne 'CDS') {
317 if ($$ld[11] && $$ld[11] ne $f) {
318 die("Error: multiple non-CDS subfeatures found for $recID ($$ld[11], $f)\n");
319 }
320 $$ld[11]=$f;
321 }
322 }
323 else { # first time seeing this locus/gene
324 # get the attributes from this first exon line
325 if ($gff3_Parent) { #has GFF3 Parent
326 foreach my $a (@av) {
327 my ($attr, $value)=split(/\s*=\s*/,$a,2); #"
328 $attr='ID' if $attr eq 'Parent';
329 $value=~s/[" ]+$//;$value=~s/^[" ]+//;
330 }
331 }
332 else { # GTF
333 foreach my $a (@av) {
334 my ($attr, $value)=split(/\s+"/,$a,2); #"
335 $value=~s/[" ]+$//;
336 if ($attr eq 'transcript_id') {
337 $attr='ID';
338 }
339 elsif ($attr eq 'gene_id') {
340 $attr='Name';
341 }
342 $attrs{$attr}=$value unless $attr=~m/^exon/;;
343 }
344 }
345 $recs->{$recID} = ($f eq 'CDS') ?
346 [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [[$fstart, $fend, $fscore, $frame]], $isGFF3, $rejected, $track, '','.' ] :
347 [$chr, $strand, $f, {%attrs}, $fstart, $fend, [[$fstart, $fend, $fscore, $frame]], [], $isGFF3, $rejected, $track, $f,'.' ] ;
348 # 0 1 2 3 4 5 6(exons) 7 (CDS) 8 9 10
349 }
350 }
351 close($gffh);
352 }
353
354 sub processGffRecs {
355 #return if keys(%recs)==0;
356 my ($recs, $rlist)=@_;
357 my @recs_keys;
358 unless ($rlist) {
359 @recs_keys=keys(%$recs);
360 $rlist=\@recs_keys;
361 }
362 foreach my $recid (@$rlist) {
363 my $td=$$recs{$recid};
364 # 0 1 2 3 4 5 6 7 8 9 10 11 12
365 my ($chr, $strand, $feature, $attrs, $fstart, $fend, $er, $cr, $isGFF3, $rej, $track, $subf, $fscore) = @$td;
366 next if ($rej && !$fltinvert);
367 next if ($fltinvert && $featlist && !$rej);
368 # my ($mstart,$mend)=($fstart, $fend);
369 my $CDSonly=0; # set to 1 if only CDS segments were given
370 my $hasCDS=0;
371 my @ex;
372 my @cds;
373 #some records might lack exons, but have only CDS segments (e.g. mitochondrial genes)
374 if (@$er<1 && @$cr>0) {
375 @ex = sort { $a->[0] <=> $b->[0] } @$cr;
376 @cds=@ex;
377 $CDSonly=1;
378 $hasCDS=1;
379 }
380 else {
381 @ex = sort { $a->[0] <=> $b->[0] } @$er;
382 if (@$cr>0) { # sort cds segments too
383 @cds= sort { $a->[0] <=> $b->[0] } @$cr;
384 $hasCDS=1;
385 }
386 }
387 # --------------
388 # get the more accurate version of the start-end coords for the feature
389 my $covlen=0;
390 # map { $covlen+=$_->[1]-$_->[0]+1 } @ex;
391 my ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
392 my $gffid=$recid;
393 substr($gffid, 0, length($chr)+1)='';
394 die("Error: gffid ($gffid) not matching attrs\{ID\}(".$attrs-{'ID'}.")!\n")
395 unless ($gffid eq $attrs->{'ID'});
396 my $doprint=1;
397 #check all the filters
398 $doprint=0 if ($only_ifCDS && $hasCDS==0);
399 if ($doprint && $idfile) {
400 if ($idfileattr) {
401 $doprint=0 unless exists($idflt{$attrs->{$idfileattr}});
402 }
403 else {
404 $doprint=0 unless exists($idflt{$gffid});
405 }
406 }
407 if ($doprint && $avflt) {
408 my $avfound=0;
409 foreach my $a (keys(%attrflt)) {
410 if ($attrflt{$a} eq $attrs->{$a}) {
411 $avfound=1;
412 last;
413 }
414 }
415 $doprint=0 unless $avfound;
416 }
417
418
419 if ($doprint && $flt_chr && $flt_chr ne $chr) {
420 $doprint=0;
421 }
422 if ($doprint) {
423 if ($flt_strand && $flt_strand ne $strand) {
424 $doprint=0;
425 }
426 }
427 if ($doprint && @flt_intv>0) {
428 if ($r_range) { #t span overlap any range intervals
429 $doprint=1 if checkOvlExons($fstart, $fend, \@flt_intv);
430 }
431 elsif ($c_range) { #t span contained in any of the intervals
432 $doprint=1 if checkWithinExons($fstart, $fend, \@flt_intv);
433 }
434 elsif ($e_range) { # each exon checked for overlap of any interval
435 for my $ed (@ex) {
436 if (checkOvlExons($$ed[0], $$ed[1], \@flt_intv)) {
437 $doprint=1;
438 last;
439 }
440 }
441 }
442 }
443
444 $doprint = ! $doprint if $fltinvert;
445 next unless $doprint;
446 #filter passed, print output
447 if ($tab_attrs) {
448 my @od;
449 foreach my $a (@atab_cols) {
450 if (uc($a) eq 'ID') {
451 push(@od, $gffid);
452 next;
453 }
454 push(@od, $attrs->{$a});
455 }
456 print join("\t",@od)."\n";
457 next;
458 }
459 #the core attributes ID, Name must be there already
460 unless (exists($attrs->{Name})) {
461 foreach my $name (qw(gene_name gene geneID geneId geneid locus loc ID)) {
462 if (exists($attrs->{$name})) {
463 $attrs->{Name}=$attrs->{$name};
464 last;
465 }
466 }
467 }
468 my ($tid, $tname)= (delete($attrs->{ID}), delete($attrs->{Name}));
469 my ($gene_name, $gene, $locus);
470 unless ($keepAll) {
471 if (exists($attrs->{gene_name})) {
472 $gene_name=delete($attrs->{gene_name});
473 $gene_name=undef if ($gene_name eq $tname && !exists($oattrs{gene_name}));
474 }
475 if (exists($attrs->{gene})) {
476 $gene=delete($attrs->{gene});
477 $gene=undef if ($gene eq $tname || $gene eq $gene_name) && !exists($oattrs{gene});
478 }
479 if (exists($attrs->{locus})) {
480 $locus=delete($attrs->{locus});
481 $locus=undef if ($locus eq $gene_name || $locus eq $gene_name) && !exists($oattrs{locus});
482 }
483 }
484 my $tattrs;
485 if ($printGTF) { #print GTF
486 $tattrs='transcript_id "'.$tid.'"; gene_id "'.$tname.'";';
487 $tattrs.=' gene_name "'.$gene_name.'";' if $gene_name;
488 $tattrs.=' gene "'.$gene.'";' if $gene;
489 $tattrs.=' locus "'.$locus.'";' if $locus;
490 if ($keepAll) {
491 foreach my $attr (keys(%$attrs)) {
492 $tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";';
493 }
494 }
495 else { # only a subset of attributes will be shown
496 foreach my $attr (keys(%$attrs)) {
497 next unless exists($oattrs{$attr});
498 $tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";';
499 }
500 }
501 }
502 else { #print GFF3
503 $tattrs='Parent='.$tid;
504 my $pattrs="ID=$tid;Name=$tname";
505 $pattrs.='gene_name='.$gene_name.';' if $gene_name;
506 $pattrs.='gene='.$gene.';' if $gene;
507 $pattrs.='locus='.$locus.';' if $locus;
508 if ($keepAll) {
509 foreach my $attr (keys(%$attrs)) {
510 my $val=$attrs->{$attr};
511 $pattrs.="$attr=$val;";
512 }
513 }
514 else { # only a subset of attributes will be shown
515 foreach my $attr (keys(%$attrs)) {
516 next unless exists($oattrs{$attr});
517 my $val=$attrs->{$attr};
518 $pattrs.="$attr=$val;";
519 }
520 }
521 print join("\t",$chr, $track, $feature, $fstart, $fend, $fscore, $strand, '.', $pattrs)."\n";
522 }
523 if ($CDSonly==0) { #write exons only when found in the input
524 foreach my $ed (@ex) {
525 print join("\t",$chr, $track, $subf, $$ed[0], $$ed[1], $$ed[2], $strand, $$ed[3], $tattrs)."\n";
526 }
527 }
528 foreach my $cd (@cds) {
529 print join("\t",$chr, $track, 'CDS', $$cd[0], $$cd[1], $$cd[2], $strand, $$cd[3], $tattrs)."\n";
530 }
531 } #for each stored transcript
532 }

Properties

Name Value
svn:executable *