1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
|
6 |
my $usage = q/Usage: |
7 |
gffmanip [-v] [-o outrecords.txt] [{-r|-c|-e} <rangedata>] [-C] [-t <track>]\ |
8 |
[-i [<attr>:]<ids.txt>] [-a <attr>=<value>[,...]] [-f <feature1>[,..]] \ |
9 |
[-k <attrlist>] [-l [{-T|-L|-l <attrlist>}] [<gff\/gtf_file>] |
10 |
|
11 |
Filtering options: |
12 |
-C output only those transcripts having CDS features |
13 |
-f output only the features matching any of the strings in the given |
14 |
comma-delimited list (e.g. -f 'mRNA,exon,CDS') |
15 |
-v show records that do NOT match the given filtering options (like grep's -v) |
16 |
-r show only records overlapping any of the intervals given in <rangedata>, |
17 |
which has the format: |
18 |
[<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]] |
19 |
-c same as -r, but only shows records fully contained in the interval(s) given |
20 |
-e outputs records whose exon\/CDS segments overlap the interval(s) given |
21 |
-i only shows records whose ID match any of the entries in the file <ids.txt>; |
22 |
unless another attribute is given as a prefix (<attr>:), the ID attribute |
23 |
is used for GFF3 input, and transcript_id for GTF |
24 |
-a only shows records having the <attr> attribute with the value <value>; |
25 |
multiple attribute\/value pairs can be given (comma delimited) and a record |
26 |
will be shown if there is at least one attribute match |
27 |
|
28 |
Output options: |
29 |
-K keep all attributes ; by default only the core GFF\/GTF attributes are |
30 |
shown (transcript_id, gene_id, gene_name, ID, Parent, Name, gene_name) |
31 |
-k keep only the specified non-core attributes in the output records |
32 |
-T output GTF format (default is GFF3) |
33 |
-L output only the list of transcript IDs found in the file |
34 |
(and matching the filtering options) |
35 |
-l output the value of all attributes in <attrlist> for each record |
36 |
(tab delimited if more than one attribute is given) |
37 |
-t replace the 2nd column with the given <track> text |
38 |
/; |
39 |
umask 0002; |
40 |
getopts('vTKACLf:r:l:c:e:i:a:t:k:o:') || die($usage."\n"); |
41 |
die("$usage\n") unless @ARGV>0; |
42 |
die("Only one input file is expected (or '-' for stdin)!\n") unless @ARGV==1; |
43 |
my $input_gff=$ARGV[0]; |
44 |
my $outfile=$Getopt::Std::opt_o; |
45 |
my %ignoredFeatures; |
46 |
@ignoredFeatures{qw(intron cds_start cds_stop start_codon stop_codon start stop cdsstart cdsstop)}=(); |
47 |
my %oattrs; #hash with attrs to print |
48 |
@oattrs{qw(ID transcript_id Parent gene_id gene gene_name)}=(); |
49 |
my %attrflt; # attribute=>value filter |
50 |
my %idflt; # hash with IDs to keep |
51 |
my %atab; # hash with attribute names whose values will be listed as tab delimited |
52 |
my @atab_cols; |
53 |
if ($outfile) { |
54 |
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n"); |
55 |
select(OUTF); |
56 |
} |
57 |
# -- |
58 |
my ($fltinvert, $printGTF, $out_track)= |
59 |
($Getopt::Std::opt_v, $Getopt::Std::opt_T, $Getopt::Std::opt_t); |
60 |
my $only_ifCDS=$Getopt::Std::opt_v; |
61 |
my ($idfile, $avflt, $featlist, $oattrlist)= |
62 |
($Getopt::Std::opt_i, $Getopt::Std::opt_a, $Getopt::Std::opt_f, $Getopt::Std::opt_k); |
63 |
my $keepAll=$Getopt::Std::opt_K || $Getopt::Std::opt_A || (lc($oattrlist) eq 'a' || lc($oattrlist) eq 'all'); |
64 |
my ($r_range, $c_range, $e_range)= |
65 |
($Getopt::Std::opt_r, $Getopt::Std::opt_c, $Getopt::Std::opt_e); |
66 |
my $check_range; |
67 |
if ($r_range) { |
68 |
$check_range=$r_range; |
69 |
die("Error: options -r, -c and -e are mutually exclusive!\n") if ($c_range || $e_range); |
70 |
} |
71 |
elsif ($c_range) { |
72 |
$check_range=$c_range; |
73 |
die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $e_range); |
74 |
} |
75 |
elsif ($e_range) { |
76 |
$check_range=$e_range; |
77 |
die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $c_range); |
78 |
} |
79 |
my ($flt_chr, $flt_strand); |
80 |
my @flt_intv; |
81 |
if ($check_range) { |
82 |
($flt_chr, my $ck_rlst)=split(/\:/,$check_range); |
83 |
#die("$usage Incorrect format for the interval list!\n") unless $flt_chr && $ck_rlst; |
84 |
my $flt_strand=substr($flt_chr,0,1); |
85 |
if ($flt_strand eq '-' || $flt_strand eq '+') { |
86 |
substr($flt_chr,0,1)=''; |
87 |
} |
88 |
else { |
89 |
$flt_strand=undef; |
90 |
my $e=chop($flt_chr); |
91 |
if ($e eq '-' || $e eq '+') { |
92 |
$flt_strand=$e; |
93 |
} |
94 |
else { $flt_chr.=$e; } |
95 |
} #no strand |
96 |
my @flt_intv; |
97 |
if ($ck_rlst) { |
98 |
my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$ck_rlst)); |
99 |
foreach my $d (@rdata) { |
100 |
($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1]; |
101 |
} |
102 |
@flt_intv = sort { $a->[0] <=> $b->[0] } @rdata; |
103 |
} |
104 |
} |
105 |
#my $range=$flt_chr.':'.$ck_rex[0]->[0].'-'.$ex[-1]->[1]; |
106 |
|
107 |
$oattrlist='' if $keepAll; |
108 |
my $tab_attrs=$Getopt::Std::opt_l; |
109 |
$tab_attrs='ID' if $Getopt::Std::opt_L; |
110 |
if ($tab_attrs) { |
111 |
@atab_cols=split(/\,/, $tab_attrs); |
112 |
@atab{@atab_cols}=(); |
113 |
} |
114 |
my $idfileattr; # attribute to use for ID list filtering instead of ID/transcript_id |
115 |
if ($idfile) { |
116 |
unless (-f $idfile) { |
117 |
my ($a,$f)=(split(/\:/,$idfile)); |
118 |
($idfile, $idfileattr)=($f,$a) if $f; |
119 |
} |
120 |
my $idf; |
121 |
if ($idfile eq '-') { |
122 |
open($idf, "<&=STDIN") || die("Error: couldn't alias STDIN. $!\n"); |
123 |
} |
124 |
else { |
125 |
open($idf, $idfile) || die("Error: cannot open $idfile $!\n"); |
126 |
} |
127 |
while (<$idf>) { |
128 |
my ($id)=(m/(\S+)/); |
129 |
$idflt{$id}=1 if $id; |
130 |
} |
131 |
close($idf); |
132 |
} |
133 |
if ($avflt) { |
134 |
my @avl=split(/\,/, $avflt); |
135 |
foreach my $avpair (@avl) { |
136 |
my ($attr, $value)=split(/\s*=\s*/,$avpair,2); |
137 |
$value=~s/[" ]+$//;$value=~s/^[" ]+//; |
138 |
$attrflt{$attr}=$value; |
139 |
} |
140 |
} |
141 |
my %featflt; # feature list filter -- if $featlist was given |
142 |
if ($featlist) { |
143 |
#add to the list of attributes to be kept |
144 |
my @fl=split(/\,/, $featlist); |
145 |
@featflt{@fl}=(); # this is actually an input filter |
146 |
} |
147 |
if ($oattrlist) { |
148 |
#add to the list of attributes to be kept |
149 |
my @al=split(/\,/, $oattrlist); |
150 |
@oattrs{@al}=(); |
151 |
} |
152 |
# ---- |
153 |
my %gffrecs; # recID => [ chr, strand, feat_type, \%attrs, fstart, fend, [@exons], [@cds], isgff3, rejected, track, subfeat, fscore ] |
154 |
# 0 1 2 3 4 5 6 7 8 9 10 , 11, 12 |
155 |
# recID has the prefix '<chr>|' which should be removed before output |
156 |
my $gffh; |
157 |
if ($input_gff eq '-') { |
158 |
open($gffh, "<&=STDIN") || die("Error: couldn't alias STDIN $!\n"); |
159 |
} |
160 |
else { |
161 |
open($gffh, $input_gff) || die("Error opening file $input_gff $!\n"); |
162 |
} |
163 |
|
164 |
loadGff($gffh, \%gffrecs); |
165 |
my @sorted_recs=sort sortByLoc keys(%gffrecs); |
166 |
processGffRecs(\%gffrecs, \@sorted_recs); |
167 |
|
168 |
# -- |
169 |
if ($outfile) { |
170 |
select(STDOUT); |
171 |
close(OUTF); |
172 |
} |
173 |
|
174 |
#************ Subroutines ************** |
175 |
sub sortByLoc { |
176 |
my $da=$gffrecs{$a}; |
177 |
my $db=$gffrecs{$b}; |
178 |
if ($$da[0] eq $$db[0]) { |
179 |
return ($$da[4]==$$db[4]) ? $$da[5] <=> $$db[5] : $$da[4] <=> $$db[4] ; |
180 |
} |
181 |
else { return $$da[0] cmp $$db[0] ; } |
182 |
} |
183 |
|
184 |
sub checkOvlExons { |
185 |
my ($a, $b, $rx)=@_; |
186 |
return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain |
187 |
foreach my $x (@$rx) { |
188 |
return 1 if ($a<=$$x[1] && $b>=$$x[0]); |
189 |
return 0 if $b<$$x[0]; |
190 |
} |
191 |
} |
192 |
|
193 |
sub checkWithinExons { |
194 |
my ($a, $b, $rx)=@_; #checks if interval $a-$b is contained in any @$rx interval |
195 |
return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain |
196 |
foreach my $x (@$rx) { |
197 |
return 1 if ($a>=$$x[0] && $b<=$$x[1]); |
198 |
return 0 if $b<$$x[0]; |
199 |
} |
200 |
} |
201 |
|
202 |
|
203 |
sub loadGff { |
204 |
my ($gffhandle, $recs)=@_; |
205 |
while (<$gffhandle>) { |
206 |
next if m/^\s*#/; |
207 |
chomp; |
208 |
my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
209 |
next unless $fstart>1 && $lnum; |
210 |
$track=$out_track if $out_track; |
211 |
next if exists($ignoredFeatures{lc($f)}); |
212 |
$f='exon' if $f=~m/exon/i; |
213 |
$f='CDS' if $f=~m/^cds$/i; |
214 |
my $rejected; |
215 |
$rejected=1 if ($featlist && not exists($featflt{$f})); |
216 |
#next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally |
217 |
my $gff3_ID; |
218 |
my $gff3_Parent; |
219 |
my ($gname,$tdescr); |
220 |
my %attrs; |
221 |
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
222 |
#$lnum=~s/"([^"]+)\;([^"]+)"/"$1.$2"/g; #protect ; within text between quotes |
223 |
my @av=split(/\s*\;\s*/,$lnum); |
224 |
($gff3_ID)=($lnum=~m/\bID=([^;]+)/); |
225 |
($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/); |
226 |
my $isGFF3 = ($gff3_ID || $gff3_Parent); |
227 |
if ($isGFF3) { # GFF format |
228 |
$gff3_ID=~tr/"//d; #" |
229 |
$gff3_Parent=~tr/"//d; #" |
230 |
$gff3_Parent='' if ($f =~m/RNA/); # we really don't care about parent for RNA features |
231 |
if ($gff3_ID && !$gff3_Parent) { #top level feature |
232 |
foreach my $a (@av) { |
233 |
my ($attr, $value)=split(/\s*=\s*/,$a,2); |
234 |
$value=~s/[" ]+$//;$value=~s/^[" ]+//; |
235 |
$attrs{$attr}=$value; |
236 |
} |
237 |
if ($f=~m/RNA/i || $f=~/gene/) { |
238 |
# try to parse the description, if any |
239 |
|
240 |
if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) { |
241 |
$tdescr=$1; |
242 |
} |
243 |
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
244 |
$tdescr=$1; |
245 |
} |
246 |
if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) { |
247 |
$gname=$1; |
248 |
} |
249 |
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
250 |
$gname=$1; |
251 |
} |
252 |
$tdescr='' if ($tdescr eq $gname); |
253 |
$gname='' if $gname eq $gff3_ID; |
254 |
} |
255 |
die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs->{"$chr|$gff3_ID"})); |
256 |
my $recID="$chr|$gff3_ID"; |
257 |
$recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore]; |
258 |
next; |
259 |
} # parent/top-level feature |
260 |
} #GFF |
261 |
else { #GTF format |
262 |
if ($f eq 'transcript') { # GTF with parent 'transcript' feature |
263 |
foreach my $a (@av) { |
264 |
my ($attr, $value)=split(/\s+"/,$a,2); #" |
265 |
$value=~s/[" ]+$//; |
266 |
if ($attr eq 'transcript_id') { |
267 |
$attr='ID'; |
268 |
} |
269 |
elsif ($attr eq 'gene_id') { |
270 |
$attr='Name'; |
271 |
} |
272 |
$attrs{$attr}=$value; |
273 |
} |
274 |
my $recID=$attrs{'ID'}; |
275 |
die("Error: cannot find transcript_id for GTF 'transcript' line:\n$_\n") unless $recID; |
276 |
die("Error: duplicate feature $recID on $chr\n") if (exists($recs->{"$chr|$recID"})); |
277 |
$recID=$chr.'|'.$recID; |
278 |
$recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore ]; |
279 |
next; |
280 |
} # parent 'transcript' feature in GTF |
281 |
} |
282 |
# -------------- exon/CDS line here: |
283 |
next if ($featlist && !exists($featflt{$f})); |
284 |
my $recID; |
285 |
($gname, $tdescr)=(); |
286 |
if ($isGFF3) { |
287 |
$recID=$gff3_Parent; |
288 |
} |
289 |
elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) { |
290 |
$recID=$1; |
291 |
$recID=~tr/"//d; #" |
292 |
} |
293 |
elsif ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) { |
294 |
$recID=$chr.'.jsm.'.$lnum; |
295 |
$gff3_Parent=$recID; |
296 |
$isGFF3=1; |
297 |
$f='CDS'; |
298 |
} |
299 |
else { |
300 |
die("Error: cannot parse locus/transcript name from input line:\n$_\n"); |
301 |
} |
302 |
if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) { |
303 |
$gname=$1; |
304 |
$gname=~tr/"//d; #" |
305 |
} |
306 |
$tdescr='' if index($recID, $tdescr)>=0; |
307 |
$gname='' if index($recID, $gname)>=0; |
308 |
$recID=$chr.'|'.$recID; |
309 |
my $ld = $recs->{$recID}; |
310 |
if ($ld) { #existing entry |
311 |
my $i=($f eq 'CDS') ? 7 : 6; |
312 |
my ($lstart, $lend)=($$ld[4], $$ld[5]); |
313 |
$$ld[4]=$fstart if $fstart<$lstart; |
314 |
$$ld[5]=$fend if $fend>$lend; |
315 |
push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $frame]); |
316 |
if ($f ne 'CDS') { |
317 |
if ($$ld[11] && $$ld[11] ne $f) { |
318 |
die("Error: multiple non-CDS subfeatures found for $recID ($$ld[11], $f)\n"); |
319 |
} |
320 |
$$ld[11]=$f; |
321 |
} |
322 |
} |
323 |
else { # first time seeing this locus/gene |
324 |
# get the attributes from this first exon line |
325 |
if ($gff3_Parent) { #has GFF3 Parent |
326 |
foreach my $a (@av) { |
327 |
my ($attr, $value)=split(/\s*=\s*/,$a,2); #" |
328 |
$attr='ID' if $attr eq 'Parent'; |
329 |
$value=~s/[" ]+$//;$value=~s/^[" ]+//; |
330 |
} |
331 |
} |
332 |
else { # GTF |
333 |
foreach my $a (@av) { |
334 |
my ($attr, $value)=split(/\s+"/,$a,2); #" |
335 |
$value=~s/[" ]+$//; |
336 |
if ($attr eq 'transcript_id') { |
337 |
$attr='ID'; |
338 |
} |
339 |
elsif ($attr eq 'gene_id') { |
340 |
$attr='Name'; |
341 |
} |
342 |
$attrs{$attr}=$value unless $attr=~m/^exon/;; |
343 |
} |
344 |
} |
345 |
$recs->{$recID} = ($f eq 'CDS') ? |
346 |
[$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [[$fstart, $fend, $fscore, $frame]], $isGFF3, $rejected, $track, '','.' ] : |
347 |
[$chr, $strand, $f, {%attrs}, $fstart, $fend, [[$fstart, $fend, $fscore, $frame]], [], $isGFF3, $rejected, $track, $f,'.' ] ; |
348 |
# 0 1 2 3 4 5 6(exons) 7 (CDS) 8 9 10 |
349 |
} |
350 |
} |
351 |
close($gffh); |
352 |
} |
353 |
|
354 |
sub processGffRecs { |
355 |
#return if keys(%recs)==0; |
356 |
my ($recs, $rlist)=@_; |
357 |
my @recs_keys; |
358 |
unless ($rlist) { |
359 |
@recs_keys=keys(%$recs); |
360 |
$rlist=\@recs_keys; |
361 |
} |
362 |
foreach my $recid (@$rlist) { |
363 |
my $td=$$recs{$recid}; |
364 |
# 0 1 2 3 4 5 6 7 8 9 10 11 12 |
365 |
my ($chr, $strand, $feature, $attrs, $fstart, $fend, $er, $cr, $isGFF3, $rej, $track, $subf, $fscore) = @$td; |
366 |
next if ($rej && !$fltinvert); |
367 |
next if ($fltinvert && $featlist && !$rej); |
368 |
# my ($mstart,$mend)=($fstart, $fend); |
369 |
my $CDSonly=0; # set to 1 if only CDS segments were given |
370 |
my $hasCDS=0; |
371 |
my @ex; |
372 |
my @cds; |
373 |
#some records might lack exons, but have only CDS segments (e.g. mitochondrial genes) |
374 |
if (@$er<1 && @$cr>0) { |
375 |
@ex = sort { $a->[0] <=> $b->[0] } @$cr; |
376 |
@cds=@ex; |
377 |
$CDSonly=1; |
378 |
$hasCDS=1; |
379 |
} |
380 |
else { |
381 |
@ex = sort { $a->[0] <=> $b->[0] } @$er; |
382 |
if (@$cr>0) { # sort cds segments too |
383 |
@cds= sort { $a->[0] <=> $b->[0] } @$cr; |
384 |
$hasCDS=1; |
385 |
} |
386 |
} |
387 |
# -------------- |
388 |
# get the more accurate version of the start-end coords for the feature |
389 |
my $covlen=0; |
390 |
# map { $covlen+=$_->[1]-$_->[0]+1 } @ex; |
391 |
my ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); |
392 |
my $gffid=$recid; |
393 |
substr($gffid, 0, length($chr)+1)=''; |
394 |
die("Error: gffid ($gffid) not matching attrs\{ID\}(".$attrs-{'ID'}.")!\n") |
395 |
unless ($gffid eq $attrs->{'ID'}); |
396 |
my $doprint=1; |
397 |
#check all the filters |
398 |
$doprint=0 if ($only_ifCDS && $hasCDS==0); |
399 |
if ($doprint && $idfile) { |
400 |
if ($idfileattr) { |
401 |
$doprint=0 unless exists($idflt{$attrs->{$idfileattr}}); |
402 |
} |
403 |
else { |
404 |
$doprint=0 unless exists($idflt{$gffid}); |
405 |
} |
406 |
} |
407 |
if ($doprint && $avflt) { |
408 |
my $avfound=0; |
409 |
foreach my $a (keys(%attrflt)) { |
410 |
if ($attrflt{$a} eq $attrs->{$a}) { |
411 |
$avfound=1; |
412 |
last; |
413 |
} |
414 |
} |
415 |
$doprint=0 unless $avfound; |
416 |
} |
417 |
|
418 |
|
419 |
if ($doprint && $flt_chr && $flt_chr ne $chr) { |
420 |
$doprint=0; |
421 |
} |
422 |
if ($doprint) { |
423 |
if ($flt_strand && $flt_strand ne $strand) { |
424 |
$doprint=0; |
425 |
} |
426 |
} |
427 |
if ($doprint && @flt_intv>0) { |
428 |
if ($r_range) { #t span overlap any range intervals |
429 |
$doprint=1 if checkOvlExons($fstart, $fend, \@flt_intv); |
430 |
} |
431 |
elsif ($c_range) { #t span contained in any of the intervals |
432 |
$doprint=1 if checkWithinExons($fstart, $fend, \@flt_intv); |
433 |
} |
434 |
elsif ($e_range) { # each exon checked for overlap of any interval |
435 |
for my $ed (@ex) { |
436 |
if (checkOvlExons($$ed[0], $$ed[1], \@flt_intv)) { |
437 |
$doprint=1; |
438 |
last; |
439 |
} |
440 |
} |
441 |
} |
442 |
} |
443 |
|
444 |
$doprint = ! $doprint if $fltinvert; |
445 |
next unless $doprint; |
446 |
#filter passed, print output |
447 |
if ($tab_attrs) { |
448 |
my @od; |
449 |
foreach my $a (@atab_cols) { |
450 |
if (uc($a) eq 'ID') { |
451 |
push(@od, $gffid); |
452 |
next; |
453 |
} |
454 |
push(@od, $attrs->{$a}); |
455 |
} |
456 |
print join("\t",@od)."\n"; |
457 |
next; |
458 |
} |
459 |
#the core attributes ID, Name must be there already |
460 |
unless (exists($attrs->{Name})) { |
461 |
foreach my $name (qw(gene_name gene geneID geneId geneid locus loc ID)) { |
462 |
if (exists($attrs->{$name})) { |
463 |
$attrs->{Name}=$attrs->{$name}; |
464 |
last; |
465 |
} |
466 |
} |
467 |
} |
468 |
my ($tid, $tname)= (delete($attrs->{ID}), delete($attrs->{Name})); |
469 |
my ($gene_name, $gene, $locus); |
470 |
unless ($keepAll) { |
471 |
if (exists($attrs->{gene_name})) { |
472 |
$gene_name=delete($attrs->{gene_name}); |
473 |
$gene_name=undef if ($gene_name eq $tname && !exists($oattrs{gene_name})); |
474 |
} |
475 |
if (exists($attrs->{gene})) { |
476 |
$gene=delete($attrs->{gene}); |
477 |
$gene=undef if ($gene eq $tname || $gene eq $gene_name) && !exists($oattrs{gene}); |
478 |
} |
479 |
if (exists($attrs->{locus})) { |
480 |
$locus=delete($attrs->{locus}); |
481 |
$locus=undef if ($locus eq $gene_name || $locus eq $gene_name) && !exists($oattrs{locus}); |
482 |
} |
483 |
} |
484 |
my $tattrs; |
485 |
if ($printGTF) { #print GTF |
486 |
$tattrs='transcript_id "'.$tid.'"; gene_id "'.$tname.'";'; |
487 |
$tattrs.=' gene_name "'.$gene_name.'";' if $gene_name; |
488 |
$tattrs.=' gene "'.$gene.'";' if $gene; |
489 |
$tattrs.=' locus "'.$locus.'";' if $locus; |
490 |
if ($keepAll) { |
491 |
foreach my $attr (keys(%$attrs)) { |
492 |
$tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";'; |
493 |
} |
494 |
} |
495 |
else { # only a subset of attributes will be shown |
496 |
foreach my $attr (keys(%$attrs)) { |
497 |
next unless exists($oattrs{$attr}); |
498 |
$tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";'; |
499 |
} |
500 |
} |
501 |
} |
502 |
else { #print GFF3 |
503 |
$tattrs='Parent='.$tid; |
504 |
my $pattrs="ID=$tid;Name=$tname"; |
505 |
$pattrs.='gene_name='.$gene_name.';' if $gene_name; |
506 |
$pattrs.='gene='.$gene.';' if $gene; |
507 |
$pattrs.='locus='.$locus.';' if $locus; |
508 |
if ($keepAll) { |
509 |
foreach my $attr (keys(%$attrs)) { |
510 |
my $val=$attrs->{$attr}; |
511 |
$pattrs.="$attr=$val;"; |
512 |
} |
513 |
} |
514 |
else { # only a subset of attributes will be shown |
515 |
foreach my $attr (keys(%$attrs)) { |
516 |
next unless exists($oattrs{$attr}); |
517 |
my $val=$attrs->{$attr}; |
518 |
$pattrs.="$attr=$val;"; |
519 |
} |
520 |
} |
521 |
print join("\t",$chr, $track, $feature, $fstart, $fend, $fscore, $strand, '.', $pattrs)."\n"; |
522 |
} |
523 |
if ($CDSonly==0) { #write exons only when found in the input |
524 |
foreach my $ed (@ex) { |
525 |
print join("\t",$chr, $track, $subf, $$ed[0], $$ed[1], $$ed[2], $strand, $$ed[3], $tattrs)."\n"; |
526 |
} |
527 |
} |
528 |
foreach my $cd (@cds) { |
529 |
print join("\t",$chr, $track, 'CDS', $$cd[0], $$cd[1], $$cd[2], $strand, $$cd[3], $tattrs)."\n"; |
530 |
} |
531 |
} #for each stored transcript |
532 |
} |