1 |
gpertea |
23 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
use Getopt::Std; |
4 |
|
|
use FindBin;use lib $FindBin::Bin; |
5 |
|
|
|
6 |
|
|
my $usage = q/Usage: |
7 |
|
|
gffmanip [-v] [-o outrecords.txt] [{-r|-c|-e} <rangedata>] [-C] [-t <track>]\ |
8 |
|
|
[-i [<attr>:]<ids.txt>] [-a <attr>=<value>[,...]] [-f <feature1>[,..]] \ |
9 |
|
|
[-k <attrlist>] [-l [{-T|-L|-l <attrlist>}] [<gff\/gtf_file>] |
10 |
|
|
|
11 |
|
|
Filtering options: |
12 |
|
|
-C output only those transcripts having CDS features |
13 |
|
|
-f output only the features matching any of the strings in the given |
14 |
|
|
comma-delimited list (e.g. -f 'mRNA,exon,CDS') |
15 |
|
|
-v show records that do NOT match the given filtering options (like grep's -v) |
16 |
|
|
-r show only records overlapping any of the intervals given in <rangedata>, |
17 |
|
|
which has the format: |
18 |
|
|
[<strand>]<chr>[:<start>-<end>[,<start2>-<end2>,...]] |
19 |
|
|
-c same as -r, but only shows records fully contained in the interval(s) given |
20 |
|
|
-e outputs records whose exon\/CDS segments overlap the interval(s) given |
21 |
|
|
-i only shows records whose ID match any of the entries in the file <ids.txt>; |
22 |
|
|
unless another attribute is given as a prefix (<attr>:), the ID attribute |
23 |
|
|
is used for GFF3 input, and transcript_id for GTF |
24 |
|
|
-a only shows records having the <attr> attribute with the value <value>; |
25 |
|
|
multiple attribute\/value pairs can be given (comma delimited) and a record |
26 |
|
|
will be shown if there is at least one attribute match |
27 |
|
|
|
28 |
|
|
Output options: |
29 |
|
|
-K keep all attributes ; by default only the core GFF\/GTF attributes are |
30 |
|
|
shown (transcript_id, gene_id, gene_name, ID, Parent, Name, gene_name) |
31 |
|
|
-k keep only the specified non-core attributes in the output records |
32 |
|
|
-T output GTF format (default is GFF3) |
33 |
|
|
-L output only the list of transcript IDs found in the file |
34 |
|
|
(and matching the filtering options) |
35 |
|
|
-l output the value of all attributes in <attrlist> for each record |
36 |
|
|
(tab delimited if more than one attribute is given) |
37 |
|
|
-t replace the 2nd column with the given <track> text |
38 |
|
|
/; |
39 |
|
|
umask 0002; |
40 |
|
|
getopts('vTKACLf:r:l:c:e:i:a:t:k:o:') || die($usage."\n"); |
41 |
|
|
die("$usage\n") unless @ARGV>0; |
42 |
|
|
die("Only one input file is expected (or '-' for stdin)!\n") unless @ARGV==1; |
43 |
|
|
my $input_gff=$ARGV[0]; |
44 |
|
|
my $outfile=$Getopt::Std::opt_o; |
45 |
|
|
my %ignoredFeatures; |
46 |
|
|
@ignoredFeatures{qw(intron cds_start cds_stop start_codon stop_codon start stop cdsstart cdsstop)}=(); |
47 |
|
|
my %oattrs; #hash with attrs to print |
48 |
|
|
@oattrs{qw(ID transcript_id Parent gene_id gene gene_name)}=(); |
49 |
|
|
my %attrflt; # attribute=>value filter |
50 |
|
|
my %idflt; # hash with IDs to keep |
51 |
|
|
my %atab; # hash with attribute names whose values will be listed as tab delimited |
52 |
|
|
my @atab_cols; |
53 |
|
|
if ($outfile) { |
54 |
|
|
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n"); |
55 |
|
|
select(OUTF); |
56 |
|
|
} |
57 |
|
|
# -- |
58 |
|
|
my ($fltinvert, $printGTF, $out_track)= |
59 |
|
|
($Getopt::Std::opt_v, $Getopt::Std::opt_T, $Getopt::Std::opt_t); |
60 |
|
|
my $only_ifCDS=$Getopt::Std::opt_v; |
61 |
|
|
my ($idfile, $avflt, $featlist, $oattrlist)= |
62 |
|
|
($Getopt::Std::opt_i, $Getopt::Std::opt_a, $Getopt::Std::opt_f, $Getopt::Std::opt_k); |
63 |
|
|
my $keepAll=$Getopt::Std::opt_K || $Getopt::Std::opt_A || (lc($oattrlist) eq 'a' || lc($oattrlist) eq 'all'); |
64 |
|
|
my ($r_range, $c_range, $e_range)= |
65 |
|
|
($Getopt::Std::opt_r, $Getopt::Std::opt_c, $Getopt::Std::opt_e); |
66 |
|
|
my $check_range; |
67 |
|
|
if ($r_range) { |
68 |
|
|
$check_range=$r_range; |
69 |
|
|
die("Error: options -r, -c and -e are mutually exclusive!\n") if ($c_range || $e_range); |
70 |
|
|
} |
71 |
|
|
elsif ($c_range) { |
72 |
|
|
$check_range=$c_range; |
73 |
|
|
die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $e_range); |
74 |
|
|
} |
75 |
|
|
elsif ($e_range) { |
76 |
|
|
$check_range=$e_range; |
77 |
|
|
die("Error: options -r, -c and -e are mutually exclusive!\n") if ($r_range || $c_range); |
78 |
|
|
} |
79 |
|
|
my ($flt_chr, $flt_strand); |
80 |
|
|
my @flt_intv; |
81 |
|
|
if ($check_range) { |
82 |
|
|
($flt_chr, my $ck_rlst)=split(/\:/,$check_range); |
83 |
|
|
#die("$usage Incorrect format for the interval list!\n") unless $flt_chr && $ck_rlst; |
84 |
|
|
my $flt_strand=substr($flt_chr,0,1); |
85 |
|
|
if ($flt_strand eq '-' || $flt_strand eq '+') { |
86 |
|
|
substr($flt_chr,0,1)=''; |
87 |
|
|
} |
88 |
|
|
else { |
89 |
|
|
$flt_strand=undef; |
90 |
|
|
my $e=chop($flt_chr); |
91 |
|
|
if ($e eq '-' || $e eq '+') { |
92 |
|
|
$flt_strand=$e; |
93 |
|
|
} |
94 |
|
|
else { $flt_chr.=$e; } |
95 |
|
|
} #no strand |
96 |
|
|
my @flt_intv; |
97 |
|
|
if ($ck_rlst) { |
98 |
|
|
my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$ck_rlst)); |
99 |
|
|
foreach my $d (@rdata) { |
100 |
|
|
($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1]; |
101 |
|
|
} |
102 |
|
|
@flt_intv = sort { $a->[0] <=> $b->[0] } @rdata; |
103 |
|
|
} |
104 |
|
|
} |
105 |
|
|
#my $range=$flt_chr.':'.$ck_rex[0]->[0].'-'.$ex[-1]->[1]; |
106 |
|
|
|
107 |
|
|
$oattrlist='' if $keepAll; |
108 |
|
|
my $tab_attrs=$Getopt::Std::opt_l; |
109 |
|
|
$tab_attrs='ID' if $Getopt::Std::opt_L; |
110 |
|
|
if ($tab_attrs) { |
111 |
|
|
@atab_cols=split(/\,/, $tab_attrs); |
112 |
|
|
@atab{@atab_cols}=(); |
113 |
|
|
} |
114 |
|
|
my $idfileattr; # attribute to use for ID list filtering instead of ID/transcript_id |
115 |
|
|
if ($idfile) { |
116 |
|
|
unless (-f $idfile) { |
117 |
|
|
my ($a,$f)=(split(/\:/,$idfile)); |
118 |
|
|
($idfile, $idfileattr)=($f,$a) if $f; |
119 |
|
|
} |
120 |
|
|
my $idf; |
121 |
|
|
if ($idfile eq '-') { |
122 |
|
|
open($idf, "<&=STDIN") || die("Error: couldn't alias STDIN. $!\n"); |
123 |
|
|
} |
124 |
|
|
else { |
125 |
|
|
open($idf, $idfile) || die("Error: cannot open $idfile $!\n"); |
126 |
|
|
} |
127 |
|
|
while (<$idf>) { |
128 |
|
|
my ($id)=(m/(\S+)/); |
129 |
|
|
$idflt{$id}=1 if $id; |
130 |
|
|
} |
131 |
|
|
close($idf); |
132 |
|
|
} |
133 |
|
|
if ($avflt) { |
134 |
|
|
my @avl=split(/\,/, $avflt); |
135 |
|
|
foreach my $avpair (@avl) { |
136 |
|
|
my ($attr, $value)=split(/\s*=\s*/,$avpair,2); |
137 |
|
|
$value=~s/[" ]+$//;$value=~s/^[" ]+//; |
138 |
|
|
$attrflt{$attr}=$value; |
139 |
|
|
} |
140 |
|
|
} |
141 |
|
|
my %featflt; # feature list filter -- if $featlist was given |
142 |
|
|
if ($featlist) { |
143 |
|
|
#add to the list of attributes to be kept |
144 |
|
|
my @fl=split(/\,/, $featlist); |
145 |
|
|
@featflt{@fl}=(); # this is actually an input filter |
146 |
|
|
} |
147 |
|
|
if ($oattrlist) { |
148 |
|
|
#add to the list of attributes to be kept |
149 |
|
|
my @al=split(/\,/, $oattrlist); |
150 |
|
|
@oattrs{@al}=(); |
151 |
|
|
} |
152 |
|
|
# ---- |
153 |
|
|
my %gffrecs; # recID => [ chr, strand, feat_type, \%attrs, fstart, fend, [@exons], [@cds], isgff3, rejected, track, subfeat, fscore ] |
154 |
|
|
# 0 1 2 3 4 5 6 7 8 9 10 , 11, 12 |
155 |
|
|
# recID has the prefix '<chr>|' which should be removed before output |
156 |
|
|
my $gffh; |
157 |
|
|
if ($input_gff eq '-') { |
158 |
|
|
open($gffh, "<&=STDIN") || die("Error: couldn't alias STDIN $!\n"); |
159 |
|
|
} |
160 |
|
|
else { |
161 |
|
|
open($gffh, $input_gff) || die("Error opening file $input_gff $!\n"); |
162 |
|
|
} |
163 |
|
|
|
164 |
|
|
loadGff($gffh, \%gffrecs); |
165 |
|
|
my @sorted_recs=sort sortByLoc keys(%gffrecs); |
166 |
|
|
processGffRecs(\%gffrecs, \@sorted_recs); |
167 |
|
|
|
168 |
|
|
# -- |
169 |
|
|
if ($outfile) { |
170 |
|
|
select(STDOUT); |
171 |
|
|
close(OUTF); |
172 |
|
|
} |
173 |
|
|
|
174 |
|
|
#************ Subroutines ************** |
175 |
|
|
sub sortByLoc { |
176 |
|
|
my $da=$gffrecs{$a}; |
177 |
|
|
my $db=$gffrecs{$b}; |
178 |
|
|
if ($$da[0] eq $$db[0]) { |
179 |
|
|
return ($$da[4]==$$db[4]) ? $$da[5] <=> $$db[5] : $$da[4] <=> $$db[4] ; |
180 |
|
|
} |
181 |
|
|
else { return $$da[0] cmp $$db[0] ; } |
182 |
|
|
} |
183 |
|
|
|
184 |
|
|
sub checkOvlExons { |
185 |
|
|
my ($a, $b, $rx)=@_; |
186 |
|
|
return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain |
187 |
|
|
foreach my $x (@$rx) { |
188 |
|
|
return 1 if ($a<=$$x[1] && $b>=$$x[0]); |
189 |
|
|
return 0 if $b<$$x[0]; |
190 |
|
|
} |
191 |
|
|
} |
192 |
|
|
|
193 |
|
|
sub checkWithinExons { |
194 |
|
|
my ($a, $b, $rx)=@_; #checks if interval $a-$b is contained in any @$rx interval |
195 |
|
|
return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain |
196 |
|
|
foreach my $x (@$rx) { |
197 |
|
|
return 1 if ($a>=$$x[0] && $b<=$$x[1]); |
198 |
|
|
return 0 if $b<$$x[0]; |
199 |
|
|
} |
200 |
|
|
} |
201 |
|
|
|
202 |
|
|
|
203 |
|
|
sub loadGff { |
204 |
|
|
my ($gffhandle, $recs)=@_; |
205 |
|
|
while (<$gffhandle>) { |
206 |
|
|
next if m/^\s*#/; |
207 |
|
|
chomp; |
208 |
|
|
my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
209 |
|
|
next unless $fstart>1 && $lnum; |
210 |
|
|
$track=$out_track if $out_track; |
211 |
|
|
next if exists($ignoredFeatures{lc($f)}); |
212 |
|
|
$f='exon' if $f=~m/exon/i; |
213 |
|
|
$f='CDS' if $f=~m/^cds$/i; |
214 |
|
|
my $rejected; |
215 |
|
|
$rejected=1 if ($featlist && not exists($featflt{$f})); |
216 |
|
|
#next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally |
217 |
|
|
my $gff3_ID; |
218 |
|
|
my $gff3_Parent; |
219 |
|
|
my ($gname,$tdescr); |
220 |
|
|
my %attrs; |
221 |
|
|
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
222 |
|
|
#$lnum=~s/"([^"]+)\;([^"]+)"/"$1.$2"/g; #protect ; within text between quotes |
223 |
|
|
my @av=split(/\s*\;\s*/,$lnum); |
224 |
|
|
($gff3_ID)=($lnum=~m/\bID=([^;]+)/); |
225 |
|
|
($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/); |
226 |
|
|
my $isGFF3 = ($gff3_ID || $gff3_Parent); |
227 |
|
|
if ($isGFF3) { # GFF format |
228 |
|
|
$gff3_ID=~tr/"//d; #" |
229 |
|
|
$gff3_Parent=~tr/"//d; #" |
230 |
|
|
$gff3_Parent='' if ($f =~m/RNA/); # we really don't care about parent for RNA features |
231 |
|
|
if ($gff3_ID && !$gff3_Parent) { #top level feature |
232 |
|
|
foreach my $a (@av) { |
233 |
|
|
my ($attr, $value)=split(/\s*=\s*/,$a,2); |
234 |
|
|
$value=~s/[" ]+$//;$value=~s/^[" ]+//; |
235 |
|
|
$attrs{$attr}=$value; |
236 |
|
|
} |
237 |
|
|
if ($f=~m/RNA/i || $f=~/gene/) { |
238 |
|
|
# try to parse the description, if any |
239 |
|
|
|
240 |
|
|
if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) { |
241 |
|
|
$tdescr=$1; |
242 |
|
|
} |
243 |
|
|
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
244 |
|
|
$tdescr=$1; |
245 |
|
|
} |
246 |
|
|
if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) { |
247 |
|
|
$gname=$1; |
248 |
|
|
} |
249 |
|
|
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
250 |
|
|
$gname=$1; |
251 |
|
|
} |
252 |
|
|
$tdescr='' if ($tdescr eq $gname); |
253 |
|
|
$gname='' if $gname eq $gff3_ID; |
254 |
|
|
} |
255 |
|
|
die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs->{"$chr|$gff3_ID"})); |
256 |
|
|
my $recID="$chr|$gff3_ID"; |
257 |
|
|
$recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore]; |
258 |
|
|
next; |
259 |
|
|
} # parent/top-level feature |
260 |
|
|
} #GFF |
261 |
|
|
else { #GTF format |
262 |
|
|
if ($f eq 'transcript') { # GTF with parent 'transcript' feature |
263 |
|
|
foreach my $a (@av) { |
264 |
|
|
my ($attr, $value)=split(/\s+"/,$a,2); #" |
265 |
|
|
$value=~s/[" ]+$//; |
266 |
|
|
if ($attr eq 'transcript_id') { |
267 |
|
|
$attr='ID'; |
268 |
|
|
} |
269 |
|
|
elsif ($attr eq 'gene_id') { |
270 |
|
|
$attr='Name'; |
271 |
|
|
} |
272 |
|
|
$attrs{$attr}=$value; |
273 |
|
|
} |
274 |
|
|
my $recID=$attrs{'ID'}; |
275 |
|
|
die("Error: cannot find transcript_id for GTF 'transcript' line:\n$_\n") unless $recID; |
276 |
|
|
die("Error: duplicate feature $recID on $chr\n") if (exists($recs->{"$chr|$recID"})); |
277 |
|
|
$recID=$chr.'|'.$recID; |
278 |
|
|
$recs->{$recID} = [$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [], $isGFF3, $rejected, $track, '', $fscore ]; |
279 |
|
|
next; |
280 |
|
|
} # parent 'transcript' feature in GTF |
281 |
|
|
} |
282 |
|
|
# -------------- exon/CDS line here: |
283 |
|
|
next if ($featlist && !exists($featflt{$f})); |
284 |
|
|
my $recID; |
285 |
|
|
($gname, $tdescr)=(); |
286 |
|
|
if ($isGFF3) { |
287 |
|
|
$recID=$gff3_Parent; |
288 |
|
|
} |
289 |
|
|
elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) { |
290 |
|
|
$recID=$1; |
291 |
|
|
$recID=~tr/"//d; #" |
292 |
|
|
} |
293 |
|
|
elsif ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) { |
294 |
|
|
$recID=$chr.'.jsm.'.$lnum; |
295 |
|
|
$gff3_Parent=$recID; |
296 |
|
|
$isGFF3=1; |
297 |
|
|
$f='CDS'; |
298 |
|
|
} |
299 |
|
|
else { |
300 |
|
|
die("Error: cannot parse locus/transcript name from input line:\n$_\n"); |
301 |
|
|
} |
302 |
|
|
if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) { |
303 |
|
|
$gname=$1; |
304 |
|
|
$gname=~tr/"//d; #" |
305 |
|
|
} |
306 |
|
|
$tdescr='' if index($recID, $tdescr)>=0; |
307 |
|
|
$gname='' if index($recID, $gname)>=0; |
308 |
|
|
$recID=$chr.'|'.$recID; |
309 |
|
|
my $ld = $recs->{$recID}; |
310 |
|
|
if ($ld) { #existing entry |
311 |
|
|
my $i=($f eq 'CDS') ? 7 : 6; |
312 |
|
|
my ($lstart, $lend)=($$ld[4], $$ld[5]); |
313 |
|
|
$$ld[4]=$fstart if $fstart<$lstart; |
314 |
|
|
$$ld[5]=$fend if $fend>$lend; |
315 |
|
|
push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $frame]); |
316 |
|
|
if ($f ne 'CDS') { |
317 |
|
|
if ($$ld[11] && $$ld[11] ne $f) { |
318 |
|
|
die("Error: multiple non-CDS subfeatures found for $recID ($$ld[11], $f)\n"); |
319 |
|
|
} |
320 |
|
|
$$ld[11]=$f; |
321 |
|
|
} |
322 |
|
|
} |
323 |
|
|
else { # first time seeing this locus/gene |
324 |
|
|
# get the attributes from this first exon line |
325 |
|
|
if ($gff3_Parent) { #has GFF3 Parent |
326 |
|
|
foreach my $a (@av) { |
327 |
|
|
my ($attr, $value)=split(/\s*=\s*/,$a,2); #" |
328 |
|
|
$attr='ID' if $attr eq 'Parent'; |
329 |
|
|
$value=~s/[" ]+$//;$value=~s/^[" ]+//; |
330 |
|
|
} |
331 |
|
|
} |
332 |
|
|
else { # GTF |
333 |
|
|
foreach my $a (@av) { |
334 |
|
|
my ($attr, $value)=split(/\s+"/,$a,2); #" |
335 |
|
|
$value=~s/[" ]+$//; |
336 |
|
|
if ($attr eq 'transcript_id') { |
337 |
|
|
$attr='ID'; |
338 |
|
|
} |
339 |
|
|
elsif ($attr eq 'gene_id') { |
340 |
|
|
$attr='Name'; |
341 |
|
|
} |
342 |
|
|
$attrs{$attr}=$value unless $attr=~m/^exon/;; |
343 |
|
|
} |
344 |
|
|
} |
345 |
|
|
$recs->{$recID} = ($f eq 'CDS') ? |
346 |
|
|
[$chr, $strand, $f, {%attrs}, $fstart, $fend, [], [[$fstart, $fend, $fscore, $frame]], $isGFF3, $rejected, $track, '','.' ] : |
347 |
|
|
[$chr, $strand, $f, {%attrs}, $fstart, $fend, [[$fstart, $fend, $fscore, $frame]], [], $isGFF3, $rejected, $track, $f,'.' ] ; |
348 |
|
|
# 0 1 2 3 4 5 6(exons) 7 (CDS) 8 9 10 |
349 |
|
|
} |
350 |
|
|
} |
351 |
|
|
close($gffh); |
352 |
|
|
} |
353 |
|
|
|
354 |
|
|
sub processGffRecs { |
355 |
|
|
#return if keys(%recs)==0; |
356 |
|
|
my ($recs, $rlist)=@_; |
357 |
|
|
my @recs_keys; |
358 |
|
|
unless ($rlist) { |
359 |
|
|
@recs_keys=keys(%$recs); |
360 |
|
|
$rlist=\@recs_keys; |
361 |
|
|
} |
362 |
|
|
foreach my $recid (@$rlist) { |
363 |
|
|
my $td=$$recs{$recid}; |
364 |
|
|
# 0 1 2 3 4 5 6 7 8 9 10 11 12 |
365 |
|
|
my ($chr, $strand, $feature, $attrs, $fstart, $fend, $er, $cr, $isGFF3, $rej, $track, $subf, $fscore) = @$td; |
366 |
|
|
next if ($rej && !$fltinvert); |
367 |
|
|
next if ($fltinvert && $featlist && !$rej); |
368 |
|
|
# my ($mstart,$mend)=($fstart, $fend); |
369 |
|
|
my $CDSonly=0; # set to 1 if only CDS segments were given |
370 |
|
|
my $hasCDS=0; |
371 |
|
|
my @ex; |
372 |
|
|
my @cds; |
373 |
|
|
#some records might lack exons, but have only CDS segments (e.g. mitochondrial genes) |
374 |
|
|
if (@$er<1 && @$cr>0) { |
375 |
|
|
@ex = sort { $a->[0] <=> $b->[0] } @$cr; |
376 |
|
|
@cds=@ex; |
377 |
|
|
$CDSonly=1; |
378 |
|
|
$hasCDS=1; |
379 |
|
|
} |
380 |
|
|
else { |
381 |
|
|
@ex = sort { $a->[0] <=> $b->[0] } @$er; |
382 |
|
|
if (@$cr>0) { # sort cds segments too |
383 |
|
|
@cds= sort { $a->[0] <=> $b->[0] } @$cr; |
384 |
|
|
$hasCDS=1; |
385 |
|
|
} |
386 |
|
|
} |
387 |
|
|
# -------------- |
388 |
|
|
# get the more accurate version of the start-end coords for the feature |
389 |
|
|
my $covlen=0; |
390 |
|
|
# map { $covlen+=$_->[1]-$_->[0]+1 } @ex; |
391 |
|
|
my ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); |
392 |
|
|
my $gffid=$recid; |
393 |
|
|
substr($gffid, 0, length($chr)+1)=''; |
394 |
|
|
die("Error: gffid ($gffid) not matching attrs\{ID\}(".$attrs-{'ID'}.")!\n") |
395 |
|
|
unless ($gffid eq $attrs->{'ID'}); |
396 |
|
|
my $doprint=1; |
397 |
|
|
#check all the filters |
398 |
|
|
$doprint=0 if ($only_ifCDS && $hasCDS==0); |
399 |
|
|
if ($doprint && $idfile) { |
400 |
|
|
if ($idfileattr) { |
401 |
|
|
$doprint=0 unless exists($idflt{$attrs->{$idfileattr}}); |
402 |
|
|
} |
403 |
|
|
else { |
404 |
|
|
$doprint=0 unless exists($idflt{$gffid}); |
405 |
|
|
} |
406 |
|
|
} |
407 |
|
|
if ($doprint && $avflt) { |
408 |
|
|
my $avfound=0; |
409 |
|
|
foreach my $a (keys(%attrflt)) { |
410 |
|
|
if ($attrflt{$a} eq $attrs->{$a}) { |
411 |
|
|
$avfound=1; |
412 |
|
|
last; |
413 |
|
|
} |
414 |
|
|
} |
415 |
|
|
$doprint=0 unless $avfound; |
416 |
|
|
} |
417 |
|
|
|
418 |
|
|
|
419 |
|
|
if ($doprint && $flt_chr && $flt_chr ne $chr) { |
420 |
|
|
$doprint=0; |
421 |
|
|
} |
422 |
|
|
if ($doprint) { |
423 |
|
|
if ($flt_strand && $flt_strand ne $strand) { |
424 |
|
|
$doprint=0; |
425 |
|
|
} |
426 |
|
|
} |
427 |
|
|
if ($doprint && @flt_intv>0) { |
428 |
|
|
if ($r_range) { #t span overlap any range intervals |
429 |
|
|
$doprint=1 if checkOvlExons($fstart, $fend, \@flt_intv); |
430 |
|
|
} |
431 |
|
|
elsif ($c_range) { #t span contained in any of the intervals |
432 |
|
|
$doprint=1 if checkWithinExons($fstart, $fend, \@flt_intv); |
433 |
|
|
} |
434 |
|
|
elsif ($e_range) { # each exon checked for overlap of any interval |
435 |
|
|
for my $ed (@ex) { |
436 |
|
|
if (checkOvlExons($$ed[0], $$ed[1], \@flt_intv)) { |
437 |
|
|
$doprint=1; |
438 |
|
|
last; |
439 |
|
|
} |
440 |
|
|
} |
441 |
|
|
} |
442 |
|
|
} |
443 |
|
|
|
444 |
|
|
$doprint = ! $doprint if $fltinvert; |
445 |
|
|
next unless $doprint; |
446 |
|
|
#filter passed, print output |
447 |
|
|
if ($tab_attrs) { |
448 |
|
|
my @od; |
449 |
|
|
foreach my $a (@atab_cols) { |
450 |
|
|
if (uc($a) eq 'ID') { |
451 |
|
|
push(@od, $gffid); |
452 |
|
|
next; |
453 |
|
|
} |
454 |
|
|
push(@od, $attrs->{$a}); |
455 |
|
|
} |
456 |
|
|
print join("\t",@od)."\n"; |
457 |
|
|
next; |
458 |
|
|
} |
459 |
|
|
#the core attributes ID, Name must be there already |
460 |
|
|
unless (exists($attrs->{Name})) { |
461 |
|
|
foreach my $name (qw(gene_name gene geneID geneId geneid locus loc ID)) { |
462 |
|
|
if (exists($attrs->{$name})) { |
463 |
|
|
$attrs->{Name}=$attrs->{$name}; |
464 |
|
|
last; |
465 |
|
|
} |
466 |
|
|
} |
467 |
|
|
} |
468 |
|
|
my ($tid, $tname)= (delete($attrs->{ID}), delete($attrs->{Name})); |
469 |
|
|
my ($gene_name, $gene, $locus); |
470 |
|
|
unless ($keepAll) { |
471 |
|
|
if (exists($attrs->{gene_name})) { |
472 |
|
|
$gene_name=delete($attrs->{gene_name}); |
473 |
|
|
$gene_name=undef if ($gene_name eq $tname && !exists($oattrs{gene_name})); |
474 |
|
|
} |
475 |
|
|
if (exists($attrs->{gene})) { |
476 |
|
|
$gene=delete($attrs->{gene}); |
477 |
|
|
$gene=undef if ($gene eq $tname || $gene eq $gene_name) && !exists($oattrs{gene}); |
478 |
|
|
} |
479 |
|
|
if (exists($attrs->{locus})) { |
480 |
|
|
$locus=delete($attrs->{locus}); |
481 |
|
|
$locus=undef if ($locus eq $gene_name || $locus eq $gene_name) && !exists($oattrs{locus}); |
482 |
|
|
} |
483 |
|
|
} |
484 |
|
|
my $tattrs; |
485 |
|
|
if ($printGTF) { #print GTF |
486 |
|
|
$tattrs='transcript_id "'.$tid.'"; gene_id "'.$tname.'";'; |
487 |
|
|
$tattrs.=' gene_name "'.$gene_name.'";' if $gene_name; |
488 |
|
|
$tattrs.=' gene "'.$gene.'";' if $gene; |
489 |
|
|
$tattrs.=' locus "'.$locus.'";' if $locus; |
490 |
|
|
if ($keepAll) { |
491 |
|
|
foreach my $attr (keys(%$attrs)) { |
492 |
|
|
$tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";'; |
493 |
|
|
} |
494 |
|
|
} |
495 |
|
|
else { # only a subset of attributes will be shown |
496 |
|
|
foreach my $attr (keys(%$attrs)) { |
497 |
|
|
next unless exists($oattrs{$attr}); |
498 |
|
|
$tattrs.=' '.$attr.' "'.$attrs->{$attr}.'";'; |
499 |
|
|
} |
500 |
|
|
} |
501 |
|
|
} |
502 |
|
|
else { #print GFF3 |
503 |
|
|
$tattrs='Parent='.$tid; |
504 |
|
|
my $pattrs="ID=$tid;Name=$tname"; |
505 |
|
|
$pattrs.='gene_name='.$gene_name.';' if $gene_name; |
506 |
|
|
$pattrs.='gene='.$gene.';' if $gene; |
507 |
|
|
$pattrs.='locus='.$locus.';' if $locus; |
508 |
|
|
if ($keepAll) { |
509 |
|
|
foreach my $attr (keys(%$attrs)) { |
510 |
|
|
my $val=$attrs->{$attr}; |
511 |
|
|
$pattrs.="$attr=$val;"; |
512 |
|
|
} |
513 |
|
|
} |
514 |
|
|
else { # only a subset of attributes will be shown |
515 |
|
|
foreach my $attr (keys(%$attrs)) { |
516 |
|
|
next unless exists($oattrs{$attr}); |
517 |
|
|
my $val=$attrs->{$attr}; |
518 |
|
|
$pattrs.="$attr=$val;"; |
519 |
|
|
} |
520 |
|
|
} |
521 |
|
|
print join("\t",$chr, $track, $feature, $fstart, $fend, $fscore, $strand, '.', $pattrs)."\n"; |
522 |
|
|
} |
523 |
|
|
if ($CDSonly==0) { #write exons only when found in the input |
524 |
|
|
foreach my $ed (@ex) { |
525 |
|
|
print join("\t",$chr, $track, $subf, $$ed[0], $$ed[1], $$ed[2], $strand, $$ed[3], $tattrs)."\n"; |
526 |
|
|
} |
527 |
|
|
} |
528 |
|
|
foreach my $cd (@cds) { |
529 |
|
|
print join("\t",$chr, $track, 'CDS', $$cd[0], $$cd[1], $$cd[2], $strand, $$cd[3], $tattrs)."\n"; |
530 |
|
|
} |
531 |
|
|
} #for each stored transcript |
532 |
|
|
} |