1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
use dbSession; |
6 |
|
7 |
my $usage = q/Usage: |
8 |
prep4sequin.pl -f <fasta_file> [-a <acc2version_table>] [-A] [-t <track>] |
9 |
[-o <out_prefix>] [-c 'organism_code'] [-n '<organism_name>'] <gff_file> |
10 |
|
11 |
|
12 |
Default <track> used for features is 'jigsaw'. |
13 |
|
14 |
Creates a .fsa and .tbl files to be used by tbl2asn |
15 |
|
16 |
/; |
17 |
umask 0002; |
18 |
getopts('n:c:Aa:f:t:o:') || die($usage."\n"); |
19 |
my $fastafile=$Getopt::Std::opt_f || die("$usage A fasta file is required!\n"); |
20 |
my $defpre=$fastafile; |
21 |
$defpre=~s/\.\w+$//; |
22 |
my $orgcode=$Getopt::Std::opt_c; |
23 |
unless ($orgcode) { |
24 |
($orgcode)=($defpre=~m/([a-z,_]+)\d+$/i); |
25 |
$orgcode=~tr/-_//d; |
26 |
$orgcode=uc($orgcode); |
27 |
$orgcode='BB' if $orgcode eq 'BS'; |
28 |
} |
29 |
my $accfile=$Getopt::Std::opt_a; |
30 |
my $accparse=$Getopt::Std::opt_A; |
31 |
|
32 |
my $outprefix=$Getopt::Std::opt_o || $defpre; |
33 |
my $orgname=$Getopt::Std::opt_n; |
34 |
my $track=$Getopt::Std::opt_t || 'jigsaw'; |
35 |
# die("$usage A feature track must be specified!\n"); |
36 |
my $gff=shift(@ARGV) || die("$usage An input gff3 file must be given!\n"); |
37 |
die("$gff: nothing to do.") if (-e $gff && (-s $gff<10)); |
38 |
|
39 |
my $ds = dbSession->new('geanno@NEOSYBASE'); |
40 |
my $sth=$ds->prep('select xref_data from uniprot_xref where xrefdb in ("GenBank", "EMBL") and up_id=?'); |
41 |
my %acc; |
42 |
if ($accfile) { |
43 |
open(ACCFILE, $accfile) || die ("Error opening $accfile!\n"); |
44 |
chomp; |
45 |
while (<ACCFILE>) { |
46 |
my @a=split; |
47 |
$acc{$a[0]}=$a[1]; |
48 |
} |
49 |
close(ACCFILE); |
50 |
} |
51 |
open(FSA, '>'.$outprefix.'.fsa') |
52 |
|| die ("Error creating file $outprefix.fsa !\n"); |
53 |
open(FA, $fastafile)||die("Error opening $fastafile !\n"); |
54 |
my ($seqid, $seqlen); |
55 |
while (<FA>) { |
56 |
if (m/^>(\S+)\s*(.*)/) { |
57 |
last if $seqid;#only one sequence ! |
58 |
($seqid, my $rest)=($1, $2); |
59 |
if ($accfile) { |
60 |
my $accver=$acc{$seqid}; |
61 |
$seqid=$accver if $accver; |
62 |
} |
63 |
my ($enc)=($rest=~m/EN([mr]\d+)/); |
64 |
$orgcode.='_'.$enc if ($enc); |
65 |
if ($accparse) { |
66 |
my ($short, $acc)=split(/\|/,$seqid); |
67 |
if ($acc=~m/(\w+)v(\d+)$/) { |
68 |
$seqid=$1.'.'.$2; |
69 |
} |
70 |
} |
71 |
print FSA '>'.$seqid; |
72 |
print FSA " [organism=$orgname]" if $orgname; |
73 |
print FSA " [gcode=1] [primary=$seqid] $rest\n"; |
74 |
next; |
75 |
} |
76 |
print FSA $_; |
77 |
tr/\n\r\t //d; |
78 |
$seqlen+=length($_); |
79 |
} |
80 |
close(FSA); |
81 |
|
82 |
open(FTBL, '>'.$outprefix.'.tbl') |
83 |
|| die("Error creating $outprefix.tbl file!\n"); |
84 |
open(GFF, $gff) || die("Error opening the input gff file: $gff !\n"); |
85 |
print FTBL ">Feature\t$seqid\n"; |
86 |
#print FTBL join("\t", 1, $seqlen, 'misc_feature')."\n"; |
87 |
my $credits='Predicted annotation generated by JIGSAW and other methods, and provided'. |
88 |
' by Geo Pertea, Mihaela Pertea, Jonathan Allen and Steven Salzberg, Center for'. |
89 |
' Bioinformatics and Computational Biology, University of Maryland'; |
90 |
#print FTBL join("\t", '', '', '','note', $credits)."\n"; |
91 |
|
92 |
my $cparent; #current parent - ASSUMES lines are ordered properly (children following parent) |
93 |
my $cf; #current parent feature name (e.g. 'mRNA' or 'gene') |
94 |
my $cxf; # child feature type: 'exon' or 'CDS' |
95 |
|
96 |
my $cstrand; |
97 |
my ($cstart, $cend); |
98 |
#TODO: also use @ex for mRNA fkey (when UTRs are available) |
99 |
my @cx; #children intervals |
100 |
my ($cdescr, $gene_name, $mcov); |
101 |
my %gn; #unique gene names |
102 |
while (<GFF>) { |
103 |
next if m/^\s*#/; |
104 |
chomp; |
105 |
my ($chr, $ftrack, $f, $fstart, $fend, $fscore, $strand, $frame, $attrs)=split(/\t/); |
106 |
next unless lc($track) eq lc($ftrack); |
107 |
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
108 |
$f='exon' if ($f =~m/\-exon$/); |
109 |
my ($fid)=($attrs=~m/ID\s*=\s*"?([^;"]+)/); |
110 |
my ($fp)=($attrs=~m/Parent\s*=\s*"?([^;"]+)/); |
111 |
unless ($fid || $fp) { |
112 |
print STDERR "Warning: feature line $_ has neither ID nor Parent! skipping..\n"; |
113 |
next; |
114 |
} |
115 |
if ($fp) { #child feature |
116 |
if ($cparent && $cparent ne $fp) { |
117 |
die("Error: invalid order of input lines (unknown parent $fp while still processing $cparent\n$_\n"); |
118 |
} |
119 |
if ($cxf && $cxf ne $f) { |
120 |
# TODO: will permit both 'exon' and 'CDS' subfeatures later |
121 |
die("Error: multiple subfeatures are not accepted yet (parent $fp, $cf has subfeature $cxf already)\n"); |
122 |
} |
123 |
$cxf=$f; |
124 |
push(@cx, [$fstart, $fend]); |
125 |
} |
126 |
else { #parent |
127 |
if ($cparent ne $fid) { |
128 |
writeFeature() if @cx>0; |
129 |
$cparent=$fid; |
130 |
$cstrand=$strand; |
131 |
$cf=$f; |
132 |
$cxf=''; |
133 |
($cstart, $cend)=($fstart, $fend); |
134 |
@cx=(); |
135 |
$cdescr=''; |
136 |
} |
137 |
else { |
138 |
die("Error: duplicate parent entry for $fid\n"); |
139 |
} |
140 |
# try to parse the description/annotation, if any |
141 |
($mcov)=($attrs=~m/MCov=([\d\.]+)/); |
142 |
($gene_name)=($attrs=~m/GeneId=([^\;]+)/); |
143 |
if ($attrs=~m/TopHit\s*=\s*"?([^;"]+)/i) { |
144 |
$cdescr=$1; |
145 |
} |
146 |
elsif ($attrs=~m/Descr\s*=\s*"?([^;"]+)/) { |
147 |
$cdescr=$1; |
148 |
} |
149 |
elsif ($attrs=~m/Info\s*=\s*"?([^;"]+)/) { |
150 |
$cdescr=$1; |
151 |
} |
152 |
elsif ($attrs=~m/BestHit\s*=\s*"?([^;"]+)/i) { |
153 |
$cdescr=$1; |
154 |
} |
155 |
elsif ($attrs=~m/Name\s*=\s*"?([^;"]+)/) { |
156 |
$cdescr=$1; |
157 |
} |
158 |
if ($cdescr) { |
159 |
$cdescr=~s/^\s+//;$cdescr=~s/\s+$//; |
160 |
$cdescr='' if index(lc($fid),lc($cdescr))>=0; |
161 |
} |
162 |
# if ($cdescr=~m/hypothetical/ && $cdescr=~m/\bLOC(\d+)/) { |
163 |
# |
164 |
# } |
165 |
} #parent |
166 |
|
167 |
} |
168 |
|
169 |
writeFeature() if @cx>0; |
170 |
|
171 |
close(GFF); |
172 |
close(FTBL); |
173 |
|
174 |
$sth->finish(); |
175 |
$ds->logout(); |
176 |
|
177 |
sub writeFeature { |
178 |
#TODO: use @ex to decide mRNA and CDS |
179 |
$cparent=~tr/.,:/___/s; |
180 |
#print STDERR join("\t",$cparent, $cstrand, $cstart, $cend)."\n"; |
181 |
#my ($pl, $pr)=('<','>'); |
182 |
#$cstart=$pl.$cstart; |
183 |
#$cend=$pr.$cend; |
184 |
@cx= sort { $main::a->[0]<=>$main::b->[0] } @cx; |
185 |
#$cx[0]->[0]=$pl.$cx[0]->[0]; |
186 |
#$cx[-1]->[1]=$pr.$cx[-1]->[1]; |
187 |
if ($cstrand eq '-') { #reverse complement features |
188 |
($cstart, $cend)=($cend, $cstart); |
189 |
#@cx= sort { $main::b->[0]<=>$main::a->[0] } @cx; |
190 |
@cx = map { $_=[$_->[1], $_->[0]] } @cx; |
191 |
@cx= reverse(@cx); |
192 |
} |
193 |
my $s="\t"; |
194 |
my $cid=''; |
195 |
my $infrsim; |
196 |
if ($cdescr) { |
197 |
my $hdescr; |
198 |
($cid, $hdescr)=split(/ /,$cdescr,2); |
199 |
$cdescr=$hdescr if $hdescr; |
200 |
$cdescr=~s/^gid:\S+\s+//; |
201 |
$cdescr=~s/^CDS:\d+\-\d+\s+//; |
202 |
$cdescr=~s/UniRef100_\S+\s+//; |
203 |
if ($cid=~m/^UPr?\|(\w+)/) { |
204 |
my $upid=$1; |
205 |
$cid=up2gb($upid); |
206 |
if ($cid) { |
207 |
$infrsim='AA sequence:INSD:'.$cid; |
208 |
$cid.=' '; |
209 |
} |
210 |
$cdescr=~s/\([^\)]+\)\s*//g; |
211 |
} |
212 |
elsif ($cid=~m/[rpvinfmod]+\|([\w\.]+)/) { #refseq entry |
213 |
$cid=$1; |
214 |
my $refacc=$cid; |
215 |
$refacc.='.1' unless $refacc=~m/\.\d+$/; |
216 |
$infrsim='RNA sequence:RefSeq:'.$refacc; |
217 |
$cid.=' '; |
218 |
} |
219 |
else { |
220 |
$cid=''; |
221 |
} |
222 |
} |
223 |
#print FTBL join($s, '', '', '','locus_tag', $cparent)."\n"; |
224 |
my $note; |
225 |
my $product='hypothetical mRNA'; |
226 |
my $cdsproduct='hypothetical protein'; |
227 |
my $gproduct=''; |
228 |
if ($mcov>50 && $cdescr) { |
229 |
my $d=$cdescr; $d=~s/UniRef100_\S+\s+//; |
230 |
$d=~s/\s+\{[ \.\,\-\"\'\w]+\}$//; |
231 |
# make words start with lowercase: |
232 |
$d =~ s/([A-Z][a-z]{5,})\b/\L$1/g; |
233 |
$product=$d.' (predicted)'; |
234 |
$cdescr=~s/\{([ \.\,\-\"\'\w]+)\}$/\($1\)/; |
235 |
#$note='supporting evidence includes similarity to '.$cid.$cdescr; |
236 |
|
237 |
$note='similar to '.$cid.$cdescr; |
238 |
$cdsproduct=$product; |
239 |
} |
240 |
else { |
241 |
$gene_name=''; |
242 |
} |
243 |
my $lpid; #local protein id |
244 |
if ($gene_name) { |
245 |
my $gnum=++$gn{$gene_name}; |
246 |
if ($gnum>1) { |
247 |
$gene_name.='_'.$gnum; |
248 |
} |
249 |
$gene_name=~s/[ \-_]?predicted$//; |
250 |
$lpid=$orgcode.'_'.$gene_name; |
251 |
#$gene_name.='_predicted'; |
252 |
} |
253 |
else { |
254 |
my $jname=$cparent; |
255 |
$jname=~tr/_//d; |
256 |
$jname=~s/tjsm/jsm/; |
257 |
$lpid=$orgcode.'_jsm'.uc(sprintf('%x',$cstart)).($cstrand eq '+' ? 'f':'r'); |
258 |
#$gene_name='jigsaw_prediction_'.$jname; |
259 |
$gene_name=$lpid; |
260 |
} |
261 |
print FTBL join($s, '<'.$cstart, '>'.$cend, 'gene')."\n"; |
262 |
print FTBL join($s, '', '', '','gene', $gene_name)."\n"; |
263 |
my $jspredline=join($s,'','','','inference','ab initio prediction:JIGSAW:3.2')."\n"; |
264 |
print FTBL $jspredline; |
265 |
if ($infrsim) { |
266 |
print FTBL join($s,'','','','inference','similar to '.$infrsim)."\n"; |
267 |
} |
268 |
|
269 |
#print FTBL join($s, '', '', '','locus_tag', $lpid)."\n"; |
270 |
my $x=shift(@cx); |
271 |
|
272 |
print FTBL join($s, '<'.$$x[0], $$x[1], 'mRNA')."\n"; |
273 |
my $endrna= $cx[-1]->[1]; |
274 |
$cx[-1]->[1]='>'.$endrna; |
275 |
foreach my $xc (@cx) { |
276 |
print FTBL join($s, $$xc[0], $$xc[1])."\n"; |
277 |
} |
278 |
print FTBL $jspredline; |
279 |
#print FTBL join($s, '', '', '','transcript_id', $lpid.'-t')."\n"; |
280 |
print FTBL join($s, '', '', '','product', $product)."\n"; |
281 |
#print FTBL join($s, '', '', '','note', $note)."\n"; |
282 |
print FTBL join($s, $$x[0], $$x[1], 'CDS')."\n"; |
283 |
$cx[-1]->[1]=$endrna; |
284 |
foreach my $xc (@cx) { |
285 |
print FTBL join($s, $$xc[0], $$xc[1])."\n"; |
286 |
} |
287 |
print FTBL $jspredline; |
288 |
if ($infrsim) { |
289 |
print FTBL join($s,'','','','inference','similar to '.$infrsim)."\n"; |
290 |
} |
291 |
# print FTBL join($s, '', '', '','codon_start', 1)."\n"; |
292 |
print FTBL join($s, '', '', '','protein_id', 'gnl|NISC-CON|'.$lpid)."\n"; |
293 |
print FTBL join($s, '', '', '','product', $cdsproduct)."\n"; |
294 |
print FTBL join($s, '', '', '','note', $note)."\n" if $note; |
295 |
|
296 |
} |
297 |
|
298 |
sub up2gb { |
299 |
my $uid=shift(@_); |
300 |
$ds->exec($sth, $uid); |
301 |
my $first; |
302 |
while (my $r=$ds->fetch($sth)) { |
303 |
$first=$$r[0] unless $first; |
304 |
} |
305 |
($first)=($first=~m/^([\w\.]+)/) if $first; |
306 |
return $first; |
307 |
} |