ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/prep4sequin.pl
Revision: 23
Committed: Tue Jul 26 21:44:38 2011 UTC (13 years, 2 months ago) by gpertea
Original Path: ann_bin/prep4sequin.pl
File size: 9143 byte(s)
Log Message:
adding misc scripts

Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5 use dbSession;
6
7 my $usage = q/Usage:
8 prep4sequin.pl -f <fasta_file> [-a <acc2version_table>] [-A] [-t <track>]
9 [-o <out_prefix>] [-c 'organism_code'] [-n '<organism_name>'] <gff_file>
10
11
12 Default <track> used for features is 'jigsaw'.
13
14 Creates a .fsa and .tbl files to be used by tbl2asn
15
16 /;
17 umask 0002;
18 getopts('n:c:Aa:f:t:o:') || die($usage."\n");
19 my $fastafile=$Getopt::Std::opt_f || die("$usage A fasta file is required!\n");
20 my $defpre=$fastafile;
21 $defpre=~s/\.\w+$//;
22 my $orgcode=$Getopt::Std::opt_c;
23 unless ($orgcode) {
24 ($orgcode)=($defpre=~m/([a-z,_]+)\d+$/i);
25 $orgcode=~tr/-_//d;
26 $orgcode=uc($orgcode);
27 $orgcode='BB' if $orgcode eq 'BS';
28 }
29 my $accfile=$Getopt::Std::opt_a;
30 my $accparse=$Getopt::Std::opt_A;
31
32 my $outprefix=$Getopt::Std::opt_o || $defpre;
33 my $orgname=$Getopt::Std::opt_n;
34 my $track=$Getopt::Std::opt_t || 'jigsaw';
35 # die("$usage A feature track must be specified!\n");
36 my $gff=shift(@ARGV) || die("$usage An input gff3 file must be given!\n");
37 die("$gff: nothing to do.") if (-e $gff && (-s $gff<10));
38
39 my $ds = dbSession->new('geanno@NEOSYBASE');
40 my $sth=$ds->prep('select xref_data from uniprot_xref where xrefdb in ("GenBank", "EMBL") and up_id=?');
41 my %acc;
42 if ($accfile) {
43 open(ACCFILE, $accfile) || die ("Error opening $accfile!\n");
44 chomp;
45 while (<ACCFILE>) {
46 my @a=split;
47 $acc{$a[0]}=$a[1];
48 }
49 close(ACCFILE);
50 }
51 open(FSA, '>'.$outprefix.'.fsa')
52 || die ("Error creating file $outprefix.fsa !\n");
53 open(FA, $fastafile)||die("Error opening $fastafile !\n");
54 my ($seqid, $seqlen);
55 while (<FA>) {
56 if (m/^>(\S+)\s*(.*)/) {
57 last if $seqid;#only one sequence !
58 ($seqid, my $rest)=($1, $2);
59 if ($accfile) {
60 my $accver=$acc{$seqid};
61 $seqid=$accver if $accver;
62 }
63 my ($enc)=($rest=~m/EN([mr]\d+)/);
64 $orgcode.='_'.$enc if ($enc);
65 if ($accparse) {
66 my ($short, $acc)=split(/\|/,$seqid);
67 if ($acc=~m/(\w+)v(\d+)$/) {
68 $seqid=$1.'.'.$2;
69 }
70 }
71 print FSA '>'.$seqid;
72 print FSA " [organism=$orgname]" if $orgname;
73 print FSA " [gcode=1] [primary=$seqid] $rest\n";
74 next;
75 }
76 print FSA $_;
77 tr/\n\r\t //d;
78 $seqlen+=length($_);
79 }
80 close(FSA);
81
82 open(FTBL, '>'.$outprefix.'.tbl')
83 || die("Error creating $outprefix.tbl file!\n");
84 open(GFF, $gff) || die("Error opening the input gff file: $gff !\n");
85 print FTBL ">Feature\t$seqid\n";
86 #print FTBL join("\t", 1, $seqlen, 'misc_feature')."\n";
87 my $credits='Predicted annotation generated by JIGSAW and other methods, and provided'.
88 ' by Geo Pertea, Mihaela Pertea, Jonathan Allen and Steven Salzberg, Center for'.
89 ' Bioinformatics and Computational Biology, University of Maryland';
90 #print FTBL join("\t", '', '', '','note', $credits)."\n";
91
92 my $cparent; #current parent - ASSUMES lines are ordered properly (children following parent)
93 my $cf; #current parent feature name (e.g. 'mRNA' or 'gene')
94 my $cxf; # child feature type: 'exon' or 'CDS'
95
96 my $cstrand;
97 my ($cstart, $cend);
98 #TODO: also use @ex for mRNA fkey (when UTRs are available)
99 my @cx; #children intervals
100 my ($cdescr, $gene_name, $mcov);
101 my %gn; #unique gene names
102 while (<GFF>) {
103 next if m/^\s*#/;
104 chomp;
105 my ($chr, $ftrack, $f, $fstart, $fend, $fscore, $strand, $frame, $attrs)=split(/\t/);
106 next unless lc($track) eq lc($ftrack);
107 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
108 $f='exon' if ($f =~m/\-exon$/);
109 my ($fid)=($attrs=~m/ID\s*=\s*"?([^;"]+)/);
110 my ($fp)=($attrs=~m/Parent\s*=\s*"?([^;"]+)/);
111 unless ($fid || $fp) {
112 print STDERR "Warning: feature line $_ has neither ID nor Parent! skipping..\n";
113 next;
114 }
115 if ($fp) { #child feature
116 if ($cparent && $cparent ne $fp) {
117 die("Error: invalid order of input lines (unknown parent $fp while still processing $cparent\n$_\n");
118 }
119 if ($cxf && $cxf ne $f) {
120 # TODO: will permit both 'exon' and 'CDS' subfeatures later
121 die("Error: multiple subfeatures are not accepted yet (parent $fp, $cf has subfeature $cxf already)\n");
122 }
123 $cxf=$f;
124 push(@cx, [$fstart, $fend]);
125 }
126 else { #parent
127 if ($cparent ne $fid) {
128 writeFeature() if @cx>0;
129 $cparent=$fid;
130 $cstrand=$strand;
131 $cf=$f;
132 $cxf='';
133 ($cstart, $cend)=($fstart, $fend);
134 @cx=();
135 $cdescr='';
136 }
137 else {
138 die("Error: duplicate parent entry for $fid\n");
139 }
140 # try to parse the description/annotation, if any
141 ($mcov)=($attrs=~m/MCov=([\d\.]+)/);
142 ($gene_name)=($attrs=~m/GeneId=([^\;]+)/);
143 if ($attrs=~m/TopHit\s*=\s*"?([^;"]+)/i) {
144 $cdescr=$1;
145 }
146 elsif ($attrs=~m/Descr\s*=\s*"?([^;"]+)/) {
147 $cdescr=$1;
148 }
149 elsif ($attrs=~m/Info\s*=\s*"?([^;"]+)/) {
150 $cdescr=$1;
151 }
152 elsif ($attrs=~m/BestHit\s*=\s*"?([^;"]+)/i) {
153 $cdescr=$1;
154 }
155 elsif ($attrs=~m/Name\s*=\s*"?([^;"]+)/) {
156 $cdescr=$1;
157 }
158 if ($cdescr) {
159 $cdescr=~s/^\s+//;$cdescr=~s/\s+$//;
160 $cdescr='' if index(lc($fid),lc($cdescr))>=0;
161 }
162 # if ($cdescr=~m/hypothetical/ && $cdescr=~m/\bLOC(\d+)/) {
163 #
164 # }
165 } #parent
166
167 }
168
169 writeFeature() if @cx>0;
170
171 close(GFF);
172 close(FTBL);
173
174 $sth->finish();
175 $ds->logout();
176
177 sub writeFeature {
178 #TODO: use @ex to decide mRNA and CDS
179 $cparent=~tr/.,:/___/s;
180 #print STDERR join("\t",$cparent, $cstrand, $cstart, $cend)."\n";
181 #my ($pl, $pr)=('<','>');
182 #$cstart=$pl.$cstart;
183 #$cend=$pr.$cend;
184 @cx= sort { $main::a->[0]<=>$main::b->[0] } @cx;
185 #$cx[0]->[0]=$pl.$cx[0]->[0];
186 #$cx[-1]->[1]=$pr.$cx[-1]->[1];
187 if ($cstrand eq '-') { #reverse complement features
188 ($cstart, $cend)=($cend, $cstart);
189 #@cx= sort { $main::b->[0]<=>$main::a->[0] } @cx;
190 @cx = map { $_=[$_->[1], $_->[0]] } @cx;
191 @cx= reverse(@cx);
192 }
193 my $s="\t";
194 my $cid='';
195 my $infrsim;
196 if ($cdescr) {
197 my $hdescr;
198 ($cid, $hdescr)=split(/ /,$cdescr,2);
199 $cdescr=$hdescr if $hdescr;
200 $cdescr=~s/^gid:\S+\s+//;
201 $cdescr=~s/^CDS:\d+\-\d+\s+//;
202 $cdescr=~s/UniRef100_\S+\s+//;
203 if ($cid=~m/^UPr?\|(\w+)/) {
204 my $upid=$1;
205 $cid=up2gb($upid);
206 if ($cid) {
207 $infrsim='AA sequence:INSD:'.$cid;
208 $cid.=' ';
209 }
210 $cdescr=~s/\([^\)]+\)\s*//g;
211 }
212 elsif ($cid=~m/[rpvinfmod]+\|([\w\.]+)/) { #refseq entry
213 $cid=$1;
214 my $refacc=$cid;
215 $refacc.='.1' unless $refacc=~m/\.\d+$/;
216 $infrsim='RNA sequence:RefSeq:'.$refacc;
217 $cid.=' ';
218 }
219 else {
220 $cid='';
221 }
222 }
223 #print FTBL join($s, '', '', '','locus_tag', $cparent)."\n";
224 my $note;
225 my $product='hypothetical mRNA';
226 my $cdsproduct='hypothetical protein';
227 my $gproduct='';
228 if ($mcov>50 && $cdescr) {
229 my $d=$cdescr; $d=~s/UniRef100_\S+\s+//;
230 $d=~s/\s+\{[ \.\,\-\"\'\w]+\}$//;
231 # make words start with lowercase:
232 $d =~ s/([A-Z][a-z]{5,})\b/\L$1/g;
233 $product=$d.' (predicted)';
234 $cdescr=~s/\{([ \.\,\-\"\'\w]+)\}$/\($1\)/;
235 #$note='supporting evidence includes similarity to '.$cid.$cdescr;
236
237 $note='similar to '.$cid.$cdescr;
238 $cdsproduct=$product;
239 }
240 else {
241 $gene_name='';
242 }
243 my $lpid; #local protein id
244 if ($gene_name) {
245 my $gnum=++$gn{$gene_name};
246 if ($gnum>1) {
247 $gene_name.='_'.$gnum;
248 }
249 $gene_name=~s/[ \-_]?predicted$//;
250 $lpid=$orgcode.'_'.$gene_name;
251 #$gene_name.='_predicted';
252 }
253 else {
254 my $jname=$cparent;
255 $jname=~tr/_//d;
256 $jname=~s/tjsm/jsm/;
257 $lpid=$orgcode.'_jsm'.uc(sprintf('%x',$cstart)).($cstrand eq '+' ? 'f':'r');
258 #$gene_name='jigsaw_prediction_'.$jname;
259 $gene_name=$lpid;
260 }
261 print FTBL join($s, '<'.$cstart, '>'.$cend, 'gene')."\n";
262 print FTBL join($s, '', '', '','gene', $gene_name)."\n";
263 my $jspredline=join($s,'','','','inference','ab initio prediction:JIGSAW:3.2')."\n";
264 print FTBL $jspredline;
265 if ($infrsim) {
266 print FTBL join($s,'','','','inference','similar to '.$infrsim)."\n";
267 }
268
269 #print FTBL join($s, '', '', '','locus_tag', $lpid)."\n";
270 my $x=shift(@cx);
271
272 print FTBL join($s, '<'.$$x[0], $$x[1], 'mRNA')."\n";
273 my $endrna= $cx[-1]->[1];
274 $cx[-1]->[1]='>'.$endrna;
275 foreach my $xc (@cx) {
276 print FTBL join($s, $$xc[0], $$xc[1])."\n";
277 }
278 print FTBL $jspredline;
279 #print FTBL join($s, '', '', '','transcript_id', $lpid.'-t')."\n";
280 print FTBL join($s, '', '', '','product', $product)."\n";
281 #print FTBL join($s, '', '', '','note', $note)."\n";
282 print FTBL join($s, $$x[0], $$x[1], 'CDS')."\n";
283 $cx[-1]->[1]=$endrna;
284 foreach my $xc (@cx) {
285 print FTBL join($s, $$xc[0], $$xc[1])."\n";
286 }
287 print FTBL $jspredline;
288 if ($infrsim) {
289 print FTBL join($s,'','','','inference','similar to '.$infrsim)."\n";
290 }
291 # print FTBL join($s, '', '', '','codon_start', 1)."\n";
292 print FTBL join($s, '', '', '','protein_id', 'gnl|NISC-CON|'.$lpid)."\n";
293 print FTBL join($s, '', '', '','product', $cdsproduct)."\n";
294 print FTBL join($s, '', '', '','note', $note)."\n" if $note;
295
296 }
297
298 sub up2gb {
299 my $uid=shift(@_);
300 $ds->exec($sth, $uid);
301 my $first;
302 while (my $r=$ds->fetch($sth)) {
303 $first=$$r[0] unless $first;
304 }
305 ($first)=($first=~m/^([\w\.]+)/) if $first;
306 return $first;
307 }

Properties

Name Value
svn:executable *