ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/rebuild_ann.csh
Revision: 23
Committed: Tue Jul 26 21:44:38 2011 UTC (13 years ago) by gpertea
Original Path: ann_bin/rebuild_ann.csh
File size: 7479 byte(s)
Log Message:
adding misc scripts

Line User Rev File contents
1 gpertea 23 #!/bin/tcsh -f
2    
3     #set pmap = 'unipr.blat.gff3'
4     set pmap = 'pexo_exon.gff3'
5     #set gmap = 'refseq_exon.gff3'
6     set gmap = 'sim4.gff3'
7     set ptrack='p2g_exo'
8     set mtrack='sim4cc'
9     set protdb = '~/ann/protdb/unipr_mammals.fa.cidx'
10    
11     if ($2'x' == 'x') then
12     echo "Usage: rebuild_ann.csh [<step>] <jigsaw_suffix> <dirs..>"
13     echo " Must be run in the '.../encode/<species>/jigsaw' directory "
14     echo " where the <dirs..> are"
15     echo " <jigsaw_suffix>.jgff is/was used as jigsaw output suffix"
16     echo " Example: "
17     echo " rebuild_ann.csh 1 wpmap ra_0?"
18     echo ""
19     echo ' WARNING: make sure variables $pmap, $ptrack, $gmap and $mtrack are correct!'
20     echo " (currently they are: $pmap (track: $ptrack) and $gmap (track: $mtrack))"
21     echo "<step> is an optional numeric argument denoting the stage at which"
22     echo " reprocessing should start (default is 2):"
23     echo " 0: rerun jigsaw with <dirs> and evidence_<jigsaw_suffix> "
24     echo " 1: combine the 3 jigsaw output file for each <dirs..>"
25     echo " 2: (default) annotate jigsaw results from <dir>.recon_<jigsaw_suffix>.jgff.gff3"
26     echo " 3: build the submit & gview files from <dir>.ann.recon_<jigsaw_suffix>.jgff.gff3"
27     exit 1
28     endif
29    
30     set jstart=$1
31     set jstageinfo="annotation of predicted gene models"
32     set stest=`echo "$jstart" | egrep -c '^[0-4]$'`
33     if ( ${%jstart} > 1 || $stest < 1 ) then
34     set jstart=2
35     else
36     shift
37     if ($jstart == 0) set jstageinfo="running jigsaw 3 times"
38     if ($jstart == 1) set jstageinfo="combine the 3 jigsaw runs"
39     if ($jstart == 3) set jstageinfo="generate submit and gview files"
40     endif
41     set jsuf=$1
42     set jsufbase=$jsuf
43     shift
44    
45     set gvdir='/mylocal/geo/httpd/html/data/enc'
46     if ( ! -d $gvdir ) then
47     echo "Error: $gvdir not found!"
48     exit 1
49     endif
50    
51     set startdir=$PWD #this doesn't resolve simbolic links
52     set basedir=`pwd`
53     set updir=$basedir:h # should be ~/ann/encode/spdir (remove ./jigsaw)
54     set spdir=$updir:t #species directory name
55    
56     set tgtdir=$gvdir/$spdir
57     if ( -d $tgtdir"_$jsufbase" ) then
58     set tgtdir=$tgtdir"_$jsufbase"
59     endif
60     if ( ! -d $tgtdir) then
61     mkdir -p $tgtdir
62     endif
63    
64     set map=$updir/fa2encode.map
65     if ( ! -f $map ) then
66     echo "Error: $map file not found!"
67     exit 1
68     endif
69    
70    
71     echo "# Starting from stage $jstart ($jstageinfo)"
72     if ($jstart < 2 ) then
73     #locate jigsaw's evidence file
74     set evf=`ls {ev,weight}*_$jsuf.jigsaw exp*/{exp,weight}*_$jsuf.jigsaw`
75     if ( $%evf > 0 ) then
76     set evf=$basedir/$evf[1]
77     echo "Using jigsaw evidence file: $evf"
78     else
79     echo "Error: no evidence file found $evf"
80     exit 1
81     endif
82     #build the dirlist
83     endif
84     set jsuf="$jsuf.jgff"
85     #echo "Jigsaw file suffix: $jsuf"
86    
87     foreach d ( $argv ) # { <------- for each directory
88     echo "====================> $d <==================="
89     if ( ! -d $d ) then
90     echo "ERROR: directory $d not found!"
91     exit 2
92     endif
93    
94     set j_gff = $d.recon_$jsuf.gff3
95     set j_anngff = $d.ann.recon_$jsuf.gff3
96    
97     if ($jstart < 2) then
98     #running linear jigsaw here.. change this if you want something else
99     #prepare dirlist
100     set jdirf=dirlst-$d.jigsaw
101     echo "$basedir/$d" > $jdirf
102     if ($jstart < 1) then
103     echo "$basedir/${d}f" >> $jdirf
104     echo "$basedir/${d}r" >> $jdirf
105     set jcmd="run_jigsaw.pl -e $evf -l $jdirf -L -o $jsuf"
106     echo "..running: $jcmd"
107     $jcmd
108     endif
109     #-- now recombine
110     set df=$d'f'
111     set dr=$d'r'
112     recon_jigsaw.pl -P $d/$d.fa -o $j_gff $d/$d.$jsuf $df/$df.$jsuf $dr/$dr.$jsuf
113     #validate the file one more time:
114     set badcds=$d.recon_$jsuf.bad_JIGSAW.gtf
115     gffilter -g $d/$d.fa -b $badcds $j_gff
116     if (-s $badcds) then
117     echo "**** ERROR found at CDS validation for $j_gff! (see $badcds)"
118     else
119     /bin/rm -f $badcds
120     endif
121     endif # jstart is 0 or 1
122    
123     #stage 2: annotation
124     if ($jstart < 3) then
125     if (-s $j_gff) then
126     /bin/rm -f $d.[pg]map.i{it,fa}
127     #gff2iit -o $d.pmap -t $ptrack $d/$d.$pmap
128     gtf2gff -t $ptrack $d/$d.$pmap > $d.iit_pmap.gff3
129     gtf2gff -t 'pmap' $d/$d.pmap_exon.gff3 >> $d.iit_pmap.gff3
130     gff2iit -o $d.pmap $d.iit_pmap.gff3
131     #gff2iit -o $d.gmap -t $mtrack $d/$d.$gmap
132     gtf2gff -t $mtrack $d/$d.$gmap | perl -pe 's/\.mrna(\d+)/\.m$1/' > $d.iit_gmap.gff3
133     gtf2gff -t 'gmap' $d/$d.gmap_exon.gff3 >> $d.iit_gmap.gff3
134     gff2iit -o $d.gmap -t $mtrack $d.iit_gmap.gff3
135    
136     #echo "gffann.pl -o $j_anngff -t $d -m $d.gmap.iit -p $d.pmap.iit -P $protdb $j_gff"
137     #gffann.pl -o $j_anngff -t $d -m $d.gmap.iit -p $d.pmap.iit -P $protdb $j_gff
138     #echo "gffann.pl -o $j_anngff -t $d -m $d.gmap.iit -p $d.pmap.iit -P $protdb $j_gff"
139     gffann.pl -o $j_anngff -t $d -m $d.gmap.iit -p $d.pmap.iit -P $protdb $j_gff
140     # -- temporary: throw in all the mapping evidence
141     gtf2gff -t $mtrack $d/$d.$gmap > $d.top5gmap.gff3
142     gtf2gff -t $ptrack $d/$d.$pmap > $d.top5pmap.gff3
143     gtf2gff -t 'gmap' $d/$d.gmap_exon.gff3 >> $d.top5gmap.gff3
144    
145     gtf2gff -t 'pmap' $d/$d.pmap_exon.gff3 >> $d.top5pmap.gff3
146     # gtf2gff -t 'blat' $d/$d.pblat.gff3 >> $d.top5pmap.gff3
147     else #no predictions; just show whatever mappings we got
148     cp $j_gff $j_anngff
149     gtf2gff -t $mtrack $d/$d.$gmap | perl -pe 's/\.mrna(\d+)/\.m$1/' > $d.top5gmap.gff3
150     gtf2gff -t $ptrack $d/$d.$pmap > $d.top5pmap.gff3
151     gtf2gff -t 'gmap' $d/$d.gmap_exon.gff3 >> $d.top5gmap.gff3
152     gtf2gff -t 'pmap' $d/$d.pmap_exon.gff3 >> $d.top5pmap.gff3
153     # gtf2gff -t 'blat' $d/$d.pblat.gff3 >> $d.top5pmap.gff3
154     endif
155     endif # jstart was 0,1 or 2
156    
157     # stage 3: prepare submit and gview files
158     set fo=$d:s/_//
159     if ( ! -d submit ) then
160     mkdir submit
161     endif
162     echo "jgff2tbl -f $d/$d.fa -A -o submit/$fo $j_anngff"
163     jgff2tbl -f $d/$d.fa -A -o submit/$fo $j_anngff
164     # -- now prepare and copy the gview files
165     if ( ! -f $gvdir/tracks.def ) then
166     echo "Error: file $gvdir/tracks.def not found!"
167     exit 3
168     endif
169     set m=`grep $d < $map | tr "\t" '~'`
170     set m = $m:s/~/ /
171     set ar=($m)
172     set fn=$ar[1]
173     if ($fn != $d) then
174     echo "Error: grep for $d in $map didn't work properly!"
175     exit 1
176     endif
177     set en=$ar[2]
178     set fsub=$d:s/_//
179     #echo "> preparing $en ($fn) for gview in $gvdir/$spdir/"
180     set tgf=../$en.gv.gff3
181     set bkm=../$en.bkm
182     /bin/rm -f $tgf
183     # set gnum=`cat $j_anngff | grep -c mRNA`
184     # set gl=`cat $j_anngff | perl -ne 'print $1."," if m/GeneId=([\w\-]+)/'`
185     # echo "$en\t$fn\t$gnum\t$gl" >> $tgtdir/$sm
186     if ( -f submit/$fsub.tbl.gff3 ) then
187     ~/ann/bin/fltgff4gv -o $tgf -b $bkm submit/$fsub.tbl.gff3
188     else
189     ~/ann/bin/fltgff4gv -t jigsaw -o $tgf -b $bkm $j_anngff
190     endif
191     #~/ann/bin/fltgff4gv -t gmap < $fn/$fn.refseq_exon.gff3 >> $tgf
192     #~/ann/bin/fltgff4gv -t gmap < $fn/$fn.sim4exon.gff >> $tgf
193     ~/ann/bin/fltgff4gv < $fn.top5gmap.gff3 >> $tgf
194     ~/ann/bin/fltgff4gv < $fn.top5pmap.gff3 >> $tgf
195     sed 's/GlimmerHMM/glimmerhmm/' < $fn/$fn.glimmerhmm_pdom.gff >> $tgf
196     cat $fn/$fn.twinscan.gff >> $tgf
197     echo "running: gff2gview -d $tgtdir -o $en -T $gvdir/tracks.def $fn/$fn.fa $tgf"
198     gff2gview -d $tgtdir -o $en -T $gvdir/tracks.def $fn/$fn.fa $tgf
199     if ( -f $bkm ) then
200     cp $bkm $tgtdir
201     endif
202     #ls -l $tgtdir/$en.*
203     # prepare the sequin files for reviewing
204     #mkdir sequin
205     ln -s ../submit/$fo.{tbl,fsa} sequin/
206     tbl2asn -p sequin -t template.sequin
207     end # } for each directory
208    
209     #-- rebuild the gene counts
210    
211     cd $tgtdir
212     set gtab='gcount.tab'
213     /bin/rm -f $gtab
214     foreach bkm ( *.bkm )
215     set gnum=`wc -l < $bkm`
216     set gl=`cut -f2 -d '|' $bkm | cut -f1 | perl -ne 'chomp;print "$_," unless m/\d+_jsm\w+$/'`
217     set en=$bkm:r
218     set fn=`grep $en < $map | cut -f1`
219     echo "$en\t$fn\t$gnum\t$gl" >> $gtab
220     end #for each gene bookmark file
221     cd $basedir
222     #echo "done."
223     exit 0

Properties

Name Value
svn:executable *