1 |
gpertea |
23 |
#!/bin/tcsh -f |
2 |
|
|
|
3 |
|
|
#set pmap = 'unipr.blat.gff3' |
4 |
|
|
set pmap = 'pexo_exon.gff3' |
5 |
|
|
#set gmap = 'refseq_exon.gff3' |
6 |
|
|
set gmap = 'sim4.gff3' |
7 |
|
|
set ptrack='p2g_exo' |
8 |
|
|
set mtrack='sim4cc' |
9 |
|
|
set protdb = '~/ann/protdb/unipr_mammals.fa.cidx' |
10 |
|
|
|
11 |
|
|
if ($2'x' == 'x') then |
12 |
|
|
echo "Usage: rebuild_ann.csh [<step>] <jigsaw_suffix> <dirs..>" |
13 |
|
|
echo " Must be run in the '.../encode/<species>/jigsaw' directory " |
14 |
|
|
echo " where the <dirs..> are" |
15 |
|
|
echo " <jigsaw_suffix>.jgff is/was used as jigsaw output suffix" |
16 |
|
|
echo " Example: " |
17 |
|
|
echo " rebuild_ann.csh 1 wpmap ra_0?" |
18 |
|
|
echo "" |
19 |
|
|
echo ' WARNING: make sure variables $pmap, $ptrack, $gmap and $mtrack are correct!' |
20 |
|
|
echo " (currently they are: $pmap (track: $ptrack) and $gmap (track: $mtrack))" |
21 |
|
|
echo "<step> is an optional numeric argument denoting the stage at which" |
22 |
|
|
echo " reprocessing should start (default is 2):" |
23 |
|
|
echo " 0: rerun jigsaw with <dirs> and evidence_<jigsaw_suffix> " |
24 |
|
|
echo " 1: combine the 3 jigsaw output file for each <dirs..>" |
25 |
|
|
echo " 2: (default) annotate jigsaw results from <dir>.recon_<jigsaw_suffix>.jgff.gff3" |
26 |
|
|
echo " 3: build the submit & gview files from <dir>.ann.recon_<jigsaw_suffix>.jgff.gff3" |
27 |
|
|
exit 1 |
28 |
|
|
endif |
29 |
|
|
|
30 |
|
|
set jstart=$1 |
31 |
|
|
set jstageinfo="annotation of predicted gene models" |
32 |
|
|
set stest=`echo "$jstart" | egrep -c '^[0-4]$'` |
33 |
|
|
if ( ${%jstart} > 1 || $stest < 1 ) then |
34 |
|
|
set jstart=2 |
35 |
|
|
else |
36 |
|
|
shift |
37 |
|
|
if ($jstart == 0) set jstageinfo="running jigsaw 3 times" |
38 |
|
|
if ($jstart == 1) set jstageinfo="combine the 3 jigsaw runs" |
39 |
|
|
if ($jstart == 3) set jstageinfo="generate submit and gview files" |
40 |
|
|
endif |
41 |
|
|
set jsuf=$1 |
42 |
|
|
set jsufbase=$jsuf |
43 |
|
|
shift |
44 |
|
|
|
45 |
|
|
set gvdir='/mylocal/geo/httpd/html/data/enc' |
46 |
|
|
if ( ! -d $gvdir ) then |
47 |
|
|
echo "Error: $gvdir not found!" |
48 |
|
|
exit 1 |
49 |
|
|
endif |
50 |
|
|
|
51 |
|
|
set startdir=$PWD #this doesn't resolve simbolic links |
52 |
|
|
set basedir=`pwd` |
53 |
|
|
set updir=$basedir:h # should be ~/ann/encode/spdir (remove ./jigsaw) |
54 |
|
|
set spdir=$updir:t #species directory name |
55 |
|
|
|
56 |
|
|
set tgtdir=$gvdir/$spdir |
57 |
|
|
if ( -d $tgtdir"_$jsufbase" ) then |
58 |
|
|
set tgtdir=$tgtdir"_$jsufbase" |
59 |
|
|
endif |
60 |
|
|
if ( ! -d $tgtdir) then |
61 |
|
|
mkdir -p $tgtdir |
62 |
|
|
endif |
63 |
|
|
|
64 |
|
|
set map=$updir/fa2encode.map |
65 |
|
|
if ( ! -f $map ) then |
66 |
|
|
echo "Error: $map file not found!" |
67 |
|
|
exit 1 |
68 |
|
|
endif |
69 |
|
|
|
70 |
|
|
|
71 |
|
|
echo "# Starting from stage $jstart ($jstageinfo)" |
72 |
|
|
if ($jstart < 2 ) then |
73 |
|
|
#locate jigsaw's evidence file |
74 |
|
|
set evf=`ls {ev,weight}*_$jsuf.jigsaw exp*/{exp,weight}*_$jsuf.jigsaw` |
75 |
|
|
if ( $%evf > 0 ) then |
76 |
|
|
set evf=$basedir/$evf[1] |
77 |
|
|
echo "Using jigsaw evidence file: $evf" |
78 |
|
|
else |
79 |
|
|
echo "Error: no evidence file found $evf" |
80 |
|
|
exit 1 |
81 |
|
|
endif |
82 |
|
|
#build the dirlist |
83 |
|
|
endif |
84 |
|
|
set jsuf="$jsuf.jgff" |
85 |
|
|
#echo "Jigsaw file suffix: $jsuf" |
86 |
|
|
|
87 |
|
|
foreach d ( $argv ) # { <------- for each directory |
88 |
|
|
echo "====================> $d <===================" |
89 |
|
|
if ( ! -d $d ) then |
90 |
|
|
echo "ERROR: directory $d not found!" |
91 |
|
|
exit 2 |
92 |
|
|
endif |
93 |
|
|
|
94 |
|
|
set j_gff = $d.recon_$jsuf.gff3 |
95 |
|
|
set j_anngff = $d.ann.recon_$jsuf.gff3 |
96 |
|
|
|
97 |
|
|
if ($jstart < 2) then |
98 |
|
|
#running linear jigsaw here.. change this if you want something else |
99 |
|
|
#prepare dirlist |
100 |
|
|
set jdirf=dirlst-$d.jigsaw |
101 |
|
|
echo "$basedir/$d" > $jdirf |
102 |
|
|
if ($jstart < 1) then |
103 |
|
|
echo "$basedir/${d}f" >> $jdirf |
104 |
|
|
echo "$basedir/${d}r" >> $jdirf |
105 |
|
|
set jcmd="run_jigsaw.pl -e $evf -l $jdirf -L -o $jsuf" |
106 |
|
|
echo "..running: $jcmd" |
107 |
|
|
$jcmd |
108 |
|
|
endif |
109 |
|
|
#-- now recombine |
110 |
|
|
set df=$d'f' |
111 |
|
|
set dr=$d'r' |
112 |
|
|
recon_jigsaw.pl -P $d/$d.fa -o $j_gff $d/$d.$jsuf $df/$df.$jsuf $dr/$dr.$jsuf |
113 |
|
|
#validate the file one more time: |
114 |
|
|
set badcds=$d.recon_$jsuf.bad_JIGSAW.gtf |
115 |
|
|
gffilter -g $d/$d.fa -b $badcds $j_gff |
116 |
|
|
if (-s $badcds) then |
117 |
|
|
echo "**** ERROR found at CDS validation for $j_gff! (see $badcds)" |
118 |
|
|
else |
119 |
|
|
/bin/rm -f $badcds |
120 |
|
|
endif |
121 |
|
|
endif # jstart is 0 or 1 |
122 |
|
|
|
123 |
|
|
#stage 2: annotation |
124 |
|
|
if ($jstart < 3) then |
125 |
|
|
if (-s $j_gff) then |
126 |
|
|
/bin/rm -f $d.[pg]map.i{it,fa} |
127 |
|
|
#gff2iit -o $d.pmap -t $ptrack $d/$d.$pmap |
128 |
|
|
gtf2gff -t $ptrack $d/$d.$pmap > $d.iit_pmap.gff3 |
129 |
|
|
gtf2gff -t 'pmap' $d/$d.pmap_exon.gff3 >> $d.iit_pmap.gff3 |
130 |
|
|
gff2iit -o $d.pmap $d.iit_pmap.gff3 |
131 |
|
|
#gff2iit -o $d.gmap -t $mtrack $d/$d.$gmap |
132 |
|
|
gtf2gff -t $mtrack $d/$d.$gmap | perl -pe 's/\.mrna(\d+)/\.m$1/' > $d.iit_gmap.gff3 |
133 |
|
|
gtf2gff -t 'gmap' $d/$d.gmap_exon.gff3 >> $d.iit_gmap.gff3 |
134 |
|
|
gff2iit -o $d.gmap -t $mtrack $d.iit_gmap.gff3 |
135 |
|
|
|
136 |
|
|
#echo "gffann.pl -o $j_anngff -t $d -m $d.gmap.iit -p $d.pmap.iit -P $protdb $j_gff" |
137 |
|
|
#gffann.pl -o $j_anngff -t $d -m $d.gmap.iit -p $d.pmap.iit -P $protdb $j_gff |
138 |
|
|
#echo "gffann.pl -o $j_anngff -t $d -m $d.gmap.iit -p $d.pmap.iit -P $protdb $j_gff" |
139 |
|
|
gffann.pl -o $j_anngff -t $d -m $d.gmap.iit -p $d.pmap.iit -P $protdb $j_gff |
140 |
|
|
# -- temporary: throw in all the mapping evidence |
141 |
|
|
gtf2gff -t $mtrack $d/$d.$gmap > $d.top5gmap.gff3 |
142 |
|
|
gtf2gff -t $ptrack $d/$d.$pmap > $d.top5pmap.gff3 |
143 |
|
|
gtf2gff -t 'gmap' $d/$d.gmap_exon.gff3 >> $d.top5gmap.gff3 |
144 |
|
|
|
145 |
|
|
gtf2gff -t 'pmap' $d/$d.pmap_exon.gff3 >> $d.top5pmap.gff3 |
146 |
|
|
# gtf2gff -t 'blat' $d/$d.pblat.gff3 >> $d.top5pmap.gff3 |
147 |
|
|
else #no predictions; just show whatever mappings we got |
148 |
|
|
cp $j_gff $j_anngff |
149 |
|
|
gtf2gff -t $mtrack $d/$d.$gmap | perl -pe 's/\.mrna(\d+)/\.m$1/' > $d.top5gmap.gff3 |
150 |
|
|
gtf2gff -t $ptrack $d/$d.$pmap > $d.top5pmap.gff3 |
151 |
|
|
gtf2gff -t 'gmap' $d/$d.gmap_exon.gff3 >> $d.top5gmap.gff3 |
152 |
|
|
gtf2gff -t 'pmap' $d/$d.pmap_exon.gff3 >> $d.top5pmap.gff3 |
153 |
|
|
# gtf2gff -t 'blat' $d/$d.pblat.gff3 >> $d.top5pmap.gff3 |
154 |
|
|
endif |
155 |
|
|
endif # jstart was 0,1 or 2 |
156 |
|
|
|
157 |
|
|
# stage 3: prepare submit and gview files |
158 |
|
|
set fo=$d:s/_// |
159 |
|
|
if ( ! -d submit ) then |
160 |
|
|
mkdir submit |
161 |
|
|
endif |
162 |
|
|
echo "jgff2tbl -f $d/$d.fa -A -o submit/$fo $j_anngff" |
163 |
|
|
jgff2tbl -f $d/$d.fa -A -o submit/$fo $j_anngff |
164 |
|
|
# -- now prepare and copy the gview files |
165 |
|
|
if ( ! -f $gvdir/tracks.def ) then |
166 |
|
|
echo "Error: file $gvdir/tracks.def not found!" |
167 |
|
|
exit 3 |
168 |
|
|
endif |
169 |
|
|
set m=`grep $d < $map | tr "\t" '~'` |
170 |
|
|
set m = $m:s/~/ / |
171 |
|
|
set ar=($m) |
172 |
|
|
set fn=$ar[1] |
173 |
|
|
if ($fn != $d) then |
174 |
|
|
echo "Error: grep for $d in $map didn't work properly!" |
175 |
|
|
exit 1 |
176 |
|
|
endif |
177 |
|
|
set en=$ar[2] |
178 |
|
|
set fsub=$d:s/_// |
179 |
|
|
#echo "> preparing $en ($fn) for gview in $gvdir/$spdir/" |
180 |
|
|
set tgf=../$en.gv.gff3 |
181 |
|
|
set bkm=../$en.bkm |
182 |
|
|
/bin/rm -f $tgf |
183 |
|
|
# set gnum=`cat $j_anngff | grep -c mRNA` |
184 |
|
|
# set gl=`cat $j_anngff | perl -ne 'print $1."," if m/GeneId=([\w\-]+)/'` |
185 |
|
|
# echo "$en\t$fn\t$gnum\t$gl" >> $tgtdir/$sm |
186 |
|
|
if ( -f submit/$fsub.tbl.gff3 ) then |
187 |
|
|
~/ann/bin/fltgff4gv -o $tgf -b $bkm submit/$fsub.tbl.gff3 |
188 |
|
|
else |
189 |
|
|
~/ann/bin/fltgff4gv -t jigsaw -o $tgf -b $bkm $j_anngff |
190 |
|
|
endif |
191 |
|
|
#~/ann/bin/fltgff4gv -t gmap < $fn/$fn.refseq_exon.gff3 >> $tgf |
192 |
|
|
#~/ann/bin/fltgff4gv -t gmap < $fn/$fn.sim4exon.gff >> $tgf |
193 |
|
|
~/ann/bin/fltgff4gv < $fn.top5gmap.gff3 >> $tgf |
194 |
|
|
~/ann/bin/fltgff4gv < $fn.top5pmap.gff3 >> $tgf |
195 |
|
|
sed 's/GlimmerHMM/glimmerhmm/' < $fn/$fn.glimmerhmm_pdom.gff >> $tgf |
196 |
|
|
cat $fn/$fn.twinscan.gff >> $tgf |
197 |
|
|
echo "running: gff2gview -d $tgtdir -o $en -T $gvdir/tracks.def $fn/$fn.fa $tgf" |
198 |
|
|
gff2gview -d $tgtdir -o $en -T $gvdir/tracks.def $fn/$fn.fa $tgf |
199 |
|
|
if ( -f $bkm ) then |
200 |
|
|
cp $bkm $tgtdir |
201 |
|
|
endif |
202 |
|
|
#ls -l $tgtdir/$en.* |
203 |
|
|
# prepare the sequin files for reviewing |
204 |
|
|
#mkdir sequin |
205 |
|
|
ln -s ../submit/$fo.{tbl,fsa} sequin/ |
206 |
|
|
tbl2asn -p sequin -t template.sequin |
207 |
|
|
end # } for each directory |
208 |
|
|
|
209 |
|
|
#-- rebuild the gene counts |
210 |
|
|
|
211 |
|
|
cd $tgtdir |
212 |
|
|
set gtab='gcount.tab' |
213 |
|
|
/bin/rm -f $gtab |
214 |
|
|
foreach bkm ( *.bkm ) |
215 |
|
|
set gnum=`wc -l < $bkm` |
216 |
|
|
set gl=`cut -f2 -d '|' $bkm | cut -f1 | perl -ne 'chomp;print "$_," unless m/\d+_jsm\w+$/'` |
217 |
|
|
set en=$bkm:r |
218 |
|
|
set fn=`grep $en < $map | cut -f1` |
219 |
|
|
echo "$en\t$fn\t$gnum\t$gl" >> $gtab |
220 |
|
|
end #for each gene bookmark file |
221 |
|
|
cd $basedir |
222 |
|
|
#echo "done." |
223 |
|
|
exit 0 |