ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/cuffdiff_loc_check.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size: 14591 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 cuffdiff_loc_check.pl [-c <min_cov>] [-f <min_fpkm>] [-l <labels>] [-b <bamfiles>] \
8 <gene_exp.diff> <cuffcompare.loci> <cuffcompare.tracking> <transfrags1.ifa> <transfrags2.ifa>
9
10 (assume that transfrags?.ifa files were created using gff2iit -XTA )
11 If BAM files are provided for specificity validation, they must be indexed.
12 /;
13 umask 0002;
14 getopts('b:f:c:l:o:') || die($usage."\n");
15 my $outfile=$Getopt::Std::opt_o;
16 my $lnames=$Getopt::Std::opt_l;
17 my $bamfiles=$Getopt::Std::opt_b;
18 my $MIN_COV=$Getopt::Std::opt_c || 2;
19 my $MIN_FPKM=$Getopt::Std::opt_f || 2;
20 if ($outfile) {
21 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
22 select(OUTF);
23 }
24
25 my @labels=split(/[\,\s\;]/,$lnames) if $lnames;
26 my @bam=split(/[\,\s\;]/,$bamfiles) if $bamfiles;
27 foreach my $bfile (@bam) {
28 die("Error: file $bfile not found!\n") unless -f $bfile;
29 die("Error: bam index $bfile.bai not found!\n") unless -f $bfile.'.bai';
30 }
31 unless ($labels[1]) {
32 @labels=('Exp1','Exp2');
33 }
34 # --
35 my ($gfexp, $lfile, $kfile, $tfile1, $tfile2)=@ARGV;
36 die($usage) unless $tfile2;
37 # 0 1 2 3 4 5 6
38 my %locdiff; # XLOC_id => [chr, c_start, c_end, fpkm_1, fpkm_2, p-value, is_significant]
39 # cuffdiff loci data
40
41 # 0 1 2 3 4 5 6
42 my %loci; # locus_id => [chr, strand, start, end, [refs], [@ts1], [@ts2]]
43 my %t1; # t_id => [$loc, ref, code, FPKM, cov, covlen, $exd]
44 my %t2; # t_id => [$loc, ref, code, FPKM, cov, covlen, $exd]
45 # 0 1 2 3 4 5 6
46 open(DFILE, $gfexp) || die("Error opening file $gfexp\n");
47
48 while (<DFILE>) {
49 next unless m/^XLOC/;
50 chomp;
51 my @t=split(/\t/);
52 #next unless $t[-1] eq 'yes';
53 my $is_significant = ($t[-1] eq 'yes');
54 my ($chr, $cstart, $cend)=($t[2]=~m/^([^\:]+):(\d+)\-(\d+)/);
55 ($cstart, $cend)=($cend,$cstart) if $cend<$cstart;
56 die("Error: no chromosome coordinates parsed ($_)!\n") unless $cstart && $cend;
57 $locdiff{$t[0]}=[$chr, $cstart, $cend, $t[6], $t[7], $t[10], $is_significant];
58 }
59 close(DFILE);
60 die("Error: no gene diff data loaded from $lfile\n") unless keys(%locdiff)>0;
61 print STDERR "Loading $gfexp file..\n";
62
63 open(LFILE, $lfile) || die("Error opening file $lfile!\n");
64 print STDERR "Loading loci file..\n";
65 while (<LFILE>) {
66 chomp;
67 my @t=split(/\t/);
68 my ($chr, $strand, $cstart, $cend)=
69 ($t[1]=~m/^([^\[]+)\[([\+\-\.])\](\d+)\-(\d+)/);
70 my @refs=split(/\,/,$t[2]) if $t[2] ne '-';
71 my @ts1=split(/\,/,$t[3]) if $t[3] ne '-';
72 my @ts2=split(/\,/,$t[4]) if $t[4] ne '-';
73 $loci{$t[0]}=[$chr, $strand, $cstart, $cend, [@refs], [@ts1], [@ts2]];
74 }
75 close(LFILE);
76 print STDERR "Loading tracking file..\n";
77 open(KFILE, $kfile) || die("Error opening file $kfile!\n");
78 while (<KFILE>) {
79 chomp;
80 my @t=split(/\t/);
81 #next unless ($t[2] eq '-' && $t[3] ne 'r') || $t[3]=~tr/.p\-ucj=eoi//;
82 next if $t[3]=~tr/xs//; # tr/xsr//
83 $t[2]='' if $t[2]=~m/^\s*\-\s*$/; # reference transcript
84 unless ($t[4] eq '-') {
85 my @q=split(/\|/,$t[4]);
86 $t1{$q[1]}=[$t[1], $t[2], $t[3], $q[3], $q[6], 0, []];
87 }
88 unless ($t[5] eq '-') {
89 my @q=split(/\|/,$t[5]);
90 $t2{$q[1]}=[$t[1], $t[2], $t[3], $q[3], $q[6], 0, []];
91 }
92 if ($t[2]) {
93 # update ref list in loci as sometimes it misses some o: and i: genes (!)
94 my $ld=$loci{$t[1]};
95 die("Error: locus $t[1] not found in the loci file but found in tracking file!\n") unless $ld;
96 my %g;
97 if (@{$ld->[4]}>0) {
98 @g{@{$ld->[4]}}=();
99 }
100 $g{$t[2]}=1;
101 $ld->[4]=[(keys(%g))];
102 }
103 }
104 close(KFILE);
105
106 print STDERR "Loading transcript file 1 ..\n";
107 open(TFILE, $tfile1) || die("Error opening file $tfile1!\n");
108 my $t; #current transcript
109 while (<TFILE>) {
110 chomp;
111 if (m/^>(\S+)\s+([^:]+):(\d+)\.\.(\d+)/) {
112 my ($tid, $chr, $cstart, $cend)=($1,$2,$3,$4);
113 my $td=$t1{$tid};
114 next unless $td;
115 my ($covlen, $numexons, @ex);
116 while (<TFILE>) {
117 chomp;
118 if (m/^\d+\-/) {
119 @ex=map { [split(/\-/)] } (split(/\,/));
120 $numexons=scalar(@ex);
121 map { $covlen+=$_->[1]-$_->[0]+1 } @ex;
122 next;
123 }
124 if (m/^i:/) {
125 my ($fpkm)=(m/FPKM=([^;]+)/);
126 my ($cov)=(m/cov=([^;]+)/);
127 ($$td[3], $$td[4], $$td[5], $$td[6])=
128 ($fpkm, $cov, $covlen, [@ex]);
129 # -- debug:
130 #print STDERR "..saving ".scalar(@ex)." exons \n";
131 #my ($loc, $ref, $code, $fpk, $cc, $covl, $x)=@$td;
132 #my $numex=scalar(@$x);
133 #die "$tid: $loc $ref\[$code\] cov=$cc covlen=$covl numexons=$numex\n";
134 #--
135 last;
136 }
137 } #while record body
138 } #header found
139 }
140 close(TFILE);
141 print STDERR "Loading transcript file 2 ..\n";
142 open(TFILE, $tfile2) || die("Error opening file $tfile2!\n");
143 while (<TFILE>) {
144 if (m/^>(\S+)\s+([^:]+):(\d+)\.\.(\d+)/) {
145 my ($tid, $chr, $cstart, $cend)=($1,$2,$3,$4);
146 my $td=$t2{$tid};
147 next unless $td;
148 my ($covlen, $numexons, @ex);
149 while (<TFILE>) {
150 chomp;
151 if (m/^\d+\-/) {
152 @ex=map { [split(/\-/)] } (split(/\,/));
153 $numexons=scalar(@ex);
154 map { $covlen+=$_->[1]-$_[0]+1 } @ex;
155 next;
156 }
157 if (m/^i:/) {
158 my ($fpkm)=(m/FPKM=([^;]+)/);
159 my ($cov)=(m/cov=([^;]+)/);
160 ($$td[3], $$td[4], $$td[5], $$td[6])=
161 ($fpkm, $cov, $covlen, [@ex]);
162 last;
163 }
164 } #while record body
165 } #header found
166 }
167 close(TFILE);
168 print STDERR "Now compute coverage and print results.\n";
169 my $maxcoverage=0;
170 my $max_fpkm=0;
171 my ($maxcov_loc, $maxfpkm_loc);
172 # now finally check each locus
173 while (my ($l,$ld)=each(%loci)) {
174 my ($chr, $strand, $cstart, $cend, $rrefs, $rt1, $rt2)=@$ld;
175 my @refs=@$rrefs;
176 # check if this "prediction" is confirmed by cuffdiff
177 my $tlist1 = (@$rt1>0) ? join(',',@$rt1) : '-';
178 my $tlist2 = (@$rt2>0) ? join(',',@$rt2) : '-';
179 #merge all exons of @ts1
180 #my $debug=("$chr:$cstart-$cend" eq 'chr3:41288638-41289701');
181 my ($mcov1, $mcovlen1, $maxcovlen1, $maxexons1, $maxfpkm1, $dcode1)=getAvgCov($rt1, \%t1);
182 my ($mcov2, $mcovlen2, $maxcovlen2, $maxexons2, $maxfpkm2, $dcode2)=getAvgCov($rt2, \%t2);
183 ($maxcoverage, $maxcov_loc)=($mcov1,$l) if $mcov1>$maxcoverage;
184 ($maxcoverage, $maxcov_loc)=($mcov2,$l) if $mcov2>$maxcoverage;
185 ($max_fpkm, $maxfpkm_loc)=($maxfpkm1,$l) if $maxfpkm1>$max_fpkm;
186 ($max_fpkm, $maxfpkm_loc)=($maxfpkm2,$l) if $maxfpkm2>$max_fpkm;
187 next if ($maxcovlen1==0 && $maxcovlen2==0);
188 my $dcode=$dcode1.$dcode2;
189 my $refcode;
190 if (length($dcode)==2 && $dcode1 eq $dcode2 && $dcode1 ne '.') {
191 $refcode=$dcode1.':'; # prefix for reference column
192 }
193 elsif (length($dcode)==1 && $dcode ne '.') {
194 $refcode=$dcode.':';
195 }
196 #print STDERR "$l has uniform refcode: $refcode\n" if $refcode;
197 my $reflist = (@refs>0) ? join(',',@refs) : '-';
198 #-- discard single exon "novel" genes for now
199 # (though they could be some relevant non-coding RNAs..)
200 next if ($reflist eq '-' || $strand eq '.') && $maxexons1<2 && $maxexons2<2;
201 my $status;
202 my $pnovel=($reflist eq '-' || $refcode eq 'i:' || $refcode eq 'p:'); # has potential to be a novel gene
203 my $isnovel=0;
204 #my ($fpkm1, $fpkm2, $pval)=('-','-','-');
205 #if ($pnovel && ($mcov1+$mcov2>=$MIN_COV)) {
206 # $status='novel';
207 # $isnovel=1;
208 # }
209
210 # if not "novel", discard it unless
211 # cuffdiff found a significant change in expression level
212 # in a region that is consistent with this locus
213 my $diffd=$locdiff{$l};
214 my ($dchr, $dstart, $dend, $fpkm1, $fpkm2, $pval, $is_significant);
215 unless ($diffd) {
216 #print STDERR "Warning: $l not found in $gfexp file, skipping..\n";
217 #next;
218 ($fpkm1, $fpkm2)=($maxfpkm1, $maxfpkm2); # may not be reliable..
219 next if ($fpkm1+$fpkm2<$MIN_FPKM && $mcov1+$mcov2<$MIN_COV);
220 if ($pnovel) {
221 $status='novel';
222 $isnovel=1;
223 }
224 }
225 else {
226 #next unless ($isnovel || $diffd);
227 #if ($diffd) {
228 ($dchr, $dstart, $dend, $fpkm1, $fpkm2, $pval, $is_significant)=@$diffd;
229 #($fpkm1, $fpkm2, $pval)=($v1,$v2, $dp);
230 if ($chr ne $dchr) {
231 print STDERR "Warning: $l not found on the same chromosome in $gfexp file?\n";
232 next;
233 }
234 if ($pnovel && ($is_significant || ($fpkm1+$fpkm2>=$MIN_FPKM && $mcov1+$mcov2>=$MIN_COV))) {
235 $status='novel';
236 $isnovel=1;
237 }
238 next unless ($isnovel || $is_significant);
239 my $loclen=($cend-$cstart+1);
240 my $dloclen=($dend-$dstart+1);
241 # check if the quantification interval can be trusted
242 if (!$isnovel && ($dloclen-$loclen>400 || $dend-$cend>200 || $cstart-$dstart>200)) {
243 # bad quantification interval
244 $is_significant=0;
245 #next if ($dloclen-$loclen>400 || $dend-$cend>200 || $cstart-$dstart>200);
246 }
247 }
248 my $libtag=$isnovel ? '-novel' : '-specific';
249 if ($tlist1 eq '-' && $fpkm1==0 && ($is_significant || ($fpkm2>=$MIN_FPKM && $mcov2>=$MIN_COV))) {
250 if ($bamfiles && !bamHasOvl($chr, $cstart, $cend, $rt2, \%t2, $bam[0])) {
251 $status=$labels[1].$libtag;
252 }
253 }
254 elsif ($tlist2 eq '-' && $fpkm2==0 && ($is_significant || ($fpkm1>=$MIN_FPKM && $mcov1>=$MIN_COV))) {
255 if ($bamfiles && !bamHasOvl($chr, $cstart, $cend, $rt1, \%t1, $bam[1])) {
256 $status=$labels[0].$libtag;
257 }
258 }
259 unless ($status) {
260 next if $refcode eq 'o:'; #discard dubious overlaps
261 if ($mcov1+$mcov2>0 && $fpkm1+$fpkm2>0) {
262 if ($is_significant) {
263 $status = ($fpkm1>$fpkm2) ? $labels[0].'-inc_cdiff' : $labels[1].'-inc_cdiff';
264 }
265 #else {
266 my $covratio=signedRatio($mcov1, $mcov2); #decrease ratio
267 my $covlenratio=signedRatio($mcovlen1,$mcovlen2); #decrease ratio, negative if there is an increase
268 if (($covlenratio>=2 && $covratio>=3) || (abs($covlenratio)<2 && $covratio>4)) {
269 updStat(\$status, $labels[0],'inc_cov') if $fpkm1>$fpkm2;
270 }
271 elsif (($covlenratio<=-2 && $covratio<=-3) || (abs($covlenratio)<2 && $covratio<-4)) {
272 updStat(\$status, $labels[1],'inc_cov') if $fpkm2>$fpkm1;
273 }
274 elsif (abs($covlenratio)<2) {
275 if ($fpkm1>2*$fpkm2 && $mcov1>$mcov2*2) { updStat(\$status, $labels[0],'inc_cov') }
276 elsif ($fpkm2>2*$fpkm1 && $mcov2>$mcov1*2) { updStat(\$status, $labels[1],'inc_cov')}
277 }
278 # }
279 }
280 }
281 next unless $status;
282
283 print join("\t",$chr, $strand, $cstart, $cend, "$chr\:$cstart-$cend", $l, $status,
284 $refcode.$reflist, join('|', $mcov1, $maxexons1, $maxfpkm1, $tlist1),
285 join('|', $mcov2, $maxexons2, $maxfpkm2, $tlist2), $fpkm1, $fpkm2, $pval)."\n";
286 } # for each locus
287 print STDERR "Max. coverage found: $maxcoverage for locus $maxcov_loc\n";
288 print STDERR "Max. FPKM found: $max_fpkm for locus $maxfpkm_loc\n";
289 # --
290 if ($outfile) {
291 select(STDOUT);
292 close(OUTF);
293 }
294
295 #************ Subroutines **************
296 sub updStat {
297 my ($rstatus, $lib, $tag)=@_;
298 if (length($$rstatus)==0) {
299 $$rstatus=$lib.'-'.$tag;
300 }
301 else {
302 $$rstatus.='-'.$tag;
303 }
304 }
305
306 sub getAvgCov {
307 my ($tlst, $th)=@_;
308 my $maxcovlen=0;
309 my $tcov=0;
310 my $tcovlen=0;
311 my $tcount=0;
312 my $maxexons=0;
313 my %codes;
314 my $domcode='';
315 my $maxcov=0;
316 my $maxfpkm=0;
317 foreach my $t (@$tlst) {
318 my $thd=$th->{$t};
319 next unless ($thd);
320 my ($loc, $ref, $code, $fpkm, $cov, $covlen, $ex)=@$thd;
321 $domcode='.' if $ref;
322 $codes{$code}++ unless $code=~tr/u.\-r//;
323 my $numexons=scalar(@$ex);
324 #die "$t: $loc $ref\[$code\] covlen=$covlen, exons=$numexons\n";
325
326 $maxcovlen=$covlen if ($maxcovlen<$covlen);
327 $maxexons=$numexons if $maxexons<$numexons;
328 $tcovlen+=$covlen;
329 $tcount++;
330 $maxcov=$cov if ($cov>$maxcov);
331 $maxfpkm=$fpkm if $fpkm>$maxfpkm;
332 $tcov+=$cov;
333 }
334 my @classes=keys(%codes);
335 $domcode=$classes[0] if (@classes==1);
336 #my ($mcov, $mcovlen)=($tcount==0) ? ( 0, 0 ) :
337 # (sprintf('%.3f', $tcov/$tcount), sprintf('%d', $tcovlen/$tcount));
338 my ($mcov, $mcovlen)=($tcount==0) ? ( 0, 0 ) : (sprintf('%.2f',$maxcov), $maxcovlen);
339 return ($mcov, $mcovlen, $maxcovlen, $maxexons, sprintf('%.2f',$maxfpkm), $domcode);
340 }
341
342 sub signedRatio {
343 # decrease ratio from $a to $b (negative if there is an increase)
344 my ($a,$b)=@_;
345 if ($a>$b) { return ($b==0)?1000 : $a/$b }
346 elsif ($a==$b) {
347 return ($a==0) ? 0 : 1
348 }
349 else { return ($a==0) ? -1000 : -$b/$a }
350 }
351
352 sub bamHasOvl {
353 my ($chr, $cstart, $cend, $tlst, $th, $fbam)=@_;
354
355 my $range=$chr.':'.$cstart.'-'.$cend;
356 #my $debug=($range eq 'chr3:41288638-41289701');
357 my @exons;
358 foreach my $t (@$tlst) {
359 my $thd=$th->{$t};
360 next unless $thd;
361 my ($loc, $ref, $code, $fpkm, $cov, $covlen, $ex)=@$thd;
362 push(@exons, @$ex);
363 } #gather exons from all transcripts in this locus
364 return 0 unless @exons>0;
365 my @sorted_exons= sort { $a->[0]<=>$b->[0] } @exons;
366 open(SAMPIPE, "samtools view $fbam $range |") || die ("Error opening samtools pipe ($!)\n");
367 while(<SAMPIPE>) {
368 my $samline=$_;
369 chomp;
370 my ($qname, $flags, $gseq, $pos, $mapq, $cigar, $rnext, $pnext, $tlen, $seq, $qual, @extra)=
371 split(/\t/);
372 # if ($strand) {
373 # my $mstrand= (($flags & 0x10)==0) ? '+' : '-';
374 # next if $mstrand ne $strand;
375 # }
376 #now extract the CIGAR segments
377 my @cigdata=($cigar=~m/(\d+[A-Z,=])/g);
378 my ($mstart, $mend);
379 my $hasOvl=0;
380 my $curpos=$pos;
381 $mstart=$pos;
382 foreach my $cd (@cigdata) {
383 my $code=chop($cd);
384 if ($code eq 'N') { #gap
385 #process previous interval
386 if ($mend && checkOverlap($mstart, $mend, \@sorted_exons)) {
387 $hasOvl=1;
388 last;
389 }
390 $mstart=$curpos+$cd;
391 $mend=undef;
392 next;
393 }
394 $mend=$curpos+$cd-1;
395 $curpos+=$cd;
396 }
397 unless ($hasOvl) { #check the last interval
398 if ($mend && checkOverlap($mstart, $mend, \@sorted_exons)) {
399 $hasOvl=1;
400 }
401 }
402 if ($hasOvl) {
403 close(SAMPIPE);
404 return 1;
405 }
406 } # while <SAMPIPE>
407 close(SAMPIPE);
408 return 0;
409 }
410
411
412
413 sub checkOverlap {
414 my ($a, $b, $rx)=@_;
415 return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain
416 foreach my $x (@$rx) {
417 #return (start<=d->end && end>=d->start)
418 return 1 if ($a<=$$x[1] && $b>=$$x[0]);
419 return 0 if $b<$$x[0]; #@$rx is sorted
420 }
421 }

Properties

Name Value
svn:executable *