gclib/scripts/cuffdiff_loc_check.pl

#!/usr/bin/perl
use strict;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

my $usage = q/Usage:
 cuffdiff_loc_check.pl [-c <min_cov>] [-f <min_fpkm>] [-l <labels>] [-b <bamfiles>] \
   <gene_exp.diff> <cuffcompare.loci> <cuffcompare.tracking> <transfrags1.ifa> <transfrags2.ifa>
 
 (assume that transfrags?.ifa files were created using gff2iit -XTA )
 If BAM files are provided for specificity validation, they must be indexed.
/;
umask 0002;
getopts('b:f:c:l:o:') || die($usage."\n");
my $outfile=$Getopt::Std::opt_o;
my $lnames=$Getopt::Std::opt_l;
my $bamfiles=$Getopt::Std::opt_b;
my $MIN_COV=$Getopt::Std::opt_c || 2;
my $MIN_FPKM=$Getopt::Std::opt_f || 2;
if ($outfile) {
  open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
  select(OUTF);
  }
  
my @labels=split(/[\,\s\;]/,$lnames) if $lnames;
my @bam=split(/[\,\s\;]/,$bamfiles) if $bamfiles;
foreach my $bfile (@bam) {
 die("Error: file $bfile not found!\n") unless -f $bfile;
 die("Error: bam index $bfile.bai not found!\n") unless -f $bfile.'.bai';
 }
unless ($labels[1]) {
  @labels=('Exp1','Exp2');
  }
# --
my ($gfexp, $lfile, $kfile, $tfile1, $tfile2)=@ARGV;
die($usage) unless $tfile2;
#                           0       1      2      3       4       5          6
my %locdiff; # XLOC_id => [chr, c_start, c_end, fpkm_1, fpkm_2, p-value, is_significant]
             # cuffdiff loci data

          #               0      1       2     3    4        5       6
my %loci; # locus_id => [chr, strand, start, end, [refs], [@ts1], [@ts2]]
my %t1; # t_id => [$loc, ref, code, FPKM, cov, covlen, $exd]
my %t2; # t_id => [$loc, ref, code, FPKM, cov, covlen, $exd]
        #           0    1      2    3      4      5     6
open(DFILE, $gfexp) || die("Error opening file $gfexp\n");

while (<DFILE>) {
 next unless m/^XLOC/;
 chomp;
 my @t=split(/\t/);
 #next unless $t[-1] eq 'yes';
 my $is_significant = ($t[-1] eq 'yes');
 my ($chr, $cstart, $cend)=($t[2]=~m/^([^\:]+):(\d+)\-(\d+)/);
 ($cstart, $cend)=($cend,$cstart) if $cend<$cstart;
 die("Error: no chromosome coordinates parsed ($_)!\n") unless $cstart && $cend;
 $locdiff{$t[0]}=[$chr, $cstart, $cend, $t[6], $t[7], $t[10], $is_significant];
 }
close(DFILE);
die("Error: no gene diff data loaded from $lfile\n") unless keys(%locdiff)>0;
print STDERR "Loading $gfexp file..\n";

open(LFILE, $lfile) || die("Error opening file $lfile!\n");
print STDERR "Loading loci file..\n";
while (<LFILE>) {
  chomp;
  my @t=split(/\t/);
  my ($chr, $strand, $cstart, $cend)=
    ($t[1]=~m/^([^\[]+)\[([\+\-\.])\](\d+)\-(\d+)/);
  my @refs=split(/\,/,$t[2]) if $t[2] ne '-';
  my @ts1=split(/\,/,$t[3]) if $t[3] ne '-';
  my @ts2=split(/\,/,$t[4]) if $t[4] ne '-';
  $loci{$t[0]}=[$chr, $strand, $cstart, $cend, [@refs], [@ts1], [@ts2]];
  }
close(LFILE);
print STDERR "Loading tracking file..\n";
open(KFILE, $kfile) || die("Error opening file $kfile!\n");
while (<KFILE>) {
  chomp;
  my @t=split(/\t/);
  #next unless ($t[2] eq '-' && $t[3] ne 'r') || $t[3]=~tr/.p\-ucj=eoi//; 
  next if $t[3]=~tr/xs//; # tr/xsr//
  $t[2]='' if $t[2]=~m/^\s*\-\s*$/; # reference transcript
  unless ($t[4] eq '-') {
     my @q=split(/\|/,$t[4]);
     $t1{$q[1]}=[$t[1], $t[2], $t[3], $q[3], $q[6], 0, []];
     }
  unless ($t[5] eq '-') {
     my @q=split(/\|/,$t[5]);
     $t2{$q[1]}=[$t[1], $t[2], $t[3], $q[3], $q[6], 0, []];
     }
  if ($t[2]) {
    # update ref list in loci as sometimes it misses some o: and i: genes (!)
    my $ld=$loci{$t[1]};
    die("Error: locus $t[1] not found in the loci file but found in tracking file!\n") unless $ld;
    my %g;
    if (@{$ld->[4]}>0) {
       @g{@{$ld->[4]}}=();
       }
    $g{$t[2]}=1;
    $ld->[4]=[(keys(%g))];
    }
  }
close(KFILE);

print STDERR "Loading transcript file 1 ..\n";
open(TFILE, $tfile1) || die("Error opening file $tfile1!\n");
my $t; #current transcript
while (<TFILE>) {
  chomp;
  if (m/^>(\S+)\s+([^:]+):(\d+)\.\.(\d+)/) {
    my ($tid, $chr, $cstart, $cend)=($1,$2,$3,$4);
    my $td=$t1{$tid};
    next unless $td;
    my ($covlen, $numexons, @ex);
    while (<TFILE>) {
      chomp;
      if (m/^\d+\-/) {
        @ex=map { [split(/\-/)] } (split(/\,/));
        $numexons=scalar(@ex);
        map { $covlen+=$_->[1]-$_->[0]+1 } @ex;
        next;
        }
      if (m/^i:/) {
        my ($fpkm)=(m/FPKM=([^;]+)/);
        my ($cov)=(m/cov=([^;]+)/);
        ($$td[3], $$td[4], $$td[5], $$td[6])=
         ($fpkm,   $cov,   $covlen, [@ex]);
        # -- debug: 
        #print STDERR "..saving ".scalar(@ex)." exons \n";
        #my ($loc, $ref, $code, $fpk, $cc, $covl, $x)=@$td;
        #my $numex=scalar(@$x);
        #die "$tid: $loc $ref\[$code\] cov=$cc covlen=$covl numexons=$numex\n";
        #--
        last;
        }
      } #while record body
    } #header found
  }
close(TFILE);
print STDERR "Loading transcript file 2 ..\n";
open(TFILE, $tfile2) || die("Error opening file $tfile2!\n");
while (<TFILE>) {
  if (m/^>(\S+)\s+([^:]+):(\d+)\.\.(\d+)/) {
    my ($tid, $chr, $cstart, $cend)=($1,$2,$3,$4);
    my $td=$t2{$tid};
    next unless $td;
    my ($covlen, $numexons, @ex);
    while (<TFILE>) {
      chomp;
      if (m/^\d+\-/) {
        @ex=map { [split(/\-/)] } (split(/\,/));
        $numexons=scalar(@ex);
        map { $covlen+=$_->[1]-$_[0]+1 } @ex;
        next;
        }
      if (m/^i:/) {
        my ($fpkm)=(m/FPKM=([^;]+)/);
        my ($cov)=(m/cov=([^;]+)/);
        ($$td[3], $$td[4], $$td[5], $$td[6])=
         ($fpkm,   $cov,   $covlen, [@ex]);
        last;
        }
      } #while record body
    } #header found
  }
close(TFILE);
print STDERR "Now compute coverage and print results.\n";
my $maxcoverage=0;
my $max_fpkm=0;
my ($maxcov_loc, $maxfpkm_loc);
# now finally check each locus
while (my ($l,$ld)=each(%loci)) {
   my ($chr, $strand, $cstart, $cend, $rrefs, $rt1, $rt2)=@$ld;
   my @refs=@$rrefs;
   # check if this "prediction" is confirmed by cuffdiff
   my $tlist1 = (@$rt1>0) ? join(',',@$rt1) : '-';
   my $tlist2 = (@$rt2>0) ? join(',',@$rt2) : '-';
   #merge all exons of @ts1
   #my $debug=("$chr:$cstart-$cend" eq 'chr3:41288638-41289701');
   my ($mcov1, $mcovlen1, $maxcovlen1, $maxexons1, $maxfpkm1, $dcode1)=getAvgCov($rt1, \%t1);
   my ($mcov2, $mcovlen2, $maxcovlen2, $maxexons2, $maxfpkm2, $dcode2)=getAvgCov($rt2, \%t2);
   ($maxcoverage, $maxcov_loc)=($mcov1,$l) if $mcov1>$maxcoverage;
   ($maxcoverage, $maxcov_loc)=($mcov2,$l) if $mcov2>$maxcoverage;
   ($max_fpkm, $maxfpkm_loc)=($maxfpkm1,$l) if $maxfpkm1>$max_fpkm;
   ($max_fpkm, $maxfpkm_loc)=($maxfpkm2,$l) if $maxfpkm2>$max_fpkm;
   next if ($maxcovlen1==0 && $maxcovlen2==0);
   my $dcode=$dcode1.$dcode2;
   my $refcode;
   if (length($dcode)==2 && $dcode1 eq $dcode2 && $dcode1 ne '.') {
        $refcode=$dcode1.':'; # prefix for reference column
        }
       elsif (length($dcode)==1 && $dcode ne '.') {
        $refcode=$dcode.':';
        }
   #print STDERR "$l has uniform refcode: $refcode\n" if $refcode;
   my $reflist = (@refs>0) ? join(',',@refs) : '-';
   #-- discard single exon "novel" genes for now 
   #  (though they could be some relevant non-coding RNAs..)
   next if ($reflist eq '-' || $strand eq '.') && $maxexons1<2 && $maxexons2<2;
   my $status;
   my $pnovel=($reflist eq '-' || $refcode eq 'i:' || $refcode eq 'p:'); # has potential to be a novel gene
   my $isnovel=0;
   #my ($fpkm1, $fpkm2, $pval)=('-','-','-');
   #if ($pnovel && ($mcov1+$mcov2>=$MIN_COV)) {
   #   $status='novel';
   #   $isnovel=1;
   #   }

   # if not "novel", discard it unless
   # cuffdiff found a significant change in expression level
   # in a region that is consistent with this locus
   my $diffd=$locdiff{$l};
   my ($dchr, $dstart, $dend, $fpkm1, $fpkm2, $pval, $is_significant);
   unless ($diffd) {
      #print STDERR "Warning: $l not found in $gfexp file, skipping..\n";
      #next;
      ($fpkm1, $fpkm2)=($maxfpkm1, $maxfpkm2); # may not be reliable..
      next if ($fpkm1+$fpkm2<$MIN_FPKM && $mcov1+$mcov2<$MIN_COV);
      if ($pnovel) {
          $status='novel';
          $isnovel=1;
          }
      }
    else {
    #next unless ($isnovel || $diffd); 
    #if ($diffd) {
     ($dchr, $dstart, $dend, $fpkm1, $fpkm2, $pval, $is_significant)=@$diffd;
       #($fpkm1, $fpkm2, $pval)=($v1,$v2, $dp);
     if ($chr ne $dchr) {
         print STDERR "Warning: $l not found on the same chromosome in $gfexp file?\n";
         next;
         }
     if ($pnovel && ($is_significant || ($fpkm1+$fpkm2>=$MIN_FPKM && $mcov1+$mcov2>=$MIN_COV))) {
      $status='novel';
      $isnovel=1;
      }
     next unless ($isnovel || $is_significant);
     my $loclen=($cend-$cstart+1);
     my $dloclen=($dend-$dstart+1);
     # check if the quantification interval can be trusted
     if (!$isnovel && ($dloclen-$loclen>400 || $dend-$cend>200 || $cstart-$dstart>200)) {
         # bad quantification interval
         $is_significant=0;
         #next if ($dloclen-$loclen>400 || $dend-$cend>200 || $cstart-$dstart>200);
         }
     }
   my $libtag=$isnovel ? '-novel' : '-specific';
   if ($tlist1 eq '-' && $fpkm1==0 && ($is_significant || ($fpkm2>=$MIN_FPKM && $mcov2>=$MIN_COV))) {
        if ($bamfiles && !bamHasOvl($chr, $cstart, $cend, $rt2, \%t2, $bam[0])) {
            $status=$labels[1].$libtag;
            }
        }
       elsif ($tlist2 eq '-' && $fpkm2==0 && ($is_significant || ($fpkm1>=$MIN_FPKM && $mcov1>=$MIN_COV))) {
        if ($bamfiles && !bamHasOvl($chr, $cstart, $cend, $rt1, \%t1, $bam[1])) {
           $status=$labels[0].$libtag;
           }
        }
   unless ($status) {
      next if $refcode eq 'o:'; #discard dubious overlaps
      if ($mcov1+$mcov2>0 && $fpkm1+$fpkm2>0) {
        if ($is_significant) {
          $status = ($fpkm1>$fpkm2) ? $labels[0].'-inc_cdiff' : $labels[1].'-inc_cdiff';
          }
        #else {
          my $covratio=signedRatio($mcov1, $mcov2); #decrease ratio
          my $covlenratio=signedRatio($mcovlen1,$mcovlen2); #decrease ratio, negative if there is an increase
          if (($covlenratio>=2 && $covratio>=3) || (abs($covlenratio)<2 && $covratio>4)) {
            updStat(\$status, $labels[0],'inc_cov') if $fpkm1>$fpkm2;
            }
          elsif (($covlenratio<=-2 && $covratio<=-3) || (abs($covlenratio)<2 && $covratio<-4)) {
            updStat(\$status, $labels[1],'inc_cov') if $fpkm2>$fpkm1;
            }
          elsif (abs($covlenratio)<2) {
            if ($fpkm1>2*$fpkm2 && $mcov1>$mcov2*2) { updStat(\$status, $labels[0],'inc_cov') }
             elsif ($fpkm2>2*$fpkm1 && $mcov2>$mcov1*2) { updStat(\$status, $labels[1],'inc_cov')}
            }
        #  }
        }
      }
   next unless $status;
   
   print join("\t",$chr, $strand, $cstart, $cend, "$chr\:$cstart-$cend", $l, $status,
           $refcode.$reflist, join('|', $mcov1, $maxexons1, $maxfpkm1, $tlist1), 
                    join('|', $mcov2, $maxexons2, $maxfpkm2, $tlist2), $fpkm1, $fpkm2, $pval)."\n";
   } # for each locus
print STDERR "Max. coverage found: $maxcoverage for locus $maxcov_loc\n";
print STDERR "Max. FPKM found: $max_fpkm for locus $maxfpkm_loc\n";
# --
if ($outfile) {
 select(STDOUT);
 close(OUTF);
 }

#************ Subroutines **************
sub updStat {
 my ($rstatus, $lib, $tag)=@_;
 if (length($$rstatus)==0) {
    $$rstatus=$lib.'-'.$tag;
    }
   else {
    $$rstatus.='-'.$tag;
    }
}

sub getAvgCov {
 my ($tlst, $th)=@_;
 my $maxcovlen=0;
 my $tcov=0;
 my $tcovlen=0;
 my $tcount=0;
 my $maxexons=0;
 my %codes;
 my $domcode='';
 my $maxcov=0;
 my $maxfpkm=0;
 foreach my $t (@$tlst) {
   my $thd=$th->{$t};
   next unless ($thd);
   my ($loc, $ref, $code, $fpkm, $cov, $covlen, $ex)=@$thd;
   $domcode='.' if $ref;
   $codes{$code}++ unless $code=~tr/u.\-r//;
   my $numexons=scalar(@$ex);
   #die "$t: $loc $ref\[$code\] covlen=$covlen, exons=$numexons\n";
   
   $maxcovlen=$covlen if ($maxcovlen<$covlen);
   $maxexons=$numexons if $maxexons<$numexons;
   $tcovlen+=$covlen;
   $tcount++;
   $maxcov=$cov if ($cov>$maxcov);
   $maxfpkm=$fpkm if $fpkm>$maxfpkm;
   $tcov+=$cov;
   }
 my @classes=keys(%codes);
 $domcode=$classes[0] if (@classes==1);
 #my ($mcov, $mcovlen)=($tcount==0) ? ( 0, 0 ) :
 #       (sprintf('%.3f', $tcov/$tcount), sprintf('%d', $tcovlen/$tcount));
 my ($mcov, $mcovlen)=($tcount==0) ? ( 0, 0 ) : (sprintf('%.2f',$maxcov), $maxcovlen);
 return ($mcov, $mcovlen, $maxcovlen, $maxexons, sprintf('%.2f',$maxfpkm), $domcode);
}

sub signedRatio { 
 # decrease ratio from $a to $b (negative if there is an increase)
 my ($a,$b)=@_;
 if ($a>$b) { return ($b==0)?1000 : $a/$b }
   elsif ($a==$b) {
     return ($a==0) ? 0 : 1
     }
     else { return ($a==0) ? -1000 : -$b/$a }
}

sub bamHasOvl {
 my ($chr, $cstart, $cend, $tlst, $th, $fbam)=@_;
 
 my $range=$chr.':'.$cstart.'-'.$cend;
 #my $debug=($range eq 'chr3:41288638-41289701');
 my @exons;
 foreach my $t (@$tlst) {
   my $thd=$th->{$t};
   next unless $thd;
   my ($loc, $ref, $code, $fpkm, $cov, $covlen, $ex)=@$thd;
   push(@exons, @$ex);
   } #gather exons from all transcripts in this locus
 return 0 unless @exons>0;
 my @sorted_exons= sort { $a->[0]<=>$b->[0] } @exons;
 open(SAMPIPE, "samtools view $fbam $range |") || die ("Error opening samtools pipe ($!)\n");
 while(<SAMPIPE>) {
   my $samline=$_;
   chomp;
   my ($qname, $flags, $gseq, $pos, $mapq, $cigar, $rnext, $pnext, $tlen, $seq, $qual, @extra)=
      split(/\t/);
  #    if ($strand) {
  #       my $mstrand= (($flags & 0x10)==0) ? '+' : '-';
  #       next if $mstrand ne $strand;
  #       }
   #now extract the CIGAR segments
   my @cigdata=($cigar=~m/(\d+[A-Z,=])/g);
   my ($mstart, $mend);
   my $hasOvl=0;
   my $curpos=$pos;
   $mstart=$pos;
   foreach my $cd (@cigdata) {
     my $code=chop($cd);
     if ($code eq 'N') { #gap
        #process previous interval
        if ($mend && checkOverlap($mstart, $mend, \@sorted_exons)) {
           $hasOvl=1;
           last;
           }
        $mstart=$curpos+$cd;
        $mend=undef;
        next;
        }
     $mend=$curpos+$cd-1;
     $curpos+=$cd;
     }
   unless ($hasOvl) { #check the last interval
     if ($mend && checkOverlap($mstart, $mend, \@sorted_exons)) {
         $hasOvl=1;
         }
     }
   if ($hasOvl) {
     close(SAMPIPE);
     return 1;
     }
   } # while <SAMPIPE>
  close(SAMPIPE);
  return 0; 
}


sub checkOverlap {
 my ($a, $b, $rx)=@_;
 return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the whole exon chain
 foreach my $x (@$rx) {
   #return (start<=d->end && end>=d->start)
   return 1 if ($a<=$$x[1] && $b>=$$x[0]);
   return 0 if $b<$$x[0]; #@$rx is sorted
   }
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size:	14591 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use Getopt::Std;
4	use FindBin;use lib $FindBin::Bin;
5
6	my $usage = q/Usage:
7	cuffdiff_loc_check.pl [-c <min_cov>] [-f <min_fpkm>] [-l <labels>] [-b <bamfiles>] \
8	<gene_exp.diff> <cuffcompare.loci> <cuffcompare.tracking> <transfrags1.ifa> <transfrags2.ifa>
9
10	(assume that transfrags?.ifa files were created using gff2iit -XTA )
11	If BAM files are provided for specificity validation, they must be indexed.
12	/;
13	umask 0002;
14	getopts('b:f:c:l:o:') \|\| die($usage."\n");
15	my $outfile=$Getopt::Std::opt_o;
16	my $lnames=$Getopt::Std::opt_l;
17	my $bamfiles=$Getopt::Std::opt_b;
18	my $MIN_COV=$Getopt::Std::opt_c \|\| 2;
19	my $MIN_FPKM=$Getopt::Std::opt_f \|\| 2;
20	if ($outfile) {
21	open(OUTF, '>'.$outfile) \|\| die("Error creating output file $outfile\n");
22	select(OUTF);
23	}
24
25	my @labels=split(/[\,\s\;]/,$lnames) if $lnames;
26	my @bam=split(/[\,\s\;]/,$bamfiles) if $bamfiles;
27	foreach my $bfile (@bam) {
28	die("Error: file $bfile not found!\n") unless -f $bfile;
29	die("Error: bam index $bfile.bai not found!\n") unless -f $bfile.'.bai';
30	}
31	unless ($labels[1]) {
32	@labels=('Exp1','Exp2');
33	}
34	# --
35	my ($gfexp, $lfile, $kfile, $tfile1, $tfile2)=@ARGV;
36	die($usage) unless $tfile2;
37	# 0 1 2 3 4 5 6
38	my %locdiff; # XLOC_id => [chr, c_start, c_end, fpkm_1, fpkm_2, p-value, is_significant]
39	# cuffdiff loci data
40
41	# 0 1 2 3 4 5 6
42	my %loci; # locus_id => [chr, strand, start, end, [refs], [@ts1], [@ts2]]
43	my %t1; # t_id => [$loc, ref, code, FPKM, cov, covlen, $exd]
44	my %t2; # t_id => [$loc, ref, code, FPKM, cov, covlen, $exd]
45	# 0 1 2 3 4 5 6
46	open(DFILE, $gfexp) \|\| die("Error opening file $gfexp\n");
47
48	while (<DFILE>) {
49	next unless m/^XLOC/;
50	chomp;
51	my @t=split(/\t/);
52	#next unless $t[-1] eq 'yes';
53	my $is_significant = ($t[-1] eq 'yes');
54	my ($chr, $cstart, $cend)=($t[2]=~m/^([^\:]+):(\d+)\-(\d+)/);
55	($cstart, $cend)=($cend,$cstart) if $cend<$cstart;
56	die("Error: no chromosome coordinates parsed ($_)!\n") unless $cstart && $cend;
57	$locdiff{$t[0]}=[$chr, $cstart, $cend, $t[6], $t[7], $t[10], $is_significant];
58	}
59	close(DFILE);
60	die("Error: no gene diff data loaded from $lfile\n") unless keys(%locdiff)>0;
61	print STDERR "Loading $gfexp file..\n";
62
63	open(LFILE, $lfile) \|\| die("Error opening file $lfile!\n");
64	print STDERR "Loading loci file..\n";
65	while (<LFILE>) {
66	chomp;
67	my @t=split(/\t/);
68	my ($chr, $strand, $cstart, $cend)=
69	($t[1]=~m/^([^\[]+)\[([\+\-\.])\](\d+)\-(\d+)/);
70	my @refs=split(/\,/,$t[2]) if $t[2] ne '-';
71	my @ts1=split(/\,/,$t[3]) if $t[3] ne '-';
72	my @ts2=split(/\,/,$t[4]) if $t[4] ne '-';
73	$loci{$t[0]}=[$chr, $strand, $cstart, $cend, [@refs], [@ts1], [@ts2]];
74	}
75	close(LFILE);
76	print STDERR "Loading tracking file..\n";
77	open(KFILE, $kfile) \|\| die("Error opening file $kfile!\n");
78	while (<KFILE>) {
79	chomp;
80	my @t=split(/\t/);
81	#next unless ($t[2] eq '-' && $t[3] ne 'r') \|\| $t[3]=~tr/.p\-ucj=eoi//;
82	next if $t[3]=~tr/xs//; # tr/xsr//
83	$t[2]='' if $t[2]=~m/^\s\-\s$/; # reference transcript
84	unless ($t[4] eq '-') {
85	my @q=split(/\\|/,$t[4]);
86	$t1{$q[1]}=[$t[1], $t[2], $t[3], $q[3], $q[6], 0, []];
87	}
88	unless ($t[5] eq '-') {
89	my @q=split(/\\|/,$t[5]);
90	$t2{$q[1]}=[$t[1], $t[2], $t[3], $q[3], $q[6], 0, []];
91	}
92	if ($t[2]) {
93	# update ref list in loci as sometimes it misses some o: and i: genes (!)
94	my $ld=$loci{$t[1]};
95	die("Error: locus $t[1] not found in the loci file but found in tracking file!\n") unless $ld;
96	my %g;
97	if (@{$ld->[4]}>0) {
98	@g{@{$ld->[4]}}=();
99	}
100	$g{$t[2]}=1;
101	$ld->[4]=[(keys(%g))];
102	}
103	}
104	close(KFILE);
105
106	print STDERR "Loading transcript file 1 ..\n";
107	open(TFILE, $tfile1) \|\| die("Error opening file $tfile1!\n");
108	my $t; #current transcript
109	while (<TFILE>) {
110	chomp;
111	if (m/^>(\S+)\s+([^:]+):(\d+)\.\.(\d+)/) {
112	my ($tid, $chr, $cstart, $cend)=($1,$2,$3,$4);
113	my $td=$t1{$tid};
114	next unless $td;
115	my ($covlen, $numexons, @ex);
116	while (<TFILE>) {
117	chomp;
118	if (m/^\d+\-/) {
119	@ex=map { [split(/\-/)] } (split(/\,/));
120	$numexons=scalar(@ex);
121	map { $covlen+=$_->[1]-$_->[0]+1 } @ex;
122	next;
123	}
124	if (m/^i:/) {
125	my ($fpkm)=(m/FPKM=([^;]+)/);
126	my ($cov)=(m/cov=([^;]+)/);
127	($$td[3], $$td[4], $$td[5], $$td[6])=
128	($fpkm, $cov, $covlen, [@ex]);
129	# -- debug:
130	#print STDERR "..saving ".scalar(@ex)." exons \n";
131	#my ($loc, $ref, $code, $fpk, $cc, $covl, $x)=@$td;
132	#my $numex=scalar(@$x);
133	#die "$tid: $loc $ref\[$code\] cov=$cc covlen=$covl numexons=$numex\n";
134	#--
135	last;
136	}
137	} #while record body
138	} #header found
139	}
140	close(TFILE);
141	print STDERR "Loading transcript file 2 ..\n";
142	open(TFILE, $tfile2) \|\| die("Error opening file $tfile2!\n");
143	while (<TFILE>) {
144	if (m/^>(\S+)\s+([^:]+):(\d+)\.\.(\d+)/) {
145	my ($tid, $chr, $cstart, $cend)=($1,$2,$3,$4);
146	my $td=$t2{$tid};
147	next unless $td;
148	my ($covlen, $numexons, @ex);
149	while (<TFILE>) {
150	chomp;
151	if (m/^\d+\-/) {
152	@ex=map { [split(/\-/)] } (split(/\,/));
153	$numexons=scalar(@ex);
154	map { $covlen+=$_->[1]-$_[0]+1 } @ex;
155	next;
156	}
157	if (m/^i:/) {
158	my ($fpkm)=(m/FPKM=([^;]+)/);
159	my ($cov)=(m/cov=([^;]+)/);
160	($$td[3], $$td[4], $$td[5], $$td[6])=
161	($fpkm, $cov, $covlen, [@ex]);
162	last;
163	}
164	} #while record body
165	} #header found
166	}
167	close(TFILE);
168	print STDERR "Now compute coverage and print results.\n";
169	my $maxcoverage=0;
170	my $max_fpkm=0;
171	my ($maxcov_loc, $maxfpkm_loc);
172	# now finally check each locus
173	while (my ($l,$ld)=each(%loci)) {
174	my ($chr, $strand, $cstart, $cend, $rrefs, $rt1, $rt2)=@$ld;
175	my @refs=@$rrefs;
176	# check if this "prediction" is confirmed by cuffdiff
177	my $tlist1 = (@$rt1>0) ? join(',',@$rt1) : '-';
178	my $tlist2 = (@$rt2>0) ? join(',',@$rt2) : '-';
179	#merge all exons of @ts1
180	#my $debug=("$chr:$cstart-$cend" eq 'chr3:41288638-41289701');
181	my ($mcov1, $mcovlen1, $maxcovlen1, $maxexons1, $maxfpkm1, $dcode1)=getAvgCov($rt1, \%t1);
182	my ($mcov2, $mcovlen2, $maxcovlen2, $maxexons2, $maxfpkm2, $dcode2)=getAvgCov($rt2, \%t2);
183	($maxcoverage, $maxcov_loc)=($mcov1,$l) if $mcov1>$maxcoverage;
184	($maxcoverage, $maxcov_loc)=($mcov2,$l) if $mcov2>$maxcoverage;
185	($max_fpkm, $maxfpkm_loc)=($maxfpkm1,$l) if $maxfpkm1>$max_fpkm;
186	($max_fpkm, $maxfpkm_loc)=($maxfpkm2,$l) if $maxfpkm2>$max_fpkm;
187	next if ($maxcovlen1==0 && $maxcovlen2==0);
188	my $dcode=$dcode1.$dcode2;
189	my $refcode;
190	if (length($dcode)==2 && $dcode1 eq $dcode2 && $dcode1 ne '.') {
191	$refcode=$dcode1.':'; # prefix for reference column
192	}
193	elsif (length($dcode)==1 && $dcode ne '.') {
194	$refcode=$dcode.':';
195	}
196	#print STDERR "$l has uniform refcode: $refcode\n" if $refcode;
197	my $reflist = (@refs>0) ? join(',',@refs) : '-';
198	#-- discard single exon "novel" genes for now
199	# (though they could be some relevant non-coding RNAs..)
200	next if ($reflist eq '-' \|\| $strand eq '.') && $maxexons1<2 && $maxexons2<2;
201	my $status;
202	my $pnovel=($reflist eq '-' \|\| $refcode eq 'i:' \|\| $refcode eq 'p:'); # has potential to be a novel gene
203	my $isnovel=0;
204	#my ($fpkm1, $fpkm2, $pval)=('-','-','-');
205	#if ($pnovel && ($mcov1+$mcov2>=$MIN_COV)) {
206	# $status='novel';
207	# $isnovel=1;
208	# }
209
210	# if not "novel", discard it unless
211	# cuffdiff found a significant change in expression level
212	# in a region that is consistent with this locus
213	my $diffd=$locdiff{$l};
214	my ($dchr, $dstart, $dend, $fpkm1, $fpkm2, $pval, $is_significant);
215	unless ($diffd) {
216	#print STDERR "Warning: $l not found in $gfexp file, skipping..\n";
217	#next;
218	($fpkm1, $fpkm2)=($maxfpkm1, $maxfpkm2); # may not be reliable..
219	next if ($fpkm1+$fpkm2<$MIN_FPKM && $mcov1+$mcov2<$MIN_COV);
220	if ($pnovel) {
221	$status='novel';
222	$isnovel=1;
223	}
224	}
225	else {
226	#next unless ($isnovel \|\| $diffd);
227	#if ($diffd) {
228	($dchr, $dstart, $dend, $fpkm1, $fpkm2, $pval, $is_significant)=@$diffd;
229	#($fpkm1, $fpkm2, $pval)=($v1,$v2, $dp);
230	if ($chr ne $dchr) {
231	print STDERR "Warning: $l not found on the same chromosome in $gfexp file?\n";
232	next;
233	}
234	if ($pnovel && ($is_significant \|\| ($fpkm1+$fpkm2>=$MIN_FPKM && $mcov1+$mcov2>=$MIN_COV))) {
235	$status='novel';
236	$isnovel=1;
237	}
238	next unless ($isnovel \|\| $is_significant);
239	my $loclen=($cend-$cstart+1);
240	my $dloclen=($dend-$dstart+1);
241	# check if the quantification interval can be trusted
242	if (!$isnovel && ($dloclen-$loclen>400 \|\| $dend-$cend>200 \|\| $cstart-$dstart>200)) {
243	# bad quantification interval
244	$is_significant=0;
245	#next if ($dloclen-$loclen>400 \|\| $dend-$cend>200 \|\| $cstart-$dstart>200);
246	}
247	}
248	my $libtag=$isnovel ? '-novel' : '-specific';
249	if ($tlist1 eq '-' && $fpkm1==0 && ($is_significant \|\| ($fpkm2>=$MIN_FPKM && $mcov2>=$MIN_COV))) {
250	if ($bamfiles && !bamHasOvl($chr, $cstart, $cend, $rt2, \%t2, $bam[0])) {
251	$status=$labels[1].$libtag;
252	}
253	}
254	elsif ($tlist2 eq '-' && $fpkm2==0 && ($is_significant \|\| ($fpkm1>=$MIN_FPKM && $mcov1>=$MIN_COV))) {
255	if ($bamfiles && !bamHasOvl($chr, $cstart, $cend, $rt1, \%t1, $bam[1])) {
256	$status=$labels[0].$libtag;
257	}
258	}
259	unless ($status) {
260	next if $refcode eq 'o:'; #discard dubious overlaps
261	if ($mcov1+$mcov2>0 && $fpkm1+$fpkm2>0) {
262	if ($is_significant) {
263	$status = ($fpkm1>$fpkm2) ? $labels[0].'-inc_cdiff' : $labels[1].'-inc_cdiff';
264	}
265	#else {
266	my $covratio=signedRatio($mcov1, $mcov2); #decrease ratio
267	my $covlenratio=signedRatio($mcovlen1,$mcovlen2); #decrease ratio, negative if there is an increase
268	if (($covlenratio>=2 && $covratio>=3) \|\| (abs($covlenratio)<2 && $covratio>4)) {
269	updStat(\$status, $labels[0],'inc_cov') if $fpkm1>$fpkm2;
270	}
271	elsif (($covlenratio<=-2 && $covratio<=-3) \|\| (abs($covlenratio)<2 && $covratio<-4)) {
272	updStat(\$status, $labels[1],'inc_cov') if $fpkm2>$fpkm1;
273	}
274	elsif (abs($covlenratio)<2) {
275	if ($fpkm1>2$fpkm2 && $mcov1>$mcov22) { updStat(\$status, $labels[0],'inc_cov') }
276	elsif ($fpkm2>2$fpkm1 && $mcov2>$mcov12) { updStat(\$status, $labels[1],'inc_cov')}
277	}
278	# }
279	}
280	}
281	next unless $status;
282
283	print join("\t",$chr, $strand, $cstart, $cend, "$chr\:$cstart-$cend", $l, $status,
284	$refcode.$reflist, join('\|', $mcov1, $maxexons1, $maxfpkm1, $tlist1),
285	join('\|', $mcov2, $maxexons2, $maxfpkm2, $tlist2), $fpkm1, $fpkm2, $pval)."\n";
286	} # for each locus
287	print STDERR "Max. coverage found: $maxcoverage for locus $maxcov_loc\n";
288	print STDERR "Max. FPKM found: $max_fpkm for locus $maxfpkm_loc\n";
289	# --
290	if ($outfile) {
291	select(STDOUT);
292	close(OUTF);
293	}
294
295	#********** Subroutines ************
296	sub updStat {
297	my ($rstatus, $lib, $tag)=@_;
298	if (length($$rstatus)==0) {
299	$$rstatus=$lib.'-'.$tag;
300	}
301	else {
302	$$rstatus.='-'.$tag;
303	}
304	}
305
306	sub getAvgCov {
307	my ($tlst, $th)=@_;
308	my $maxcovlen=0;
309	my $tcov=0;
310	my $tcovlen=0;
311	my $tcount=0;
312	my $maxexons=0;
313	my %codes;
314	my $domcode='';
315	my $maxcov=0;
316	my $maxfpkm=0;
317	foreach my $t (@$tlst) {
318	my $thd=$th->{$t};
319	next unless ($thd);
320	my ($loc, $ref, $code, $fpkm, $cov, $covlen, $ex)=@$thd;
321	$domcode='.' if $ref;
322	$codes{$code}++ unless $code=~tr/u.\-r//;
323	my $numexons=scalar(@$ex);
324	#die "$t: $loc $ref\[$code\] covlen=$covlen, exons=$numexons\n";
325
326	$maxcovlen=$covlen if ($maxcovlen<$covlen);
327	$maxexons=$numexons if $maxexons<$numexons;
328	$tcovlen+=$covlen;
329	$tcount++;
330	$maxcov=$cov if ($cov>$maxcov);
331	$maxfpkm=$fpkm if $fpkm>$maxfpkm;
332	$tcov+=$cov;
333	}
334	my @classes=keys(%codes);
335	$domcode=$classes[0] if (@classes==1);
336	#my ($mcov, $mcovlen)=($tcount==0) ? ( 0, 0 ) :
337	# (sprintf('%.3f', $tcov/$tcount), sprintf('%d', $tcovlen/$tcount));
338	my ($mcov, $mcovlen)=($tcount==0) ? ( 0, 0 ) : (sprintf('%.2f',$maxcov), $maxcovlen);
339	return ($mcov, $mcovlen, $maxcovlen, $maxexons, sprintf('%.2f',$maxfpkm), $domcode);
340	}
341
342	sub signedRatio {
343	# decrease ratio from $a to $b (negative if there is an increase)
344	my ($a,$b)=@_;
345	if ($a>$b) { return ($b==0)?1000 : $a/$b }
346	elsif ($a==$b) {
347	return ($a==0) ? 0 : 1
348	}
349	else { return ($a==0) ? -1000 : -$b/$a }
350	}
351
352	sub bamHasOvl {
353	my ($chr, $cstart, $cend, $tlst, $th, $fbam)=@_;
354
355	my $range=$chr.':'.$cstart.'-'.$cend;
356	#my $debug=($range eq 'chr3:41288638-41289701');
357	my @exons;
358	foreach my $t (@$tlst) {
359	my $thd=$th->{$t};
360	next unless $thd;
361	my ($loc, $ref, $code, $fpkm, $cov, $covlen, $ex)=@$thd;
362	push(@exons, @$ex);
363	} #gather exons from all transcripts in this locus
364	return 0 unless @exons>0;
365	my @sorted_exons= sort { $a->[0]<=>$b->[0] } @exons;
366	open(SAMPIPE, "samtools view $fbam $range \|") \|\| die ("Error opening samtools pipe ($!)\n");
367	while(<SAMPIPE>) {
368	my $samline=$_;
369	chomp;
370	my ($qname, $flags, $gseq, $pos, $mapq, $cigar, $rnext, $pnext, $tlen, $seq, $qual, @extra)=
371	split(/\t/);
372	# if ($strand) {
373	# my $mstrand= (($flags & 0x10)==0) ? '+' : '-';
374	# next if $mstrand ne $strand;
375	# }
376	#now extract the CIGAR segments
377	my @cigdata=($cigar=~m/(\d+[A-Z,=])/g);
378	my ($mstart, $mend);
379	my $hasOvl=0;
380	my $curpos=$pos;
381	$mstart=$pos;
382	foreach my $cd (@cigdata) {
383	my $code=chop($cd);
384	if ($code eq 'N') { #gap
385	#process previous interval
386	if ($mend && checkOverlap($mstart, $mend, \@sorted_exons)) {
387	$hasOvl=1;
388	last;
389	}
390	$mstart=$curpos+$cd;
391	$mend=undef;
392	next;
393	}
394	$mend=$curpos+$cd-1;
395	$curpos+=$cd;
396	}
397	unless ($hasOvl) { #check the last interval
398	if ($mend && checkOverlap($mstart, $mend, \@sorted_exons)) {
399	$hasOvl=1;
400	}
401	}
402	if ($hasOvl) {
403	close(SAMPIPE);
404	return 1;
405	}
406	} # while <SAMPIPE>
407	close(SAMPIPE);
408	return 0;
409	}
410
411
412
413	sub checkOverlap {
414	my ($a, $b, $rx)=@_;
415	return 0 if ($a>$$rx[-1]->[1] \|\| $b<$$rx[0]->[0]); # not overlapping the whole exon chain
416	foreach my $x (@$rx) {
417	#return (start<=d->end && end>=d->start)
418	return 1 if ($a<=$$x[1] && $b>=$$x[0]);
419	return 0 if $b<$$x[0]; #@$rx is sorted
420	}
421	}