gclib/scripts/miRNA_cmp.pl

#!/usr/bin/perl
use strict;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

my $usage = q/Usage:
 miRNA_cmp.pl [-o <output.tab>] [-d diffexp.tab] \
   <sample1_cov.micov>[:<totalreads1>] <sample2_cov.micov>[:<totalreads2>]
 
 Compares the expression levels of known miRNAs in two experiments
 based on the the coverage files produced by mgbl_miRNA_cov.pl
 If -d option is given, that file will have entries with a significant 
 p-value according to Fisher's exact test with Bonferroni correction.
 
 If total reads counts are not given, a "context total" is computed by 
 summing up existing counts (coverage values) in each sample
/;
umask 0002;
print STDERR "#Command line was:\n".$FindBin::Script.
              ' '.join(' ',@ARGV)."\n";
getopts('d:o:') || die($usage."\n");
die($usage."\n") unless @ARGV==2;
my $outfile=$Getopt::Std::opt_o;
if ($outfile) {
  open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
  select(OUTF);
  }
# --
my $dfile=$Getopt::Std::opt_d;
if ($dfile) {
  open(FDIFF, '>'.$dfile) || die("Error creating output file $dfile\n");
  }
my ($fd1, $fd2)=@ARGV;
my ($f1, $total1)=split(/[\:\,~]+/, $fd1);
die("$usage File $f1 not found!\n") unless -f $f1;
#die("$usage Total reads incorrect!\n") unless $total1>10;
my ($f2, $total2)=split(/[\:\,~]+/, $fd2);
die("$usage File $f2 not found!\n") unless -f $f2;
#die("$usage Total reads incorrect!\n") unless $total2>10;
if ($total1>0 && $total2>0) {
print STDERR "Total reads for $f1: $total1\n";
print STDERR "Total reads for $f2: $total2\n";
}
# -- Fisher's exact test - utility constants
my $PI=atan2(0,-1);
my $twoPI=2*$PI;
my $pi_3=$PI/3;
my @lnfcache=(0,0);
#---

my %mi; # miRNA => [ cov_sample_1, cov_sample_2 ]
open(FIN, $f1) || die("Error opening file $f1\n");
while (<FIN>) {
  chomp;
  my @t=split(/\t/);
  #            n1    n2  foldchange  normalized  Fisher p
  $t[0]=lc($t[0]);
  $mi{$t[0]}=[$t[2], 0,      0,       0,      0];
  }
close(FIN);

open(FIN, $f2) || die("Error opening file $f2\n");
while(<FIN>) {
  chomp;
  my @t=split(/\t/);
  $t[0]=lc($t[0]);
  my $d=$mi{$t[0]};
  if ($d) {
     $$d[1]=$t[2];
     }
    else {
     $mi{$t[0]}=[0, $t[2], 0, 0];
     }
  }
close(FIN);
#print '#'.join("\t", 'microRNA', 'count1', 'count2', 'fold_change', 'nfold_change', 'FET_p')."\n";
if ($total1<10 && $total2<10) { #compute contextual totals here
 $total1=0;
 $total2=0;
 while (my ($mirna, $md)=each(%mi)) {
   #      0   1      2          3         4       
   my ($n1, $n2, $fchange, $nfchange, $fisher)=@$md;
   next if $n1<2 && $n2<2;
   $total1+=$n1;
   $total2+=$n2;
   }
 print STDERR "Total reads given for sample 1: $total1\n";
 print STDERR "Total reads given for sample 2: $total2\n";
 } # compute totals if not given
my $num_miRNAs=keys(%mi);
print STDERR "$num_miRNAs total miRNAs seen.\n";
my $p_corrected=0.05/$num_miRNAs;
print STDERR "Bonferroni-corrected p-value is 0.05/$num_miRNAs = $p_corrected\n";
print join("\t", '#totals:', 'placebo|'.$total1, 'statin|'.$total2, $p_corrected)."\n";
my $numexpr;
my $numdiffp; # count of miRNAs with p<$p_corrected
my $numdiffp0; # count of miRNAs with p<=0.001
my $numdiff; # count of miRNAs with p<=0.001 AND abs(foldchange)>1.5
my $t_ratio = $total1/$total2;

while (my ($mirna, $md)=each(%mi)) {
#      0   1      2          3         4       
 my ($n1, $n2, $fchange, $nfchange, $fisher)=@$md;
 next if $n1<2 && $n2<2;
 my ($n1real, $n2real)=(int($n1),int($n2));
 $n1=1 if $n1==0;
 $n2=1 if $n2==0;
 my ($nn1, $nn2)=($n1, $n2*$t_ratio);
 $fchange =($n1<$n2) ? '+'.sprintf('%.2fx',$n2/$n1) : '-'.sprintf('%.2fx', $n1/$n2);
 $nfchange = ($nn1<$nn2) ? '+'.sprintf('%.2fx',$nn2/$nn1) : '-'.sprintf('%.2fx', $nn1/$nn2);
 $$md[2]=$fchange;
 $$md[3]=$nfchange;
 my $nfc=$nfchange;
 $nfc=~tr/x+-//d;
 my ($a, $b, $c, $d)=($n1, $total1-$n1, $n2, $total2-$n2);
 $fisher=calcFET($a, $b, $c, $d);
 $$md[4]=$fisher;
 $numexpr++;
 my $diffprinted=0;
 #print join("\t", $mirna, $n1, $n2, $fchange, $nfchange, $fisher, $nfc)."\n";
 
 print join("\t", $mirna, $n1real, $n2real, $nfchange, $fisher)."\n";
 if ($fisher<=0.001 && $nfc>=1.5) {
  print FDIFF join("\t", $mirna, $n1real, $n2real, $fchange, $nfchange, $fisher, $nfc)."\n";
  $diffprinted=1;
  $numdiff++;
  }
 $numdiffp0++ if ($fisher<=0.001);
 if ($fisher<=$p_corrected) {
     $numdiffp++;
     print FDIFF join("\t", $mirna, $n1real, $n2real, $fchange, $nfchange, $fisher, $nfc)."\n"
       if ($dfile && $diffprinted==0);
     }
}

print STDERR "$numexpr miRNAs with read count>=2 in either sample.\n";
print STDERR "$numdiffp0 with FET p-value<=0.001\n";
print STDERR "$numdiffp with FET p-value<=$p_corrected\n";
print STDERR "$numdiff with nfold change >1.5 and p-value<=0.001\n";

# --
if ($outfile) {
 select(STDOUT);
 close(OUTF);
 }
if ($dfile) {
 close(FDIFF);
 }

sub LnFactorial {
 my $n=shift;
 die "Bad args to log_factorial: $n" if $n < 0;
 my $ln_fact;
 if ($n<900) {
    return $lnfcache[$n] if defined $lnfcache[$n];
    for (my $i = scalar(@lnfcache); $i <= $n; $i++) {
       $lnfcache[$i] = $lnfcache[$i-1] + log($i);
       }
    return $lnfcache[$n];
    }
   else { 
    # Gosper's approximation; from
    # http://mathworld.wolfram.com/StirlingsApproximation.html
    $ln_fact = 0.5 * log($twoPI*$n + $pi_3) + $n*log($n) - $n;
    }
 return $ln_fact;
 }


# Compute the probability of getting this exact table
# using the hypergeometric distribution
sub ProbOneTable {
  my ($a , $b , $c, $d) = @_;
  my $n = $a + $b + $c + $d;
  my $LnNumerator     = LnFactorial($a+$b)+
                        LnFactorial($c+$d)+
                        LnFactorial($a+$c)+
                        LnFactorial($b+$d);

  my $LnDenominator   = LnFactorial($a) +
                        LnFactorial($b) +
                        LnFactorial($c) +
                        LnFactorial($d) +
                        LnFactorial($n);

  my $LnP = $LnNumerator - $LnDenominator;
  return exp($LnP);
}

# Compute the cumulative probability by adding up individual
# probabilities
sub calcFET {
  my ($a, $b, $c, $d) = @_;

  my $min;

  my $n = $a + $b + $c + $d;

  my $p = 0;
  $p += ProbOneTable($a, $b, $c, $d);
  if( ($a * $d) >= ($b * $c) ) {
    $min = ($c < $b) ? $c : $b;
    for(my $i = 0; $i < $min; $i++) {
      $p += ProbOneTable(++$a, --$b, --$c, ++$d);
    }
  }

  if ( ($a * $d) < ($b * $c) ) {
    $min = ($a < $d) ? $a : $d;
    for(my $i = 0; $i < $min; $i++) {
      $p += ProbOneTable(--$a, ++$b, ++$c, --$d);
    }
  }
  return $p;
}
Revision:	24
Committed:	Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size:	6487 byte(s)
Log Message:
Line	File contents
1	#!/usr/bin/perl
2	use strict;
3	use Getopt::Std;
4	use FindBin;use lib $FindBin::Bin;
5
6	my $usage = q/Usage:
7	miRNA_cmp.pl [-o <output.tab>] [-d diffexp.tab] \
8	<sample1_cov.micov>[:<totalreads1>] <sample2_cov.micov>[:<totalreads2>]
9
10	Compares the expression levels of known miRNAs in two experiments
11	based on the the coverage files produced by mgbl_miRNA_cov.pl
12	If -d option is given, that file will have entries with a significant
13	p-value according to Fisher's exact test with Bonferroni correction.
14
15	If total reads counts are not given, a "context total" is computed by
16	summing up existing counts (coverage values) in each sample
17	/;
18	umask 0002;
19	print STDERR "#Command line was:\n".$FindBin::Script.
20	' '.join(' ',@ARGV)."\n";
21	getopts('d:o:') \|\| die($usage."\n");
22	die($usage."\n") unless @ARGV==2;
23	my $outfile=$Getopt::Std::opt_o;
24	if ($outfile) {
25	open(OUTF, '>'.$outfile) \|\| die("Error creating output file $outfile\n");
26	select(OUTF);
27	}
28	# --
29	my $dfile=$Getopt::Std::opt_d;
30	if ($dfile) {
31	open(FDIFF, '>'.$dfile) \|\| die("Error creating output file $dfile\n");
32	}
33	my ($fd1, $fd2)=@ARGV;
34	my ($f1, $total1)=split(/[\:\,~]+/, $fd1);
35	die("$usage File $f1 not found!\n") unless -f $f1;
36	#die("$usage Total reads incorrect!\n") unless $total1>10;
37	my ($f2, $total2)=split(/[\:\,~]+/, $fd2);
38	die("$usage File $f2 not found!\n") unless -f $f2;
39	#die("$usage Total reads incorrect!\n") unless $total2>10;
40	if ($total1>0 && $total2>0) {
41	print STDERR "Total reads for $f1: $total1\n";
42	print STDERR "Total reads for $f2: $total2\n";
43	}
44	# -- Fisher's exact test - utility constants
45	my $PI=atan2(0,-1);
46	my $twoPI=2*$PI;
47	my $pi_3=$PI/3;
48	my @lnfcache=(0,0);
49	#---
50
51	my %mi; # miRNA => [ cov_sample_1, cov_sample_2 ]
52	open(FIN, $f1) \|\| die("Error opening file $f1\n");
53	while (<FIN>) {
54	chomp;
55	my @t=split(/\t/);
56	# n1 n2 foldchange normalized Fisher p
57	$t[0]=lc($t[0]);
58	$mi{$t[0]}=[$t[2], 0, 0, 0, 0];
59	}
60	close(FIN);
61
62	open(FIN, $f2) \|\| die("Error opening file $f2\n");
63	while(<FIN>) {
64	chomp;
65	my @t=split(/\t/);
66	$t[0]=lc($t[0]);
67	my $d=$mi{$t[0]};
68	if ($d) {
69	$$d[1]=$t[2];
70	}
71	else {
72	$mi{$t[0]}=[0, $t[2], 0, 0];
73	}
74	}
75	close(FIN);
76	#print '#'.join("\t", 'microRNA', 'count1', 'count2', 'fold_change', 'nfold_change', 'FET_p')."\n";
77	if ($total1<10 && $total2<10) { #compute contextual totals here
78	$total1=0;
79	$total2=0;
80	while (my ($mirna, $md)=each(%mi)) {
81	# 0 1 2 3 4
82	my ($n1, $n2, $fchange, $nfchange, $fisher)=@$md;
83	next if $n1<2 && $n2<2;
84	$total1+=$n1;
85	$total2+=$n2;
86	}
87	print STDERR "Total reads given for sample 1: $total1\n";
88	print STDERR "Total reads given for sample 2: $total2\n";
89	} # compute totals if not given
90	my $num_miRNAs=keys(%mi);
91	print STDERR "$num_miRNAs total miRNAs seen.\n";
92	my $p_corrected=0.05/$num_miRNAs;
93	print STDERR "Bonferroni-corrected p-value is 0.05/$num_miRNAs = $p_corrected\n";
94	print join("\t", '#totals:', 'placebo\|'.$total1, 'statin\|'.$total2, $p_corrected)."\n";
95	my $numexpr;
96	my $numdiffp; # count of miRNAs with p<$p_corrected
97	my $numdiffp0; # count of miRNAs with p<=0.001
98	my $numdiff; # count of miRNAs with p<=0.001 AND abs(foldchange)>1.5
99	my $t_ratio = $total1/$total2;
100
101	while (my ($mirna, $md)=each(%mi)) {
102	# 0 1 2 3 4
103	my ($n1, $n2, $fchange, $nfchange, $fisher)=@$md;
104	next if $n1<2 && $n2<2;
105	my ($n1real, $n2real)=(int($n1),int($n2));
106	$n1=1 if $n1==0;
107	$n2=1 if $n2==0;
108	my ($nn1, $nn2)=($n1, $n2*$t_ratio);
109	$fchange =($n1<$n2) ? '+'.sprintf('%.2fx',$n2/$n1) : '-'.sprintf('%.2fx', $n1/$n2);
110	$nfchange = ($nn1<$nn2) ? '+'.sprintf('%.2fx',$nn2/$nn1) : '-'.sprintf('%.2fx', $nn1/$nn2);
111	$$md[2]=$fchange;
112	$$md[3]=$nfchange;
113	my $nfc=$nfchange;
114	$nfc=~tr/x+-//d;
115	my ($a, $b, $c, $d)=($n1, $total1-$n1, $n2, $total2-$n2);
116	$fisher=calcFET($a, $b, $c, $d);
117	$$md[4]=$fisher;
118	$numexpr++;
119	my $diffprinted=0;
120	#print join("\t", $mirna, $n1, $n2, $fchange, $nfchange, $fisher, $nfc)."\n";
121
122	print join("\t", $mirna, $n1real, $n2real, $nfchange, $fisher)."\n";
123	if ($fisher<=0.001 && $nfc>=1.5) {
124	print FDIFF join("\t", $mirna, $n1real, $n2real, $fchange, $nfchange, $fisher, $nfc)."\n";
125	$diffprinted=1;
126	$numdiff++;
127	}
128	$numdiffp0++ if ($fisher<=0.001);
129	if ($fisher<=$p_corrected) {
130	$numdiffp++;
131	print FDIFF join("\t", $mirna, $n1real, $n2real, $fchange, $nfchange, $fisher, $nfc)."\n"
132	if ($dfile && $diffprinted==0);
133	}
134	}
135
136	print STDERR "$numexpr miRNAs with read count>=2 in either sample.\n";
137	print STDERR "$numdiffp0 with FET p-value<=0.001\n";
138	print STDERR "$numdiffp with FET p-value<=$p_corrected\n";
139	print STDERR "$numdiff with nfold change >1.5 and p-value<=0.001\n";
140
141	# --
142	if ($outfile) {
143	select(STDOUT);
144	close(OUTF);
145	}
146	if ($dfile) {
147	close(FDIFF);
148	}
149
150	sub LnFactorial {
151	my $n=shift;
152	die "Bad args to log_factorial: $n" if $n < 0;
153	my $ln_fact;
154	if ($n<900) {
155	return $lnfcache[$n] if defined $lnfcache[$n];
156	for (my $i = scalar(@lnfcache); $i <= $n; $i++) {
157	$lnfcache[$i] = $lnfcache[$i-1] + log($i);
158	}
159	return $lnfcache[$n];
160	}
161	else {
162	# Gosper's approximation; from
163	# http://mathworld.wolfram.com/StirlingsApproximation.html
164	$ln_fact = 0.5 * log($twoPI$n + $pi_3) + $nlog($n) - $n;
165	}
166	return $ln_fact;
167	}
168
169
170	# Compute the probability of getting this exact table
171	# using the hypergeometric distribution
172	sub ProbOneTable {
173	my ($a , $b , $c, $d) = @_;
174	my $n = $a + $b + $c + $d;
175	my $LnNumerator = LnFactorial($a+$b)+
176	LnFactorial($c+$d)+
177	LnFactorial($a+$c)+
178	LnFactorial($b+$d);
179
180	my $LnDenominator = LnFactorial($a) +
181	LnFactorial($b) +
182	LnFactorial($c) +
183	LnFactorial($d) +
184	LnFactorial($n);
185
186	my $LnP = $LnNumerator - $LnDenominator;
187	return exp($LnP);
188	}
189
190	# Compute the cumulative probability by adding up individual
191	# probabilities
192	sub calcFET {
193	my ($a, $b, $c, $d) = @_;
194
195	my $min;
196
197	my $n = $a + $b + $c + $d;
198
199	my $p = 0;
200	$p += ProbOneTable($a, $b, $c, $d);
201	if( ($a * $d) >= ($b * $c) ) {
202	$min = ($c < $b) ? $c : $b;
203	for(my $i = 0; $i < $min; $i++) {
204	$p += ProbOneTable(++$a, --$b, --$c, ++$d);
205	}
206	}
207
208	if ( ($a * $d) < ($b * $c) ) {
209	$min = ($a < $d) ? $a : $d;
210	for(my $i = 0; $i < $min; $i++) {
211	$p += ProbOneTable(--$a, ++$b, ++$c, --$d);
212	}
213	}
214	return $p;
215	}