1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
|
6 |
my $usage = q/Usage: |
7 |
miRNA_cmp.pl [-o <output.tab>] [-d diffexp.tab] \ |
8 |
<sample1_cov.micov>[:<totalreads1>] <sample2_cov.micov>[:<totalreads2>] |
9 |
|
10 |
Compares the expression levels of known miRNAs in two experiments |
11 |
based on the the coverage files produced by mgbl_miRNA_cov.pl |
12 |
If -d option is given, that file will have entries with a significant |
13 |
p-value according to Fisher's exact test with Bonferroni correction. |
14 |
|
15 |
If total reads counts are not given, a "context total" is computed by |
16 |
summing up existing counts (coverage values) in each sample |
17 |
/; |
18 |
umask 0002; |
19 |
print STDERR "#Command line was:\n".$FindBin::Script. |
20 |
' '.join(' ',@ARGV)."\n"; |
21 |
getopts('d:o:') || die($usage."\n"); |
22 |
die($usage."\n") unless @ARGV==2; |
23 |
my $outfile=$Getopt::Std::opt_o; |
24 |
if ($outfile) { |
25 |
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n"); |
26 |
select(OUTF); |
27 |
} |
28 |
# -- |
29 |
my $dfile=$Getopt::Std::opt_d; |
30 |
if ($dfile) { |
31 |
open(FDIFF, '>'.$dfile) || die("Error creating output file $dfile\n"); |
32 |
} |
33 |
my ($fd1, $fd2)=@ARGV; |
34 |
my ($f1, $total1)=split(/[\:\,~]+/, $fd1); |
35 |
die("$usage File $f1 not found!\n") unless -f $f1; |
36 |
#die("$usage Total reads incorrect!\n") unless $total1>10; |
37 |
my ($f2, $total2)=split(/[\:\,~]+/, $fd2); |
38 |
die("$usage File $f2 not found!\n") unless -f $f2; |
39 |
#die("$usage Total reads incorrect!\n") unless $total2>10; |
40 |
if ($total1>0 && $total2>0) { |
41 |
print STDERR "Total reads for $f1: $total1\n"; |
42 |
print STDERR "Total reads for $f2: $total2\n"; |
43 |
} |
44 |
# -- Fisher's exact test - utility constants |
45 |
my $PI=atan2(0,-1); |
46 |
my $twoPI=2*$PI; |
47 |
my $pi_3=$PI/3; |
48 |
my @lnfcache=(0,0); |
49 |
#--- |
50 |
|
51 |
my %mi; # miRNA => [ cov_sample_1, cov_sample_2 ] |
52 |
open(FIN, $f1) || die("Error opening file $f1\n"); |
53 |
while (<FIN>) { |
54 |
chomp; |
55 |
my @t=split(/\t/); |
56 |
# n1 n2 foldchange normalized Fisher p |
57 |
$t[0]=lc($t[0]); |
58 |
$mi{$t[0]}=[$t[2], 0, 0, 0, 0]; |
59 |
} |
60 |
close(FIN); |
61 |
|
62 |
open(FIN, $f2) || die("Error opening file $f2\n"); |
63 |
while(<FIN>) { |
64 |
chomp; |
65 |
my @t=split(/\t/); |
66 |
$t[0]=lc($t[0]); |
67 |
my $d=$mi{$t[0]}; |
68 |
if ($d) { |
69 |
$$d[1]=$t[2]; |
70 |
} |
71 |
else { |
72 |
$mi{$t[0]}=[0, $t[2], 0, 0]; |
73 |
} |
74 |
} |
75 |
close(FIN); |
76 |
#print '#'.join("\t", 'microRNA', 'count1', 'count2', 'fold_change', 'nfold_change', 'FET_p')."\n"; |
77 |
if ($total1<10 && $total2<10) { #compute contextual totals here |
78 |
$total1=0; |
79 |
$total2=0; |
80 |
while (my ($mirna, $md)=each(%mi)) { |
81 |
# 0 1 2 3 4 |
82 |
my ($n1, $n2, $fchange, $nfchange, $fisher)=@$md; |
83 |
next if $n1<2 && $n2<2; |
84 |
$total1+=$n1; |
85 |
$total2+=$n2; |
86 |
} |
87 |
print STDERR "Total reads given for sample 1: $total1\n"; |
88 |
print STDERR "Total reads given for sample 2: $total2\n"; |
89 |
} # compute totals if not given |
90 |
my $num_miRNAs=keys(%mi); |
91 |
print STDERR "$num_miRNAs total miRNAs seen.\n"; |
92 |
my $p_corrected=0.05/$num_miRNAs; |
93 |
print STDERR "Bonferroni-corrected p-value is 0.05/$num_miRNAs = $p_corrected\n"; |
94 |
print join("\t", '#totals:', 'placebo|'.$total1, 'statin|'.$total2, $p_corrected)."\n"; |
95 |
my $numexpr; |
96 |
my $numdiffp; # count of miRNAs with p<$p_corrected |
97 |
my $numdiffp0; # count of miRNAs with p<=0.001 |
98 |
my $numdiff; # count of miRNAs with p<=0.001 AND abs(foldchange)>1.5 |
99 |
my $t_ratio = $total1/$total2; |
100 |
|
101 |
while (my ($mirna, $md)=each(%mi)) { |
102 |
# 0 1 2 3 4 |
103 |
my ($n1, $n2, $fchange, $nfchange, $fisher)=@$md; |
104 |
next if $n1<2 && $n2<2; |
105 |
my ($n1real, $n2real)=(int($n1),int($n2)); |
106 |
$n1=1 if $n1==0; |
107 |
$n2=1 if $n2==0; |
108 |
my ($nn1, $nn2)=($n1, $n2*$t_ratio); |
109 |
$fchange =($n1<$n2) ? '+'.sprintf('%.2fx',$n2/$n1) : '-'.sprintf('%.2fx', $n1/$n2); |
110 |
$nfchange = ($nn1<$nn2) ? '+'.sprintf('%.2fx',$nn2/$nn1) : '-'.sprintf('%.2fx', $nn1/$nn2); |
111 |
$$md[2]=$fchange; |
112 |
$$md[3]=$nfchange; |
113 |
my $nfc=$nfchange; |
114 |
$nfc=~tr/x+-//d; |
115 |
my ($a, $b, $c, $d)=($n1, $total1-$n1, $n2, $total2-$n2); |
116 |
$fisher=calcFET($a, $b, $c, $d); |
117 |
$$md[4]=$fisher; |
118 |
$numexpr++; |
119 |
my $diffprinted=0; |
120 |
#print join("\t", $mirna, $n1, $n2, $fchange, $nfchange, $fisher, $nfc)."\n"; |
121 |
|
122 |
print join("\t", $mirna, $n1real, $n2real, $nfchange, $fisher)."\n"; |
123 |
if ($fisher<=0.001 && $nfc>=1.5) { |
124 |
print FDIFF join("\t", $mirna, $n1real, $n2real, $fchange, $nfchange, $fisher, $nfc)."\n"; |
125 |
$diffprinted=1; |
126 |
$numdiff++; |
127 |
} |
128 |
$numdiffp0++ if ($fisher<=0.001); |
129 |
if ($fisher<=$p_corrected) { |
130 |
$numdiffp++; |
131 |
print FDIFF join("\t", $mirna, $n1real, $n2real, $fchange, $nfchange, $fisher, $nfc)."\n" |
132 |
if ($dfile && $diffprinted==0); |
133 |
} |
134 |
} |
135 |
|
136 |
print STDERR "$numexpr miRNAs with read count>=2 in either sample.\n"; |
137 |
print STDERR "$numdiffp0 with FET p-value<=0.001\n"; |
138 |
print STDERR "$numdiffp with FET p-value<=$p_corrected\n"; |
139 |
print STDERR "$numdiff with nfold change >1.5 and p-value<=0.001\n"; |
140 |
|
141 |
# -- |
142 |
if ($outfile) { |
143 |
select(STDOUT); |
144 |
close(OUTF); |
145 |
} |
146 |
if ($dfile) { |
147 |
close(FDIFF); |
148 |
} |
149 |
|
150 |
sub LnFactorial { |
151 |
my $n=shift; |
152 |
die "Bad args to log_factorial: $n" if $n < 0; |
153 |
my $ln_fact; |
154 |
if ($n<900) { |
155 |
return $lnfcache[$n] if defined $lnfcache[$n]; |
156 |
for (my $i = scalar(@lnfcache); $i <= $n; $i++) { |
157 |
$lnfcache[$i] = $lnfcache[$i-1] + log($i); |
158 |
} |
159 |
return $lnfcache[$n]; |
160 |
} |
161 |
else { |
162 |
# Gosper's approximation; from |
163 |
# http://mathworld.wolfram.com/StirlingsApproximation.html |
164 |
$ln_fact = 0.5 * log($twoPI*$n + $pi_3) + $n*log($n) - $n; |
165 |
} |
166 |
return $ln_fact; |
167 |
} |
168 |
|
169 |
|
170 |
# Compute the probability of getting this exact table |
171 |
# using the hypergeometric distribution |
172 |
sub ProbOneTable { |
173 |
my ($a , $b , $c, $d) = @_; |
174 |
my $n = $a + $b + $c + $d; |
175 |
my $LnNumerator = LnFactorial($a+$b)+ |
176 |
LnFactorial($c+$d)+ |
177 |
LnFactorial($a+$c)+ |
178 |
LnFactorial($b+$d); |
179 |
|
180 |
my $LnDenominator = LnFactorial($a) + |
181 |
LnFactorial($b) + |
182 |
LnFactorial($c) + |
183 |
LnFactorial($d) + |
184 |
LnFactorial($n); |
185 |
|
186 |
my $LnP = $LnNumerator - $LnDenominator; |
187 |
return exp($LnP); |
188 |
} |
189 |
|
190 |
# Compute the cumulative probability by adding up individual |
191 |
# probabilities |
192 |
sub calcFET { |
193 |
my ($a, $b, $c, $d) = @_; |
194 |
|
195 |
my $min; |
196 |
|
197 |
my $n = $a + $b + $c + $d; |
198 |
|
199 |
my $p = 0; |
200 |
$p += ProbOneTable($a, $b, $c, $d); |
201 |
if( ($a * $d) >= ($b * $c) ) { |
202 |
$min = ($c < $b) ? $c : $b; |
203 |
for(my $i = 0; $i < $min; $i++) { |
204 |
$p += ProbOneTable(++$a, --$b, --$c, ++$d); |
205 |
} |
206 |
} |
207 |
|
208 |
if ( ($a * $d) < ($b * $c) ) { |
209 |
$min = ($a < $d) ? $a : $d; |
210 |
for(my $i = 0; $i < $min; $i++) { |
211 |
$p += ProbOneTable(--$a, ++$b, ++$c, --$d); |
212 |
} |
213 |
} |
214 |
return $p; |
215 |
} |