ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/miRNA_cmp.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 2 months ago) by gpertea
File size: 6487 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 miRNA_cmp.pl [-o <output.tab>] [-d diffexp.tab] \
8 <sample1_cov.micov>[:<totalreads1>] <sample2_cov.micov>[:<totalreads2>]
9
10 Compares the expression levels of known miRNAs in two experiments
11 based on the the coverage files produced by mgbl_miRNA_cov.pl
12 If -d option is given, that file will have entries with a significant
13 p-value according to Fisher's exact test with Bonferroni correction.
14
15 If total reads counts are not given, a "context total" is computed by
16 summing up existing counts (coverage values) in each sample
17 /;
18 umask 0002;
19 print STDERR "#Command line was:\n".$FindBin::Script.
20 ' '.join(' ',@ARGV)."\n";
21 getopts('d:o:') || die($usage."\n");
22 die($usage."\n") unless @ARGV==2;
23 my $outfile=$Getopt::Std::opt_o;
24 if ($outfile) {
25 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
26 select(OUTF);
27 }
28 # --
29 my $dfile=$Getopt::Std::opt_d;
30 if ($dfile) {
31 open(FDIFF, '>'.$dfile) || die("Error creating output file $dfile\n");
32 }
33 my ($fd1, $fd2)=@ARGV;
34 my ($f1, $total1)=split(/[\:\,~]+/, $fd1);
35 die("$usage File $f1 not found!\n") unless -f $f1;
36 #die("$usage Total reads incorrect!\n") unless $total1>10;
37 my ($f2, $total2)=split(/[\:\,~]+/, $fd2);
38 die("$usage File $f2 not found!\n") unless -f $f2;
39 #die("$usage Total reads incorrect!\n") unless $total2>10;
40 if ($total1>0 && $total2>0) {
41 print STDERR "Total reads for $f1: $total1\n";
42 print STDERR "Total reads for $f2: $total2\n";
43 }
44 # -- Fisher's exact test - utility constants
45 my $PI=atan2(0,-1);
46 my $twoPI=2*$PI;
47 my $pi_3=$PI/3;
48 my @lnfcache=(0,0);
49 #---
50
51 my %mi; # miRNA => [ cov_sample_1, cov_sample_2 ]
52 open(FIN, $f1) || die("Error opening file $f1\n");
53 while (<FIN>) {
54 chomp;
55 my @t=split(/\t/);
56 # n1 n2 foldchange normalized Fisher p
57 $t[0]=lc($t[0]);
58 $mi{$t[0]}=[$t[2], 0, 0, 0, 0];
59 }
60 close(FIN);
61
62 open(FIN, $f2) || die("Error opening file $f2\n");
63 while(<FIN>) {
64 chomp;
65 my @t=split(/\t/);
66 $t[0]=lc($t[0]);
67 my $d=$mi{$t[0]};
68 if ($d) {
69 $$d[1]=$t[2];
70 }
71 else {
72 $mi{$t[0]}=[0, $t[2], 0, 0];
73 }
74 }
75 close(FIN);
76 #print '#'.join("\t", 'microRNA', 'count1', 'count2', 'fold_change', 'nfold_change', 'FET_p')."\n";
77 if ($total1<10 && $total2<10) { #compute contextual totals here
78 $total1=0;
79 $total2=0;
80 while (my ($mirna, $md)=each(%mi)) {
81 # 0 1 2 3 4
82 my ($n1, $n2, $fchange, $nfchange, $fisher)=@$md;
83 next if $n1<2 && $n2<2;
84 $total1+=$n1;
85 $total2+=$n2;
86 }
87 print STDERR "Total reads given for sample 1: $total1\n";
88 print STDERR "Total reads given for sample 2: $total2\n";
89 } # compute totals if not given
90 my $num_miRNAs=keys(%mi);
91 print STDERR "$num_miRNAs total miRNAs seen.\n";
92 my $p_corrected=0.05/$num_miRNAs;
93 print STDERR "Bonferroni-corrected p-value is 0.05/$num_miRNAs = $p_corrected\n";
94 print join("\t", '#totals:', 'placebo|'.$total1, 'statin|'.$total2, $p_corrected)."\n";
95 my $numexpr;
96 my $numdiffp; # count of miRNAs with p<$p_corrected
97 my $numdiffp0; # count of miRNAs with p<=0.001
98 my $numdiff; # count of miRNAs with p<=0.001 AND abs(foldchange)>1.5
99 my $t_ratio = $total1/$total2;
100
101 while (my ($mirna, $md)=each(%mi)) {
102 # 0 1 2 3 4
103 my ($n1, $n2, $fchange, $nfchange, $fisher)=@$md;
104 next if $n1<2 && $n2<2;
105 my ($n1real, $n2real)=(int($n1),int($n2));
106 $n1=1 if $n1==0;
107 $n2=1 if $n2==0;
108 my ($nn1, $nn2)=($n1, $n2*$t_ratio);
109 $fchange =($n1<$n2) ? '+'.sprintf('%.2fx',$n2/$n1) : '-'.sprintf('%.2fx', $n1/$n2);
110 $nfchange = ($nn1<$nn2) ? '+'.sprintf('%.2fx',$nn2/$nn1) : '-'.sprintf('%.2fx', $nn1/$nn2);
111 $$md[2]=$fchange;
112 $$md[3]=$nfchange;
113 my $nfc=$nfchange;
114 $nfc=~tr/x+-//d;
115 my ($a, $b, $c, $d)=($n1, $total1-$n1, $n2, $total2-$n2);
116 $fisher=calcFET($a, $b, $c, $d);
117 $$md[4]=$fisher;
118 $numexpr++;
119 my $diffprinted=0;
120 #print join("\t", $mirna, $n1, $n2, $fchange, $nfchange, $fisher, $nfc)."\n";
121
122 print join("\t", $mirna, $n1real, $n2real, $nfchange, $fisher)."\n";
123 if ($fisher<=0.001 && $nfc>=1.5) {
124 print FDIFF join("\t", $mirna, $n1real, $n2real, $fchange, $nfchange, $fisher, $nfc)."\n";
125 $diffprinted=1;
126 $numdiff++;
127 }
128 $numdiffp0++ if ($fisher<=0.001);
129 if ($fisher<=$p_corrected) {
130 $numdiffp++;
131 print FDIFF join("\t", $mirna, $n1real, $n2real, $fchange, $nfchange, $fisher, $nfc)."\n"
132 if ($dfile && $diffprinted==0);
133 }
134 }
135
136 print STDERR "$numexpr miRNAs with read count>=2 in either sample.\n";
137 print STDERR "$numdiffp0 with FET p-value<=0.001\n";
138 print STDERR "$numdiffp with FET p-value<=$p_corrected\n";
139 print STDERR "$numdiff with nfold change >1.5 and p-value<=0.001\n";
140
141 # --
142 if ($outfile) {
143 select(STDOUT);
144 close(OUTF);
145 }
146 if ($dfile) {
147 close(FDIFF);
148 }
149
150 sub LnFactorial {
151 my $n=shift;
152 die "Bad args to log_factorial: $n" if $n < 0;
153 my $ln_fact;
154 if ($n<900) {
155 return $lnfcache[$n] if defined $lnfcache[$n];
156 for (my $i = scalar(@lnfcache); $i <= $n; $i++) {
157 $lnfcache[$i] = $lnfcache[$i-1] + log($i);
158 }
159 return $lnfcache[$n];
160 }
161 else {
162 # Gosper's approximation; from
163 # http://mathworld.wolfram.com/StirlingsApproximation.html
164 $ln_fact = 0.5 * log($twoPI*$n + $pi_3) + $n*log($n) - $n;
165 }
166 return $ln_fact;
167 }
168
169
170 # Compute the probability of getting this exact table
171 # using the hypergeometric distribution
172 sub ProbOneTable {
173 my ($a , $b , $c, $d) = @_;
174 my $n = $a + $b + $c + $d;
175 my $LnNumerator = LnFactorial($a+$b)+
176 LnFactorial($c+$d)+
177 LnFactorial($a+$c)+
178 LnFactorial($b+$d);
179
180 my $LnDenominator = LnFactorial($a) +
181 LnFactorial($b) +
182 LnFactorial($c) +
183 LnFactorial($d) +
184 LnFactorial($n);
185
186 my $LnP = $LnNumerator - $LnDenominator;
187 return exp($LnP);
188 }
189
190 # Compute the cumulative probability by adding up individual
191 # probabilities
192 sub calcFET {
193 my ($a, $b, $c, $d) = @_;
194
195 my $min;
196
197 my $n = $a + $b + $c + $d;
198
199 my $p = 0;
200 $p += ProbOneTable($a, $b, $c, $d);
201 if( ($a * $d) >= ($b * $c) ) {
202 $min = ($c < $b) ? $c : $b;
203 for(my $i = 0; $i < $min; $i++) {
204 $p += ProbOneTable(++$a, --$b, --$c, ++$d);
205 }
206 }
207
208 if ( ($a * $d) < ($b * $c) ) {
209 $min = ($a < $d) ? $a : $d;
210 for(my $i = 0; $i < $min; $i++) {
211 $p += ProbOneTable(--$a, ++$b, ++$c, --$d);
212 }
213 }
214 return $p;
215 }

Properties

Name Value
svn:executable *