1 |
gpertea |
23 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
use Getopt::Std; |
4 |
|
|
use FindBin;use lib $FindBin::Bin; |
5 |
|
|
use Fisher qw(fishers_exact log_factorial); |
6 |
|
|
|
7 |
|
|
|
8 |
|
|
my $usage = q/Usage: |
9 |
|
|
fisher_exact_test.pl [-T] a1 b1 a2 b2 |
10 |
|
|
Computes Fisher's exact test p-values for 2x2 matrix: |
11 |
|
|
|
12 |
|
|
a1 b1 |
13 |
|
|
|
14 |
|
|
a2 b2 |
15 |
|
|
|
16 |
|
|
-T option will make the input values to be taken in terms of totals: |
17 |
|
|
|
18 |
|
|
t1 t2 n1 n2 |
19 |
|
|
|
20 |
|
|
..where n1 is a subset of t1 and n2 a subset of t2, i.e. |
21 |
|
|
t1=a1+b1 and t2=a2+b2 |
22 |
|
|
|
23 |
|
|
/; |
24 |
|
|
umask 0002; |
25 |
|
|
getopts('To:') || die($usage."\n"); |
26 |
|
|
die($usage." Error: exactly 4 values are needed (totals first)!\n") unless @ARGV==4; |
27 |
|
|
my $totals=$Getopt::Std::opt_T; |
28 |
|
|
my @fncache=(0,0); |
29 |
|
|
|
30 |
|
|
# -- needed by Audic-Calverie: calc_audic() |
31 |
|
|
my $MAXIT = 500; |
32 |
|
|
my $EPS = 3.0E-30; |
33 |
|
|
my $FPMIN = 1.0E-30; |
34 |
|
|
my @COF = ( 76.18009172947146, -86.50532032941677, |
35 |
|
|
24.01409824083091, -1.231739572450155, |
36 |
|
|
0.1208650973866179E-2,-0.5395239384953E-5 ); |
37 |
|
|
|
38 |
|
|
|
39 |
|
|
|
40 |
|
|
{ |
41 |
|
|
my ($a, $b , $c, $d)=@ARGV; |
42 |
|
|
if ($totals) { |
43 |
|
|
my ($t1, $t2, $n1,$n2)=@ARGV; |
44 |
|
|
die("Error: totals should be larger than parts!\n") |
45 |
|
|
unless $n1<$t1 && $n2<$t2; |
46 |
|
|
($a, $b , $c, $d)=($n1, $t1-$n1, $n2, $t2-$n2); |
47 |
|
|
print STDERR " a b c d = $a $b $c $d\n"; |
48 |
|
|
#my $ap=audic($t1, $t2, $n1, $n2); |
49 |
|
|
my ($ap, $apdir)=calc_audic($n1, $n2, $t1, $t2, 1); |
50 |
|
|
print ">>>> Audic & Claverie : $ap ($apdir)\n"; |
51 |
|
|
} |
52 |
|
|
my $FisherProb = ProbCTable($a, $b , $c, $d); |
53 |
|
|
print "-----> Fisher Probability: $FisherProb\n"; |
54 |
|
|
my $fet1=fishers_exact($a,$b,$c,$d); |
55 |
|
|
my $fet2=fishers_exact($a,$b,$c,$d, 1); |
56 |
|
|
print "Fisher.pm : one-tailed: $fet1\n"; |
57 |
|
|
print "Fisher.pm : two-tailed: $fet2\n"; |
58 |
|
|
exit; |
59 |
|
|
#--- rest is optional: |
60 |
|
|
my $a_b = $a + $b; |
61 |
|
|
my $c_d = $c + $d; |
62 |
|
|
my $a_c = $a + $c; |
63 |
|
|
my $b_d = $b + $d; |
64 |
|
|
my $total = $a + $b + $c + $d; |
65 |
|
|
my $p1 = $a/$a_b; |
66 |
|
|
my $p2 = $c/$c_d; |
67 |
|
|
my $deltap = abs($p1 - $p2); |
68 |
|
|
my $p = $a_c / $total; |
69 |
|
|
my $sd = sqrt($p * (1-$p) * (1/$a_b + 1/$c_d)); |
70 |
|
|
my $z = abs($p1 - $p2) / $sd; |
71 |
|
|
my $zp = abs(ltqnorm($FisherProb)); |
72 |
|
|
my $z95 = abs(ltqnorm(0.95)); |
73 |
|
|
|
74 |
|
|
my $z_ratio = $z / $zp; |
75 |
|
|
my $deltap95 = $deltap * ($z95 / $zp); |
76 |
|
|
my $ratio95 = $z95 / $zp; |
77 |
|
|
print "Deltap:$deltap\tProjectedDeltap95:$deltap95\tRatio:$ratio95\n\n"; |
78 |
|
|
|
79 |
|
|
print "$a\t$b\t$a_b\n$c\t$d\t$c_d\n". |
80 |
|
|
"$a_c\t$b_d\t$total\nFisherProb: $FisherProb <----\n" . |
81 |
|
|
"z:$z\nzp:$zp\nratio:$z_ratio\n$p1\t$p2\t$p\n\n"; |
82 |
|
|
|
83 |
|
|
my $p_this_table = ProbOneTable($a , $b , $c , $d); |
84 |
|
|
print "This one table p:$p_this_table\n"; |
85 |
|
|
|
86 |
|
|
my $mid_p = $FisherProb - $p_this_table/2; |
87 |
|
|
print "Mid-p:$mid_p\n\n"; |
88 |
|
|
} |
89 |
|
|
|
90 |
|
|
#================ SUBROUTINES ============ |
91 |
|
|
|
92 |
|
|
sub LnFactorial { |
93 |
|
|
my $n = shift; |
94 |
|
|
return $fncache[$n] if defined $fncache[$n]; |
95 |
|
|
return undef if $n < 0; |
96 |
|
|
for (my $i = scalar(@fncache); $i <= $n; $i++) { |
97 |
|
|
$fncache[$i] = $fncache[$i-1] + log($i); |
98 |
|
|
} |
99 |
|
|
|
100 |
|
|
return $fncache[$n]; |
101 |
|
|
} |
102 |
|
|
sub factorial { |
103 |
|
|
my $r=1; |
104 |
|
|
$r *= $_ foreach 2..$_[0]; |
105 |
|
|
return $r; |
106 |
|
|
} |
107 |
|
|
|
108 |
|
|
#compute Audic & Claverie probability |
109 |
|
|
sub audic { |
110 |
|
|
my ($n1, $n2, $x, $y)=@_; |
111 |
|
|
my $v=$y*log($n2/$n1)+log_factorial($x+$y)-log_factorial($x)-log_factorial($y)- |
112 |
|
|
($x+$y+1)*log(1+$n2/$n1); |
113 |
|
|
return exp($v); |
114 |
|
|
#my $nratio=$n2/$n1; |
115 |
|
|
#my $v=($nratio ** $y)*factorial($x+$y)/(factorial($x)*factorial($y)*((1+$nratio)**($x+$y+1))); |
116 |
|
|
|
117 |
|
|
} |
118 |
|
|
|
119 |
|
|
sub calc_audic { |
120 |
|
|
=pod |
121 |
|
|
|
122 |
|
|
=head2 calc_audic $x, $y, $Nx, $Ny, <$signedValue> |
123 |
|
|
|
124 |
|
|
Determines the statistical significance of the difference |
125 |
|
|
in tag count (expression) between two libraries. This |
126 |
|
|
function uses the method described by Audic and |
127 |
|
|
Claverie (1997). This method can be called on an |
128 |
|
|
instantiated object, as well as statically. |
129 |
|
|
|
130 |
|
|
B<Arguments> |
131 |
|
|
|
132 |
|
|
I<$x,$y> |
133 |
|
|
|
134 |
|
|
The number of tags in the x- and y-axis |
135 |
|
|
libraries, respectively. |
136 |
|
|
|
137 |
|
|
I<$Nx,$Ny> |
138 |
|
|
|
139 |
|
|
The total number of tags in the x- and y-axis |
140 |
|
|
libraries, respectively. |
141 |
|
|
|
142 |
|
|
I<$signedValue> (optional) |
143 |
|
|
|
144 |
|
|
A boolean value (>=1 is FALSE). If this flag is |
145 |
|
|
set to TRUE, downregulated comparisons will return |
146 |
|
|
both a p-value and either +1, -1, or 0 to indicate |
147 |
|
|
up/down/same-regulation (i.e. -1 if the expression |
148 |
|
|
ratio of tags in the x-axis library(s) is greater |
149 |
|
|
than that of the y-axis library(s)). This flag |
150 |
|
|
is FALSE by default. |
151 |
|
|
|
152 |
|
|
B<Returns> |
153 |
|
|
|
154 |
|
|
The p-value for the observation. A lower number is |
155 |
|
|
more significant. Typically, observations with |
156 |
|
|
p <= 0.05 are considered statistically significant. |
157 |
|
|
|
158 |
|
|
If $signedValue is set to TRUE, the function also |
159 |
|
|
returns a 0, -1 or +1 to indicate same/down/up-regulation. |
160 |
|
|
|
161 |
|
|
B<Usage> |
162 |
|
|
|
163 |
|
|
# the function is static, so it can be accessed directly |
164 |
|
|
my $p = calc_audic( 3, 10, 50000, 60000 ); |
165 |
|
|
|
166 |
|
|
# or: |
167 |
|
|
my ( $p, $sign ) = calc_audic( 3, 10, 50000, 60000, 1 ); |
168 |
|
|
if( $p <= 0.05 ) { |
169 |
|
|
if( $sign == +1 ) { print "Significantly upregulated.\n"; } |
170 |
|
|
if( $sign == -1 ) { print "Significantly downregulated.\n"; } |
171 |
|
|
if( $sign == 0 ) { die( "Same expression should never be significant!" ); } |
172 |
|
|
} |
173 |
|
|
|
174 |
|
|
=cut |
175 |
|
|
my $x = shift; |
176 |
|
|
if( !defined( $x ) ) { die( " calc_audic no arguments provided." ); } |
177 |
|
|
my $y = shift; |
178 |
|
|
my $M = shift; # cf n1 |
179 |
|
|
my $N = shift; # cf n2 |
180 |
|
|
my $bSign = shift || 0; |
181 |
|
|
|
182 |
|
|
my $p = $M / ( $M+$N ); |
183 |
|
|
|
184 |
|
|
my $thisproba = __betai( $x+1, $y+1, $p ); |
185 |
|
|
my $thisproba2 = __betai( $y+1, $x+1, 1.0-$p ); |
186 |
|
|
|
187 |
|
|
if( $bSign >= 1 ) { |
188 |
|
|
my $ratio1 = $x / $M; |
189 |
|
|
my $ratio2 = $y / $N; |
190 |
|
|
my $sign = 0; |
191 |
|
|
if( $ratio1 > $ratio2 ) { $sign = -1; } |
192 |
|
|
if( $ratio1 < $ratio2 ) { $sign = +1; } |
193 |
|
|
#return ( ( $thisproba < $thisproba2 ? ( 2*$thisproba ) : ( 2*$thisproba2 ) ), $sign ); |
194 |
|
|
return ( ($thisproba < $thisproba2) ? $thisproba : $thisproba2, $sign ); |
195 |
|
|
} |
196 |
|
|
|
197 |
|
|
# return ( $thisproba < $thisproba2 ? ( 2*$thisproba ) : ( 2*$thisproba2 ) ); |
198 |
|
|
return ( $thisproba < $thisproba2) ? $thisproba : $thisproba2 ; |
199 |
|
|
|
200 |
|
|
} |
201 |
|
|
|
202 |
|
|
################################### |
203 |
|
|
# Audic and Claverie C->Perl Port # |
204 |
|
|
################################### |
205 |
|
|
|
206 |
|
|
sub __gammln { |
207 |
|
|
|
208 |
|
|
my $xx = shift; |
209 |
|
|
|
210 |
|
|
my $x = $xx; |
211 |
|
|
my $y = $xx; |
212 |
|
|
|
213 |
|
|
my $tmp = $x + 5.5; |
214 |
|
|
$tmp -= ( $x + 0.5 ) * log( $tmp ); |
215 |
|
|
my $ser = 1.000000000190015; |
216 |
|
|
for( my $j = 0; $j <= 5; $j++ ) { $ser += $COF[$j] / ++$y; } |
217 |
|
|
|
218 |
|
|
return -$tmp + log( 2.5066282746310005 * $ser / $x ); |
219 |
|
|
|
220 |
|
|
} |
221 |
|
|
|
222 |
|
|
sub __betai { |
223 |
|
|
my $a = shift; |
224 |
|
|
my $b = shift; |
225 |
|
|
my $x = shift; |
226 |
|
|
|
227 |
|
|
if( $x < 0.0 || $x > 1.0 ) { |
228 |
|
|
die( "Bad x in routine betai." ); |
229 |
|
|
} |
230 |
|
|
|
231 |
|
|
my $bt; |
232 |
|
|
|
233 |
|
|
if( $x == 0.0 || $x == 1.0 ) { |
234 |
|
|
$bt = 0.0; |
235 |
|
|
} else { |
236 |
|
|
$bt = exp( __gammln( $a+$b ) - __gammln( $a ) - __gammln( $b ) + $a*log( $x ) + $b*log( 1.0-$x ) ); |
237 |
|
|
} |
238 |
|
|
|
239 |
|
|
if( $x < ( $a+1.0 )/( $a+$b+2.0 ) ) { |
240 |
|
|
return $bt * __betacf( $a, $b, $x ) / $a; |
241 |
|
|
} |
242 |
|
|
|
243 |
|
|
return 1.0 - $bt * __betacf( $b, $a, 1.0-$x ) / $b; |
244 |
|
|
|
245 |
|
|
} |
246 |
|
|
|
247 |
|
|
sub __fabs { |
248 |
|
|
my $x = shift; |
249 |
|
|
return ( $x < 0 ? -$x : $x ); |
250 |
|
|
} |
251 |
|
|
|
252 |
|
|
sub __betacf { |
253 |
|
|
|
254 |
|
|
my $a = shift; |
255 |
|
|
my $b = shift; |
256 |
|
|
my $x = shift; |
257 |
|
|
|
258 |
|
|
my $qab = $a + $b; |
259 |
|
|
my $qap = $a + 1.0; |
260 |
|
|
my $qam = $a - 1.0; |
261 |
|
|
my $c = 1.0; |
262 |
|
|
my $d = 1.0 - $qab * $x / $qap; |
263 |
|
|
|
264 |
|
|
if( __fabs( $d ) < $FPMIN ) { $d = $FPMIN; } |
265 |
|
|
|
266 |
|
|
$d = 1.0 / $d; # inverse d |
267 |
|
|
my $h = $d; |
268 |
|
|
my $m; |
269 |
|
|
for( $m = 1; $m <= $MAXIT; $m++ ) { |
270 |
|
|
my $m2 = 2 * $m; |
271 |
|
|
my $aa = $m * ( $b-$m ) * $x / ( ( $qam + $m2 ) * ( $a + $m2 ) ); |
272 |
|
|
|
273 |
|
|
$d = 1.0 + $aa*$d; |
274 |
|
|
if( __fabs( $d ) < $FPMIN ) { $d = $FPMIN; } |
275 |
|
|
|
276 |
|
|
$c = 1.0 + $aa/$c; |
277 |
|
|
if( __fabs( $c ) < $FPMIN ) { $c = $FPMIN; } |
278 |
|
|
|
279 |
|
|
$d = 1.0 / $d; # inverse d |
280 |
|
|
|
281 |
|
|
$h *= $d*$c; |
282 |
|
|
|
283 |
|
|
$aa = -($a+$m)*($qab+$m)*$x/(($a+$m2)*($qap+$m2)); |
284 |
|
|
|
285 |
|
|
$d = 1.0 + $aa * $d; |
286 |
|
|
if( __fabs( $d ) < $FPMIN ) { $d = $FPMIN; } |
287 |
|
|
|
288 |
|
|
$c = 1.0 + $aa / $c; |
289 |
|
|
if( __fabs( $c ) < $FPMIN ) { $c = $FPMIN; } |
290 |
|
|
|
291 |
|
|
$d = 1.0 / $d; # inverse d; |
292 |
|
|
|
293 |
|
|
my $del = $d*$c; |
294 |
|
|
$h *= $del; |
295 |
|
|
if( __fabs( $del-1.0 ) < $EPS ) { last; } |
296 |
|
|
} |
297 |
|
|
|
298 |
|
|
if( $m > $MAXIT ) { |
299 |
|
|
# die( "a or b too big, or MAXIT too small in __betacf" ); |
300 |
|
|
print STDERR "a($a) or b($b) too big, or MAXIT($MAXIT) too small in __betacf!\n"; |
301 |
|
|
} |
302 |
|
|
return $h; |
303 |
|
|
} |
304 |
|
|
|
305 |
|
|
|
306 |
|
|
|
307 |
|
|
# Compute the probability of getting this exact table |
308 |
|
|
# using the hypergeometric distribution |
309 |
|
|
sub ProbOneTable { |
310 |
|
|
my ($a , $b , $c, $d) = @_; |
311 |
|
|
my $n = $a + $b + $c + $d; |
312 |
|
|
my $LnNumerator = LnFactorial($a+$b)+ |
313 |
|
|
LnFactorial($c+$d)+ |
314 |
|
|
LnFactorial($a+$c)+ |
315 |
|
|
LnFactorial($b+$d); |
316 |
|
|
|
317 |
|
|
my $LnDenominator = LnFactorial($a) + |
318 |
|
|
LnFactorial($b) + |
319 |
|
|
LnFactorial($c) + |
320 |
|
|
LnFactorial($d) + |
321 |
|
|
LnFactorial($n); |
322 |
|
|
|
323 |
|
|
my $LnP = $LnNumerator - $LnDenominator; |
324 |
|
|
return exp($LnP); |
325 |
|
|
} |
326 |
|
|
|
327 |
|
|
# Compute the cumulative probability by adding up individual |
328 |
|
|
# probabilities |
329 |
|
|
sub ProbCTable { |
330 |
|
|
my ($a, $b, $c, $d) = @_; |
331 |
|
|
|
332 |
|
|
my $min; |
333 |
|
|
|
334 |
|
|
my $n = $a + $b + $c + $d; |
335 |
|
|
|
336 |
|
|
my $p = 0; |
337 |
|
|
$p += ProbOneTable($a, $b, $c, $d); |
338 |
|
|
if( ($a * $d) >= ($b * $c) ) { |
339 |
|
|
$min = ($c < $b) ? $c : $b; |
340 |
|
|
for(my $i = 0; $i < $min; $i++) { |
341 |
|
|
$p += ProbOneTable(++$a, --$b, --$c, ++$d); |
342 |
|
|
} |
343 |
|
|
} |
344 |
|
|
|
345 |
|
|
if ( ($a * $d) < ($b * $c) ) { |
346 |
|
|
$min = ($a < $d) ? $a : $d; |
347 |
|
|
for(my $i = 0; $i < $min; $i++) { |
348 |
|
|
$p += ProbOneTable(--$a, ++$b, ++$c, --$d); |
349 |
|
|
} |
350 |
|
|
} |
351 |
|
|
return $p; |
352 |
|
|
} |
353 |
|
|
|
354 |
|
|
# Lower tail quantile for standard normal distribution function. |
355 |
|
|
# |
356 |
|
|
# This function returns an approximation of the inverse cumulative |
357 |
|
|
# standard normal distribution function. I.e., given P, it returns |
358 |
|
|
# an approximation to the X satisfying P = Pr{Z <= X} where Z is a |
359 |
|
|
# random variable from the standard normal distribution. |
360 |
|
|
# |
361 |
|
|
# The algorithm uses a minimax approximation by rational functions |
362 |
|
|
# and the result has a relative error whose absolute value is less |
363 |
|
|
# than 1.15e-9. |
364 |
|
|
# |
365 |
|
|
# Author: Peter J. Acklam |
366 |
|
|
# Time-stamp: 2000-07-19 18:26:14 |
367 |
|
|
# E-mail: pjacklam@online.no |
368 |
|
|
# WWW URL: http://home.online.no/~pjacklam |
369 |
|
|
sub ltqnorm { |
370 |
|
|
my $p = shift; |
371 |
|
|
die "input argument must be in (0,1)\n" unless 0 < $p && $p < 1; |
372 |
|
|
|
373 |
|
|
# Coefficients in rational approximations. |
374 |
|
|
my @a = (-3.969683028665376e+01, 2.209460984245205e+02, |
375 |
|
|
-2.759285104469687e+02, 1.383577518672690e+02, |
376 |
|
|
-3.066479806614716e+01, 2.506628277459239e+00); |
377 |
|
|
my @b = (-5.447609879822406e+01, 1.615858368580409e+02, |
378 |
|
|
-1.556989798598866e+02, 6.680131188771972e+01, |
379 |
|
|
-1.328068155288572e+01 ); |
380 |
|
|
my @c = (-7.784894002430293e-03, -3.223964580411365e-01, |
381 |
|
|
-2.400758277161838e+00, -2.549732539343734e+00, |
382 |
|
|
4.374664141464968e+00, 2.938163982698783e+00); |
383 |
|
|
my @d = ( 7.784695709041462e-03, 3.224671290700398e-01, |
384 |
|
|
2.445134137142996e+00, 3.754408661907416e+00); |
385 |
|
|
|
386 |
|
|
# Define break-points. |
387 |
|
|
my $plow = 0.02425; |
388 |
|
|
my $phigh = 1 - $plow; |
389 |
|
|
|
390 |
|
|
# Rational approximation for lower region: |
391 |
|
|
if ( $p < $plow ) { |
392 |
|
|
my $q = sqrt(-2*log($p)); |
393 |
|
|
return ((((($c[0]*$q+$c[1])*$q+$c[2])*$q+$c[3])*$q+$c[4])*$q+$c[5]) / |
394 |
|
|
(((($d[0]*$q+$d[1])*$q+$d[2])*$q+$d[3])*$q+1); |
395 |
|
|
} |
396 |
|
|
|
397 |
|
|
# Rational approximation for upper region: |
398 |
|
|
if ( $phigh < $p ) { |
399 |
|
|
my $q = sqrt(-2*log(1-$p)); |
400 |
|
|
return -((((($c[0]*$q+$c[1])*$q+$c[2])*$q+$c[3])*$q+$c[4])*$q+$c[5]) / |
401 |
|
|
(((($d[0]*$q+$d[1])*$q+$d[2])*$q+$d[3])*$q+1); |
402 |
|
|
} |
403 |
|
|
|
404 |
|
|
# Rational approximation for central region: |
405 |
|
|
my $q = $p - 0.5; |
406 |
|
|
my $r = $q*$q; |
407 |
|
|
return ((((($a[0]*$r+$a[1])*$r+$a[2])*$r+$a[3])*$r+$a[4])*$r+$a[5])*$q / |
408 |
|
|
((((($b[0]*$r+$b[1])*$r+$b[2])*$r+$b[3])*$r+$b[4])*$r+1); |
409 |
|
|
} |