ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/fisher_exact_test.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years ago) by gpertea
File size: 11212 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5     use Fisher qw(fishers_exact log_factorial);
6    
7    
8     my $usage = q/Usage:
9     fisher_exact_test.pl [-T] a1 b1 a2 b2
10     Computes Fisher's exact test p-values for 2x2 matrix:
11    
12     a1 b1
13    
14     a2 b2
15    
16     -T option will make the input values to be taken in terms of totals:
17    
18     t1 t2 n1 n2
19    
20     ..where n1 is a subset of t1 and n2 a subset of t2, i.e.
21     t1=a1+b1 and t2=a2+b2
22    
23     /;
24     umask 0002;
25     getopts('To:') || die($usage."\n");
26     die($usage." Error: exactly 4 values are needed (totals first)!\n") unless @ARGV==4;
27     my $totals=$Getopt::Std::opt_T;
28     my @fncache=(0,0);
29    
30     # -- needed by Audic-Calverie: calc_audic()
31     my $MAXIT = 500;
32     my $EPS = 3.0E-30;
33     my $FPMIN = 1.0E-30;
34     my @COF = ( 76.18009172947146, -86.50532032941677,
35     24.01409824083091, -1.231739572450155,
36     0.1208650973866179E-2,-0.5395239384953E-5 );
37    
38    
39    
40     {
41     my ($a, $b , $c, $d)=@ARGV;
42     if ($totals) {
43     my ($t1, $t2, $n1,$n2)=@ARGV;
44     die("Error: totals should be larger than parts!\n")
45     unless $n1<$t1 && $n2<$t2;
46     ($a, $b , $c, $d)=($n1, $t1-$n1, $n2, $t2-$n2);
47     print STDERR " a b c d = $a $b $c $d\n";
48     #my $ap=audic($t1, $t2, $n1, $n2);
49     my ($ap, $apdir)=calc_audic($n1, $n2, $t1, $t2, 1);
50     print ">>>> Audic & Claverie : $ap ($apdir)\n";
51     }
52     my $FisherProb = ProbCTable($a, $b , $c, $d);
53     print "-----> Fisher Probability: $FisherProb\n";
54     my $fet1=fishers_exact($a,$b,$c,$d);
55     my $fet2=fishers_exact($a,$b,$c,$d, 1);
56     print "Fisher.pm : one-tailed: $fet1\n";
57     print "Fisher.pm : two-tailed: $fet2\n";
58     exit;
59     #--- rest is optional:
60     my $a_b = $a + $b;
61     my $c_d = $c + $d;
62     my $a_c = $a + $c;
63     my $b_d = $b + $d;
64     my $total = $a + $b + $c + $d;
65     my $p1 = $a/$a_b;
66     my $p2 = $c/$c_d;
67     my $deltap = abs($p1 - $p2);
68     my $p = $a_c / $total;
69     my $sd = sqrt($p * (1-$p) * (1/$a_b + 1/$c_d));
70     my $z = abs($p1 - $p2) / $sd;
71     my $zp = abs(ltqnorm($FisherProb));
72     my $z95 = abs(ltqnorm(0.95));
73    
74     my $z_ratio = $z / $zp;
75     my $deltap95 = $deltap * ($z95 / $zp);
76     my $ratio95 = $z95 / $zp;
77     print "Deltap:$deltap\tProjectedDeltap95:$deltap95\tRatio:$ratio95\n\n";
78    
79     print "$a\t$b\t$a_b\n$c\t$d\t$c_d\n".
80     "$a_c\t$b_d\t$total\nFisherProb: $FisherProb <----\n" .
81     "z:$z\nzp:$zp\nratio:$z_ratio\n$p1\t$p2\t$p\n\n";
82    
83     my $p_this_table = ProbOneTable($a , $b , $c , $d);
84     print "This one table p:$p_this_table\n";
85    
86     my $mid_p = $FisherProb - $p_this_table/2;
87     print "Mid-p:$mid_p\n\n";
88     }
89    
90     #================ SUBROUTINES ============
91    
92     sub LnFactorial {
93     my $n = shift;
94     return $fncache[$n] if defined $fncache[$n];
95     return undef if $n < 0;
96     for (my $i = scalar(@fncache); $i <= $n; $i++) {
97     $fncache[$i] = $fncache[$i-1] + log($i);
98     }
99    
100     return $fncache[$n];
101     }
102     sub factorial {
103     my $r=1;
104     $r *= $_ foreach 2..$_[0];
105     return $r;
106     }
107    
108     #compute Audic & Claverie probability
109     sub audic {
110     my ($n1, $n2, $x, $y)=@_;
111     my $v=$y*log($n2/$n1)+log_factorial($x+$y)-log_factorial($x)-log_factorial($y)-
112     ($x+$y+1)*log(1+$n2/$n1);
113     return exp($v);
114     #my $nratio=$n2/$n1;
115     #my $v=($nratio ** $y)*factorial($x+$y)/(factorial($x)*factorial($y)*((1+$nratio)**($x+$y+1)));
116    
117     }
118    
119     sub calc_audic {
120     =pod
121    
122     =head2 calc_audic $x, $y, $Nx, $Ny, <$signedValue>
123    
124     Determines the statistical significance of the difference
125     in tag count (expression) between two libraries. This
126     function uses the method described by Audic and
127     Claverie (1997). This method can be called on an
128     instantiated object, as well as statically.
129    
130     B<Arguments>
131    
132     I<$x,$y>
133    
134     The number of tags in the x- and y-axis
135     libraries, respectively.
136    
137     I<$Nx,$Ny>
138    
139     The total number of tags in the x- and y-axis
140     libraries, respectively.
141    
142     I<$signedValue> (optional)
143    
144     A boolean value (>=1 is FALSE). If this flag is
145     set to TRUE, downregulated comparisons will return
146     both a p-value and either +1, -1, or 0 to indicate
147     up/down/same-regulation (i.e. -1 if the expression
148     ratio of tags in the x-axis library(s) is greater
149     than that of the y-axis library(s)). This flag
150     is FALSE by default.
151    
152     B<Returns>
153    
154     The p-value for the observation. A lower number is
155     more significant. Typically, observations with
156     p <= 0.05 are considered statistically significant.
157    
158     If $signedValue is set to TRUE, the function also
159     returns a 0, -1 or +1 to indicate same/down/up-regulation.
160    
161     B<Usage>
162    
163     # the function is static, so it can be accessed directly
164     my $p = calc_audic( 3, 10, 50000, 60000 );
165    
166     # or:
167     my ( $p, $sign ) = calc_audic( 3, 10, 50000, 60000, 1 );
168     if( $p <= 0.05 ) {
169     if( $sign == +1 ) { print "Significantly upregulated.\n"; }
170     if( $sign == -1 ) { print "Significantly downregulated.\n"; }
171     if( $sign == 0 ) { die( "Same expression should never be significant!" ); }
172     }
173    
174     =cut
175     my $x = shift;
176     if( !defined( $x ) ) { die( " calc_audic no arguments provided." ); }
177     my $y = shift;
178     my $M = shift; # cf n1
179     my $N = shift; # cf n2
180     my $bSign = shift || 0;
181    
182     my $p = $M / ( $M+$N );
183    
184     my $thisproba = __betai( $x+1, $y+1, $p );
185     my $thisproba2 = __betai( $y+1, $x+1, 1.0-$p );
186    
187     if( $bSign >= 1 ) {
188     my $ratio1 = $x / $M;
189     my $ratio2 = $y / $N;
190     my $sign = 0;
191     if( $ratio1 > $ratio2 ) { $sign = -1; }
192     if( $ratio1 < $ratio2 ) { $sign = +1; }
193     #return ( ( $thisproba < $thisproba2 ? ( 2*$thisproba ) : ( 2*$thisproba2 ) ), $sign );
194     return ( ($thisproba < $thisproba2) ? $thisproba : $thisproba2, $sign );
195     }
196    
197     # return ( $thisproba < $thisproba2 ? ( 2*$thisproba ) : ( 2*$thisproba2 ) );
198     return ( $thisproba < $thisproba2) ? $thisproba : $thisproba2 ;
199    
200     }
201    
202     ###################################
203     # Audic and Claverie C->Perl Port #
204     ###################################
205    
206     sub __gammln {
207    
208     my $xx = shift;
209    
210     my $x = $xx;
211     my $y = $xx;
212    
213     my $tmp = $x + 5.5;
214     $tmp -= ( $x + 0.5 ) * log( $tmp );
215     my $ser = 1.000000000190015;
216     for( my $j = 0; $j <= 5; $j++ ) { $ser += $COF[$j] / ++$y; }
217    
218     return -$tmp + log( 2.5066282746310005 * $ser / $x );
219    
220     }
221    
222     sub __betai {
223     my $a = shift;
224     my $b = shift;
225     my $x = shift;
226    
227     if( $x < 0.0 || $x > 1.0 ) {
228     die( "Bad x in routine betai." );
229     }
230    
231     my $bt;
232    
233     if( $x == 0.0 || $x == 1.0 ) {
234     $bt = 0.0;
235     } else {
236     $bt = exp( __gammln( $a+$b ) - __gammln( $a ) - __gammln( $b ) + $a*log( $x ) + $b*log( 1.0-$x ) );
237     }
238    
239     if( $x < ( $a+1.0 )/( $a+$b+2.0 ) ) {
240     return $bt * __betacf( $a, $b, $x ) / $a;
241     }
242    
243     return 1.0 - $bt * __betacf( $b, $a, 1.0-$x ) / $b;
244    
245     }
246    
247     sub __fabs {
248     my $x = shift;
249     return ( $x < 0 ? -$x : $x );
250     }
251    
252     sub __betacf {
253    
254     my $a = shift;
255     my $b = shift;
256     my $x = shift;
257    
258     my $qab = $a + $b;
259     my $qap = $a + 1.0;
260     my $qam = $a - 1.0;
261     my $c = 1.0;
262     my $d = 1.0 - $qab * $x / $qap;
263    
264     if( __fabs( $d ) < $FPMIN ) { $d = $FPMIN; }
265    
266     $d = 1.0 / $d; # inverse d
267     my $h = $d;
268     my $m;
269     for( $m = 1; $m <= $MAXIT; $m++ ) {
270     my $m2 = 2 * $m;
271     my $aa = $m * ( $b-$m ) * $x / ( ( $qam + $m2 ) * ( $a + $m2 ) );
272    
273     $d = 1.0 + $aa*$d;
274     if( __fabs( $d ) < $FPMIN ) { $d = $FPMIN; }
275    
276     $c = 1.0 + $aa/$c;
277     if( __fabs( $c ) < $FPMIN ) { $c = $FPMIN; }
278    
279     $d = 1.0 / $d; # inverse d
280    
281     $h *= $d*$c;
282    
283     $aa = -($a+$m)*($qab+$m)*$x/(($a+$m2)*($qap+$m2));
284    
285     $d = 1.0 + $aa * $d;
286     if( __fabs( $d ) < $FPMIN ) { $d = $FPMIN; }
287    
288     $c = 1.0 + $aa / $c;
289     if( __fabs( $c ) < $FPMIN ) { $c = $FPMIN; }
290    
291     $d = 1.0 / $d; # inverse d;
292    
293     my $del = $d*$c;
294     $h *= $del;
295     if( __fabs( $del-1.0 ) < $EPS ) { last; }
296     }
297    
298     if( $m > $MAXIT ) {
299     # die( "a or b too big, or MAXIT too small in __betacf" );
300     print STDERR "a($a) or b($b) too big, or MAXIT($MAXIT) too small in __betacf!\n";
301     }
302     return $h;
303     }
304    
305    
306    
307     # Compute the probability of getting this exact table
308     # using the hypergeometric distribution
309     sub ProbOneTable {
310     my ($a , $b , $c, $d) = @_;
311     my $n = $a + $b + $c + $d;
312     my $LnNumerator = LnFactorial($a+$b)+
313     LnFactorial($c+$d)+
314     LnFactorial($a+$c)+
315     LnFactorial($b+$d);
316    
317     my $LnDenominator = LnFactorial($a) +
318     LnFactorial($b) +
319     LnFactorial($c) +
320     LnFactorial($d) +
321     LnFactorial($n);
322    
323     my $LnP = $LnNumerator - $LnDenominator;
324     return exp($LnP);
325     }
326    
327     # Compute the cumulative probability by adding up individual
328     # probabilities
329     sub ProbCTable {
330     my ($a, $b, $c, $d) = @_;
331    
332     my $min;
333    
334     my $n = $a + $b + $c + $d;
335    
336     my $p = 0;
337     $p += ProbOneTable($a, $b, $c, $d);
338     if( ($a * $d) >= ($b * $c) ) {
339     $min = ($c < $b) ? $c : $b;
340     for(my $i = 0; $i < $min; $i++) {
341     $p += ProbOneTable(++$a, --$b, --$c, ++$d);
342     }
343     }
344    
345     if ( ($a * $d) < ($b * $c) ) {
346     $min = ($a < $d) ? $a : $d;
347     for(my $i = 0; $i < $min; $i++) {
348     $p += ProbOneTable(--$a, ++$b, ++$c, --$d);
349     }
350     }
351     return $p;
352     }
353    
354     # Lower tail quantile for standard normal distribution function.
355     #
356     # This function returns an approximation of the inverse cumulative
357     # standard normal distribution function. I.e., given P, it returns
358     # an approximation to the X satisfying P = Pr{Z <= X} where Z is a
359     # random variable from the standard normal distribution.
360     #
361     # The algorithm uses a minimax approximation by rational functions
362     # and the result has a relative error whose absolute value is less
363     # than 1.15e-9.
364     #
365     # Author: Peter J. Acklam
366     # Time-stamp: 2000-07-19 18:26:14
367     # E-mail: pjacklam@online.no
368     # WWW URL: http://home.online.no/~pjacklam
369     sub ltqnorm {
370     my $p = shift;
371     die "input argument must be in (0,1)\n" unless 0 < $p && $p < 1;
372    
373     # Coefficients in rational approximations.
374     my @a = (-3.969683028665376e+01, 2.209460984245205e+02,
375     -2.759285104469687e+02, 1.383577518672690e+02,
376     -3.066479806614716e+01, 2.506628277459239e+00);
377     my @b = (-5.447609879822406e+01, 1.615858368580409e+02,
378     -1.556989798598866e+02, 6.680131188771972e+01,
379     -1.328068155288572e+01 );
380     my @c = (-7.784894002430293e-03, -3.223964580411365e-01,
381     -2.400758277161838e+00, -2.549732539343734e+00,
382     4.374664141464968e+00, 2.938163982698783e+00);
383     my @d = ( 7.784695709041462e-03, 3.224671290700398e-01,
384     2.445134137142996e+00, 3.754408661907416e+00);
385    
386     # Define break-points.
387     my $plow = 0.02425;
388     my $phigh = 1 - $plow;
389    
390     # Rational approximation for lower region:
391     if ( $p < $plow ) {
392     my $q = sqrt(-2*log($p));
393     return ((((($c[0]*$q+$c[1])*$q+$c[2])*$q+$c[3])*$q+$c[4])*$q+$c[5]) /
394     (((($d[0]*$q+$d[1])*$q+$d[2])*$q+$d[3])*$q+1);
395     }
396    
397     # Rational approximation for upper region:
398     if ( $phigh < $p ) {
399     my $q = sqrt(-2*log(1-$p));
400     return -((((($c[0]*$q+$c[1])*$q+$c[2])*$q+$c[3])*$q+$c[4])*$q+$c[5]) /
401     (((($d[0]*$q+$d[1])*$q+$d[2])*$q+$d[3])*$q+1);
402     }
403    
404     # Rational approximation for central region:
405     my $q = $p - 0.5;
406     my $r = $q*$q;
407     return ((((($a[0]*$r+$a[1])*$r+$a[2])*$r+$a[3])*$r+$a[4])*$r+$a[5])*$q /
408     ((((($b[0]*$r+$b[1])*$r+$b[2])*$r+$b[3])*$r+$b[4])*$r+1);
409     }

Properties

Name Value
svn:executable *