1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
|
5 |
my $usage = q/Usage: |
6 |
mirfind_common.pl [-t total1:total2] <file1.mirfind> <file2.mirfind> |
7 |
Takes output files of mirfind_score.pl and reports potential miRNAs |
8 |
that have the same location of the mature sequence; |
9 |
A coordinate variation of 1 for either ends is allowed. |
10 |
|
11 |
(input files are assumed sorted by region start coordinate, |
12 |
as they are when they are created by mirfind_score.pl) |
13 |
|
14 |
/; |
15 |
umask 0002; |
16 |
getopts('t:o:') || die($usage."\n"); |
17 |
my $outfile=$Getopt::Std::opt_o; |
18 |
if ($outfile) { |
19 |
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n"); |
20 |
select(OUTF); |
21 |
} |
22 |
# -- |
23 |
die ($usage."\n") unless @ARGV==2; |
24 |
my @alines; |
25 |
my @blines; |
26 |
my $totals=$Getopt::Std::opt_t; |
27 |
my @total=split(/[\:\,\/]/,$totals); |
28 |
my $ratio=$total[1]/$total[0] if $total[0]>0; |
29 |
open(FILA, $ARGV[0]) || die ("Error opening file $ARGV[0]!\n"); |
30 |
open(FILB, $ARGV[1]) || die ("Error opening file $ARGV[1]!\n"); |
31 |
while (<FILA>) { |
32 |
chomp; |
33 |
my @t=split(/\t/); |
34 |
push(@alines, [@t,0]); |
35 |
} |
36 |
close(FILA); |
37 |
@alines = sort { ($$a[1] eq $$b[1]) ? $$a[3]<=>$$b[3] : $$a[1] cmp $$b[1] } @alines; |
38 |
while (<FILB>) { |
39 |
chomp; |
40 |
my @t=split(/\t/); |
41 |
push(@blines, [@t, 0]); |
42 |
} |
43 |
close(FILB); |
44 |
@blines = sort { ($$a[1] eq $$b[1]) ? $$a[3]<=>$$b[3] : $$a[1] cmp $$b[1] } @blines; |
45 |
|
46 |
my ($a_reg, $a_start,$a_chr, $a_strand, $a_end, $a_mstart, $a_mend, $acov, |
47 |
$b_reg, $b_start,$b_chr, $b_strand, $b_end, $b_mstart, $b_mend, $bcov); |
48 |
my $anext; |
49 |
my @ta; #ta[13] has the intron info, if any |
50 |
my @tb; |
51 |
my ($ad, $ai, $bd, $bi); |
52 |
$bi=-1; |
53 |
for ($ai=0; $ai<@alines; $ai++) { |
54 |
$ad=$alines[$ai]; |
55 |
@ta=@$ad; |
56 |
if ($a_start && $a_chr eq $ta[1] && $a_start>$ta[3]) { |
57 |
die("Wrong order for $ARGV[0] lines?! ($a_reg vs $ta[0])!\n"); |
58 |
} |
59 |
#($a_start, $a_end)=($ta[0]=~m/\|(\d+)\-(\d+)$/); |
60 |
($a_start, $a_end)=($ta[3],$ta[4]); |
61 |
($a_reg, $a_chr, $a_strand)=@ta[0..2]; |
62 |
($a_mstart, $a_mend)=($ta[8]=~m/m\|(\d+)\-(\d+)/); |
63 |
($acov)=($ta[6]=~m/(\d+)$/); |
64 |
$ad->[6]=$acov; |
65 |
$ta[6]=$acov; |
66 |
die ("Error parsing mstar/mend!\n") unless $a_mend>$a_mstart && $a_mstart>1; |
67 |
goto AB_CMP if ($anext); |
68 |
NEXT_B: |
69 |
$bi++; |
70 |
last unless $bi<@blines; |
71 |
$bd=$blines[$bi]; |
72 |
@tb=@$bd; |
73 |
if ($b_start && $b_chr eq $tb[1] && $b_start>$tb[3]) { |
74 |
die("Wrong order for $ARGV[1] lines?! ($b_reg vs $tb[0])!\n"); |
75 |
} |
76 |
($b_start, $b_end)=($tb[3],$tb[4]); |
77 |
($b_reg, $b_chr, $b_strand)=@tb[0..2]; |
78 |
($b_mstart, $b_mend)=($tb[8]=~m/m\|(\d+)\-(\d+)/); |
79 |
($bcov)=($tb[6]=~m/(\d+)$/); |
80 |
$bd->[6]=$bcov; |
81 |
$tb[6]=$bcov; |
82 |
AB_CMP: |
83 |
$anext=0; |
84 |
if ($b_chr gt $a_chr) { # a lags behind |
85 |
$anext=1; |
86 |
next; |
87 |
} |
88 |
elsif ($a_chr gt $b_chr) { # b lags behind |
89 |
goto NEXT_B; |
90 |
} |
91 |
# same chromosome: |
92 |
if ($b_start>$a_end) { # a lags behind |
93 |
$anext=1; |
94 |
next; |
95 |
} |
96 |
if ($a_start>$b_end) { # b lags behind |
97 |
goto NEXT_B; |
98 |
} |
99 |
# - hairpin overlap here, check for mature miRNA overlap |
100 |
if (abs($a_mstart-$b_mstart)<5 && abs($a_mend-$b_mend)<5) { # && |
101 |
#abs($a_start-$b_start)<7 && abs($a_end-$b_end)<7) { |
102 |
# common miRNA prediction here |
103 |
#print join("\t", $ta[0], @ta[5..8], join('|',@ta[10..12]), |
104 |
# $tb[0], @tb[5..8], join('|',@tb[10..12]), $ta[14])."\n"; |
105 |
my ($hstart, $hend, $mseq, $mcoords) = ($ta[6]>$tb[6]) ? (@ta[3..4], @ta[7..8]) : (@tb[3..4], @tb[7..8]); |
106 |
my ($mstart, $mend)=($mcoords=~m/m\|(\d+)\-(\d+)/); |
107 |
my $foldchanges=''; |
108 |
if ($ratio) { |
109 |
my ($v1,$v2)=($ta[6]*$ratio, $tb[6]); |
110 |
my $fc=($v1>$v2)? -($v1/$v2) : $v2/$v1; |
111 |
$foldchanges="\t".sprintf('%.2f',$fc)."\t".sprintf('%.2f',abs($fc)); |
112 |
|
113 |
} |
114 |
print join("\t", join('|',@ta[1..2], $hstart.'_'.$hend), $mseq, $mstart, $mend, @ta[5..6], $ta[10].'|'.$ta[11], $ta[12], |
115 |
@tb[5..6], $tb[10].'|'.$tb[11], $tb[12]).$foldchanges."\n"; |
116 |
|
117 |
$bd->[15]=1; |
118 |
$ad->[15]=1; |
119 |
} |
120 |
if ($b_end>$a_end) { |
121 |
$anext=1; |
122 |
next; |
123 |
} |
124 |
else { |
125 |
goto NEXT_B; |
126 |
} |
127 |
} |
128 |
|
129 |
# -- now also add those entries only expressed in one sample but not the other |
130 |
foreach my $d (@alines) { |
131 |
next if $d->[15]==1 || $d->[6]<15; |
132 |
my @t=@$d; |
133 |
my ($hstart, $hend, $mseq, $mcoords) = (@t[3..4], @t[7..8]); |
134 |
my ($mstart, $mend)=($mcoords=~m/m\|(\d+)\-(\d+)/); |
135 |
my $foldchanges=''; |
136 |
if ($ratio) { |
137 |
my ($v1,$v2)=($t[6]*$ratio, 1); |
138 |
my $fc=($v1>$v2)? -($v1/$v2) : $v2/$v1; |
139 |
$foldchanges="\t".sprintf('%.2f',$fc)."\t".sprintf('%.2f',abs($fc)); |
140 |
} |
141 |
|
142 |
print join("\t", join('|',@t[1..2], $hstart.'_'.$hend), $mseq, $mstart, $mend, @t[5..6], $t[10].'|'.$t[11], $t[12], |
143 |
'-','-', '-', '-' ).$foldchanges."\n"; |
144 |
|
145 |
#print join("\t", $t[0], $t[5], 'x'.$t[6], @t[7..8], join('|',@t[10..12]), |
146 |
# '-', '-','-','-','-', '-', $t[14])."\n"; |
147 |
} |
148 |
|
149 |
foreach my $d (@blines) { |
150 |
next if $d->[15]==1 || $d->[6]<15; |
151 |
my @t=@$d; |
152 |
my ($hstart, $hend, $mseq, $mcoords) = (@t[3..4], @t[7..8]); |
153 |
my ($mstart, $mend)=($mcoords=~m/m\|(\d+)\-(\d+)/); |
154 |
my $foldchanges=''; |
155 |
if ($ratio) { |
156 |
my ($v1,$v2)=(1*$ratio, $t[6]); |
157 |
my $fc=($v1>$v2)? -($v1/$v2) : $v2/$v1; |
158 |
$foldchanges="\t".sprintf('%.2f',$fc)."\t".sprintf('%.2f',abs($fc)); |
159 |
} |
160 |
print join("\t", join('|',@t[1..2], $hstart.'_'.$hend), $mseq, $mstart, $mend, '-','-', '-', '-', |
161 |
@t[5..6], $t[10].'|'.$t[11], $t[12]).$foldchanges."\n"; |
162 |
#print join("\t", $t[0], '-','-','-','-', '-', |
163 |
# $t[0], $t[5], 'x'.$t[6], @t[7..8], join('|',@t[10..12]), $t[14])."\n"; |
164 |
} |
165 |
|
166 |
# -- |
167 |
if ($outfile) { |
168 |
select(STDOUT); |
169 |
close(OUTF); |
170 |
} |
171 |
|
172 |
#************ Subroutines ************** |