1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
|
6 |
my $usage = q/Usage: |
7 |
mimap_asm.pl [-o <outfile>] [-s <minscore>] <sorted_bowtie.map> |
8 |
Takes sorted bowtie .map data from miRNA mappings and outputs "assembled" |
9 |
transcripts\/loci with depth of coverage information |
10 |
|
11 |
The bowtie input MUST have been sorted by chromosome and coordinate, |
12 |
with a command like this: |
13 |
sort -k3,3 -k4,4n bowtie_output.map > bowtie_sorted.map |
14 |
/; |
15 |
umask 0002; |
16 |
getopts('Ds:o:') || die($usage."\n"); |
17 |
my $outfile=$Getopt::Std::opt_o; |
18 |
my ($maxasmlen, $maxasm); |
19 |
my $debug=$Getopt::Std::opt_D; |
20 |
if ($outfile) { |
21 |
open(FOUT, '>'.$outfile) || die("Error creating output file $outfile\n"); |
22 |
select(FOUT); |
23 |
} |
24 |
my $minscore=$Getopt::Std::opt_s || 1.8; |
25 |
|
26 |
my ($locchr, $locstrand, $locstart, $locend, $loccov, $locseq); |
27 |
my @locreads; # list of [readname, chr_alnstart, chr_alnend, num_mismatches, strand] |
28 |
my $lcount=0; |
29 |
|
30 |
while (<>) { |
31 |
next if m/^#/; |
32 |
chomp; |
33 |
my ($read, $strand, $chr, $chrstart, $rseq, $rquals, $num, $mism)=split(); |
34 |
next unless $rseq && length($rseq)==length($rquals); |
35 |
$chrstart++; # because bowtie coordinate is 0-based |
36 |
my ($rcov)=($read=~m/_x(\d+)$/); |
37 |
my ($rtrimseq, $tstart, $tend, $num_mism)=mapTrim($strand, $chrstart, $rseq, $mism); |
38 |
# DEBUG display: |
39 |
my $debug_line; |
40 |
if ($debug) { |
41 |
my $lpad=' 'x($tstart-$chrstart); |
42 |
|
43 |
$debug_line="DBG: ".join("\t", $chr.$strand, $chrstart, $rseq, length($rseq), $mism." \t\t locpos=[$locstart,$locend]\n", |
44 |
' ',$tstart, $lpad.$rtrimseq, $tend, "($num_mism mismatches left) tpos=[$tstart,$tend]")."\n"; |
45 |
} |
46 |
if (($locchr ne $chr) || ($locstart==0) || ($tstart+12>$locend)) { |
47 |
#new chromosome, or different locus |
48 |
writeLoc() if $locchr; #flush previous locus, if any |
49 |
# now just create the new locus: |
50 |
($locchr, $locstrand, $locstart, $locend, $loccov, $locseq)= |
51 |
($chr, $strand, $tstart, $tend, $rcov, $rtrimseq); |
52 |
@locreads=([$read, $tstart, $tend, $num_mism, $strand]); |
53 |
if ($debug) { |
54 |
print STDOUT $debug_line; |
55 |
} |
56 |
next; |
57 |
} |
58 |
if ($debug) { |
59 |
print STDOUT $debug_line; |
60 |
#if ($locstrand && $locstrand ne $strand) { |
61 |
# print STDERR "Warning: merging opposite strand match ($read) into locus at $locchr:$locstart)\n"; |
62 |
# } |
63 |
} |
64 |
# -- merge into current locus: |
65 |
if ($tstart<$locstart) { |
66 |
# - shouldn't happen because the mappings are sorted ! |
67 |
#copy the part of $rtrimseq that's sticking out on the left |
68 |
$locseq=substr($rtrimseq, 0, $locstart-$tstart).$locseq; |
69 |
$locstart=$tstart; |
70 |
print STDOUT "DBG: pre: $locseq\n" if $debug; |
71 |
} |
72 |
if ($tend>$locend) { |
73 |
#append the part of rtrimseq that's sticking out on the right |
74 |
$locseq.=substr($rtrimseq, -($tend-$locend)); |
75 |
print STDOUT "DBG: post: $locseq\n" if $debug; |
76 |
$locend=$tend; |
77 |
} |
78 |
$loccov+=$rcov; |
79 |
push(@locreads, [$read, $tstart, $tend, $num_mism, $strand]); |
80 |
# -- merging into current locus |
81 |
} |
82 |
|
83 |
writeLoc(); |
84 |
|
85 |
print STDERR "Done. Maximum assembly length=$maxasmlen ($maxasm).\n"; |
86 |
|
87 |
# END of script here: |
88 |
if ($outfile) { |
89 |
select(STDOUT); |
90 |
close(FOUT); |
91 |
} |
92 |
|
93 |
#------- SUBROUTINES ----------- |
94 |
|
95 |
sub mapTrim { |
96 |
my ($strand, $cstart, $seq, $mism)=@_; |
97 |
my $len=length($seq); |
98 |
my $cend=$cstart+$len-1; |
99 |
return ($seq, $cstart, $cend, 0) unless $mism; |
100 |
my @b=unpack('(A)*',$seq); |
101 |
my @mm=map { $_=[ (m/^(\d+)\:(\w)\>(\w)/) ] } (split(/\,/, $mism)); |
102 |
my ($ltrimstart, $ltrimlen, $rtrimstart, $rtrimlen)=(-1,0,0,0); |
103 |
my $prevtrim=1000000; |
104 |
my $halflen = $len>>1; |
105 |
#mismatches are sorted |
106 |
@mm = reverse(@mm) if ($strand eq '-'); |
107 |
my @rmm; |
108 |
my ($rmmdel, $lmmdel); #trimmed mismatches |
109 |
foreach my $m (@mm) { |
110 |
#my ($x, $gn, $rn)=($m=~m/^(\d+)\:(\w)\>(\w)/); |
111 |
my ($x,$gn, $rn) = @$m; # $x values are always increasing |
112 |
$x=$len-$x-1 if ($strand eq '-'); # $ x values are always increasing |
113 |
die("Error: nucleotide correction failure at pos $x (len $len, strand $strand)!\n(seq=$seq with $mism, $rn != $b[$x])\n") |
114 |
unless $rn eq $b[$x]; |
115 |
$b[$x]=$gn; #restore the genomic sequence |
116 |
if ($x>$halflen) { |
117 |
unshift(@rmm, [$x, $gn, $rn]); |
118 |
next; |
119 |
} |
120 |
# -- trimming left end |
121 |
if ($x<2 && $ltrimlen==0) { $ltrimstart=$x; $lmmdel++; $ltrimlen++; $prevtrim=$x; next; } |
122 |
if (($x-$prevtrim)==1 || ($ltrimlen>1 && $x-$prevtrim==2)) { |
123 |
$ltrimlen+=($x-$prevtrim); |
124 |
$lmmdel++; |
125 |
$prevtrim=$x; |
126 |
} |
127 |
}#for each mismatch |
128 |
#try trimming the other end |
129 |
$prevtrim=1000000; |
130 |
foreach my $m (@rmm) { |
131 |
my ($x,$gn, $rn) = @$m; # $x values are always decreasing |
132 |
if ($x>$len-3 && $rtrimlen==0) { $rtrimstart=$x; $rmmdel++; $rtrimlen++; $prevtrim=$x; next; } |
133 |
if ($prevtrim-$x==1 || ($rtrimlen>1 && $prevtrim-$x==2)) { |
134 |
$rtrimlen+=($prevtrim-$x); |
135 |
$rmmdel++; |
136 |
$prevtrim=$x; |
137 |
} |
138 |
} |
139 |
|
140 |
# 0-based coordinates (within the read) of the "non-trimmed" part |
141 |
my $num_mism=@mm; |
142 |
my $tstart= ($ltrimstart==0 || $ltrimlen>1) ? $ltrimstart+$ltrimlen : 0 ; |
143 |
$num_mism-=$lmmdel if $tstart>0; |
144 |
my $tend = ($rtrimstart==$len-1 || $rtrimlen>1) ? $rtrimstart-$rtrimlen : $len-1 ; |
145 |
$num_mism-=$rmmdel if $tend<$len-1; |
146 |
return (join('',@b[$tstart .. $tend]), $cstart+$tstart, $cstart+$tend, $num_mism); |
147 |
} |
148 |
|
149 |
|
150 |
sub writeLoc { |
151 |
return unless $locchr && $locstart; |
152 |
$lcount++; |
153 |
my @rdata; |
154 |
my ($mapscore, $ftotal, $rtotal); |
155 |
#also recompute strand, in case we have both strand alignments |
156 |
foreach my $d (@locreads) { |
157 |
my ($read, $rmstart, $rmend, $rmism, $rstrand)=@$d; |
158 |
push(@rdata, $read.$rstrand.':'.($rmstart-$locstart).'-'.($rmend-$locstart)); |
159 |
my ($rcov)=($read=~m/_x(\d+)$/); |
160 |
my $rmscore=(($rmend-$rmstart-$rmism)*$rcov)/($rmend-$rmstart); |
161 |
if ($rstrand eq '-') { $rtotal+=$rmscore; } |
162 |
else { $ftotal+=$rmscore; } |
163 |
$mapscore+=$rmscore; |
164 |
} |
165 |
|
166 |
$locstrand = ($rtotal>$ftotal)?'-':'+'; |
167 |
$locseq = reverseComplement($locseq) if $locstrand eq '-'; |
168 |
#format: |
169 |
# |
170 |
# asm_id, chr, strand, chr_start, chr_end, coverage, mapping_score, annotation, sequence, reads |
171 |
my $nscore=$mapscore/$loccov; |
172 |
if ($mapscore>$minscore && (@rdata>1 || $nscore>0.96)) { |
173 |
print join("\t", sprintf('miasm_%06d', $lcount), $locchr, |
174 |
$locstrand, $locstart, $locend, $loccov.'x', sprintf('%.2f',$mapscore), '.', $locseq)."\t". |
175 |
join(',', @rdata)."\n"; |
176 |
if (length($locseq)>$maxasmlen) { |
177 |
$maxasmlen=length($locseq); |
178 |
$maxasm=sprintf('miasm_%06d', $lcount); |
179 |
} |
180 |
} |
181 |
#DEBUG: |
182 |
print "-----------------------------------------\n" if $debug; |
183 |
@locreads=(); |
184 |
($locchr, $locstrand, $locstart, $locend, $locseq, $loccov)=(); |
185 |
} |
186 |
|
187 |
sub reverseComplement { |
188 |
my $s=reverse($_[0]); |
189 |
$s =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/; |
190 |
return $s; |
191 |
} |