ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/mimap_asm.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (13 years, 1 month ago) by gpertea
File size: 6660 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 mimap_asm.pl [-o <outfile>] [-s <minscore>] <sorted_bowtie.map>
8 Takes sorted bowtie .map data from miRNA mappings and outputs "assembled"
9 transcripts\/loci with depth of coverage information
10
11 The bowtie input MUST have been sorted by chromosome and coordinate,
12 with a command like this:
13 sort -k3,3 -k4,4n bowtie_output.map > bowtie_sorted.map
14 /;
15 umask 0002;
16 getopts('Ds:o:') || die($usage."\n");
17 my $outfile=$Getopt::Std::opt_o;
18 my ($maxasmlen, $maxasm);
19 my $debug=$Getopt::Std::opt_D;
20 if ($outfile) {
21 open(FOUT, '>'.$outfile) || die("Error creating output file $outfile\n");
22 select(FOUT);
23 }
24 my $minscore=$Getopt::Std::opt_s || 1.8;
25
26 my ($locchr, $locstrand, $locstart, $locend, $loccov, $locseq);
27 my @locreads; # list of [readname, chr_alnstart, chr_alnend, num_mismatches, strand]
28 my $lcount=0;
29
30 while (<>) {
31 next if m/^#/;
32 chomp;
33 my ($read, $strand, $chr, $chrstart, $rseq, $rquals, $num, $mism)=split();
34 next unless $rseq && length($rseq)==length($rquals);
35 $chrstart++; # because bowtie coordinate is 0-based
36 my ($rcov)=($read=~m/_x(\d+)$/);
37 my ($rtrimseq, $tstart, $tend, $num_mism)=mapTrim($strand, $chrstart, $rseq, $mism);
38 # DEBUG display:
39 my $debug_line;
40 if ($debug) {
41 my $lpad=' 'x($tstart-$chrstart);
42
43 $debug_line="DBG: ".join("\t", $chr.$strand, $chrstart, $rseq, length($rseq), $mism." \t\t locpos=[$locstart,$locend]\n",
44 ' ',$tstart, $lpad.$rtrimseq, $tend, "($num_mism mismatches left) tpos=[$tstart,$tend]")."\n";
45 }
46 if (($locchr ne $chr) || ($locstart==0) || ($tstart+12>$locend)) {
47 #new chromosome, or different locus
48 writeLoc() if $locchr; #flush previous locus, if any
49 # now just create the new locus:
50 ($locchr, $locstrand, $locstart, $locend, $loccov, $locseq)=
51 ($chr, $strand, $tstart, $tend, $rcov, $rtrimseq);
52 @locreads=([$read, $tstart, $tend, $num_mism, $strand]);
53 if ($debug) {
54 print STDOUT $debug_line;
55 }
56 next;
57 }
58 if ($debug) {
59 print STDOUT $debug_line;
60 #if ($locstrand && $locstrand ne $strand) {
61 # print STDERR "Warning: merging opposite strand match ($read) into locus at $locchr:$locstart)\n";
62 # }
63 }
64 # -- merge into current locus:
65 if ($tstart<$locstart) {
66 # - shouldn't happen because the mappings are sorted !
67 #copy the part of $rtrimseq that's sticking out on the left
68 $locseq=substr($rtrimseq, 0, $locstart-$tstart).$locseq;
69 $locstart=$tstart;
70 print STDOUT "DBG: pre: $locseq\n" if $debug;
71 }
72 if ($tend>$locend) {
73 #append the part of rtrimseq that's sticking out on the right
74 $locseq.=substr($rtrimseq, -($tend-$locend));
75 print STDOUT "DBG: post: $locseq\n" if $debug;
76 $locend=$tend;
77 }
78 $loccov+=$rcov;
79 push(@locreads, [$read, $tstart, $tend, $num_mism, $strand]);
80 # -- merging into current locus
81 }
82
83 writeLoc();
84
85 print STDERR "Done. Maximum assembly length=$maxasmlen ($maxasm).\n";
86
87 # END of script here:
88 if ($outfile) {
89 select(STDOUT);
90 close(FOUT);
91 }
92
93 #------- SUBROUTINES -----------
94
95 sub mapTrim {
96 my ($strand, $cstart, $seq, $mism)=@_;
97 my $len=length($seq);
98 my $cend=$cstart+$len-1;
99 return ($seq, $cstart, $cend, 0) unless $mism;
100 my @b=unpack('(A)*',$seq);
101 my @mm=map { $_=[ (m/^(\d+)\:(\w)\>(\w)/) ] } (split(/\,/, $mism));
102 my ($ltrimstart, $ltrimlen, $rtrimstart, $rtrimlen)=(-1,0,0,0);
103 my $prevtrim=1000000;
104 my $halflen = $len>>1;
105 #mismatches are sorted
106 @mm = reverse(@mm) if ($strand eq '-');
107 my @rmm;
108 my ($rmmdel, $lmmdel); #trimmed mismatches
109 foreach my $m (@mm) {
110 #my ($x, $gn, $rn)=($m=~m/^(\d+)\:(\w)\>(\w)/);
111 my ($x,$gn, $rn) = @$m; # $x values are always increasing
112 $x=$len-$x-1 if ($strand eq '-'); # $ x values are always increasing
113 die("Error: nucleotide correction failure at pos $x (len $len, strand $strand)!\n(seq=$seq with $mism, $rn != $b[$x])\n")
114 unless $rn eq $b[$x];
115 $b[$x]=$gn; #restore the genomic sequence
116 if ($x>$halflen) {
117 unshift(@rmm, [$x, $gn, $rn]);
118 next;
119 }
120 # -- trimming left end
121 if ($x<2 && $ltrimlen==0) { $ltrimstart=$x; $lmmdel++; $ltrimlen++; $prevtrim=$x; next; }
122 if (($x-$prevtrim)==1 || ($ltrimlen>1 && $x-$prevtrim==2)) {
123 $ltrimlen+=($x-$prevtrim);
124 $lmmdel++;
125 $prevtrim=$x;
126 }
127 }#for each mismatch
128 #try trimming the other end
129 $prevtrim=1000000;
130 foreach my $m (@rmm) {
131 my ($x,$gn, $rn) = @$m; # $x values are always decreasing
132 if ($x>$len-3 && $rtrimlen==0) { $rtrimstart=$x; $rmmdel++; $rtrimlen++; $prevtrim=$x; next; }
133 if ($prevtrim-$x==1 || ($rtrimlen>1 && $prevtrim-$x==2)) {
134 $rtrimlen+=($prevtrim-$x);
135 $rmmdel++;
136 $prevtrim=$x;
137 }
138 }
139
140 # 0-based coordinates (within the read) of the "non-trimmed" part
141 my $num_mism=@mm;
142 my $tstart= ($ltrimstart==0 || $ltrimlen>1) ? $ltrimstart+$ltrimlen : 0 ;
143 $num_mism-=$lmmdel if $tstart>0;
144 my $tend = ($rtrimstart==$len-1 || $rtrimlen>1) ? $rtrimstart-$rtrimlen : $len-1 ;
145 $num_mism-=$rmmdel if $tend<$len-1;
146 return (join('',@b[$tstart .. $tend]), $cstart+$tstart, $cstart+$tend, $num_mism);
147 }
148
149
150 sub writeLoc {
151 return unless $locchr && $locstart;
152 $lcount++;
153 my @rdata;
154 my ($mapscore, $ftotal, $rtotal);
155 #also recompute strand, in case we have both strand alignments
156 foreach my $d (@locreads) {
157 my ($read, $rmstart, $rmend, $rmism, $rstrand)=@$d;
158 push(@rdata, $read.$rstrand.':'.($rmstart-$locstart).'-'.($rmend-$locstart));
159 my ($rcov)=($read=~m/_x(\d+)$/);
160 my $rmscore=(($rmend-$rmstart-$rmism)*$rcov)/($rmend-$rmstart);
161 if ($rstrand eq '-') { $rtotal+=$rmscore; }
162 else { $ftotal+=$rmscore; }
163 $mapscore+=$rmscore;
164 }
165
166 $locstrand = ($rtotal>$ftotal)?'-':'+';
167 $locseq = reverseComplement($locseq) if $locstrand eq '-';
168 #format:
169 #
170 # asm_id, chr, strand, chr_start, chr_end, coverage, mapping_score, annotation, sequence, reads
171 my $nscore=$mapscore/$loccov;
172 if ($mapscore>$minscore && (@rdata>1 || $nscore>0.96)) {
173 print join("\t", sprintf('miasm_%06d', $lcount), $locchr,
174 $locstrand, $locstart, $locend, $loccov.'x', sprintf('%.2f',$mapscore), '.', $locseq)."\t".
175 join(',', @rdata)."\n";
176 if (length($locseq)>$maxasmlen) {
177 $maxasmlen=length($locseq);
178 $maxasm=sprintf('miasm_%06d', $lcount);
179 }
180 }
181 #DEBUG:
182 print "-----------------------------------------\n" if $debug;
183 @locreads=();
184 ($locchr, $locstrand, $locstart, $locend, $locseq, $loccov)=();
185 }
186
187 sub reverseComplement {
188 my $s=reverse($_[0]);
189 $s =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
190 return $s;
191 }

Properties

Name Value
svn:executable *