ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/segment_juncs.cpp
Revision: 239
Committed: Mon May 14 21:01:08 2012 UTC (12 years, 3 months ago) by gpertea
File size: 161538 byte(s)
Log Message:
wip prep_reads.cpp, tophat

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * segment_juncs.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <iostream>
22 #include <fstream>
23 #include <sstream>
24 #include <cstring>
25 #include <bitset>
26 #include <seqan/basic.h>
27 #include <seqan/sequence.h>
28 #include <seqan/find.h>
29 #include <seqan/file.h>
30 #include <seqan/modifier.h>
31 #include <seqan/align.h>
32 #include <seqan/graph_align.h>
33 #include <getopt.h>
34
35 #include <boost/thread.hpp>
36
37 #include "common.h"
38 #include "utils.h"
39 #include "bwt_map.h"
40 #include "tokenize.h"
41 #include "segments.h"
42 #include "reads.h"
43 #include "junctions.h"
44 #include "insertions.h"
45 #include "deletions.h"
46 #include "fusions.h"
47
48 using namespace seqan;
49 using namespace std;
50 using namespace __gnu_cxx;
51
52 // daehwan //geo
53 //#define B_DEBUG 1
54
55 void print_usage()
56 {
57 fprintf(stderr, "Usage: segment_juncs <ref.fa> <segment.juncs> <segment.insertions> <segment.deletions> <left_reads.fq> <left_reads.bwtout> <left_seg1.bwtout,...,segN.bwtout> [right_reads.fq right_reads.bwtout right_seg1.bwtout,...,right_segN.bwtout]\n");
58 }
59
60 // This is the maximum number of bowtie mismatches allower per segment hit
61 static const int num_bowtie_mismatches = 2;
62 static const int max_cov_juncs = 5000000;
63 // static const int max_cov_juncs = std::numeric_limits<int>::max();
64 static const int max_seg_juncs = 10000000;
65
66 int max_microexon_stretch = 2000;
67 int butterfly_overhang = 6;
68 int min_cov_length = 20;
69
70 void get_seqs(istream& ref_stream,
71 RefSequenceTable& rt,
72 bool keep_seqs = true)
73 {
74 while(ref_stream.good() &&
75 !ref_stream.eof())
76 {
77 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
78 string name;
79 readMeta(ref_stream, name, Fasta());
80 string::size_type space_pos = name.find_first_of(" \t\r");
81 if (space_pos != string::npos)
82 {
83 name.resize(space_pos);
84 }
85 fprintf(stderr, "\tLoading %s...", name.c_str());
86 seqan::read(ref_stream, *ref_str, Fasta());
87 fprintf(stderr, "done\n");
88 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
89 if (!keep_seqs)
90 delete ref_str;
91 }
92 }
93
94 RefSeg seg_from_bowtie_hit(const BowtieHit& T)
95 {
96 RefSeg r_seg(T.ref_id(), POINT_DIR_DONTCARE, T.antisense_align(), READ_DONTCARE, 0, 0);
97
98 if (T.antisense_align())
99 {
100 r_seg.left = max(0, T.right() - 2);
101 r_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches
102 r_seg.points_where = POINT_DIR_RIGHT;
103 }
104 else
105 {
106 r_seg.left = max(0, T.left() - (T.right() - T.left() + 1));
107 r_seg.right = T.left() + 2; // num allowed bowtie mismatches
108 r_seg.points_where = POINT_DIR_LEFT;
109 }
110
111 return r_seg;
112 }
113
114 pair<RefSeg, RefSeg> segs_from_bowtie_hits(const BowtieHit& T,
115 const BowtieHit& H)
116 {
117 pair<RefSeg, RefSeg> seg_pair;
118 if (H.antisense_align() == false &&
119 abs((H.right() + 1) - T.left()) < (int)max_segment_intron_length)
120 {
121 RefSeg left_seg(H.ref_id(), POINT_DIR_RIGHT, H.antisense_align(), READ_DONTCARE, 0, 0);
122 left_seg.left = max(0, H.right() - 2);
123 left_seg.right = H.right() + (H.right() - H.left() + 1); // num allowed bowtie mismatches
124
125 RefSeg right_seg(T.ref_id(), POINT_DIR_LEFT, T.antisense_align(), READ_DONTCARE, 0, 0);
126 right_seg.left = max(0, T.left() - (T.right() - T.left() + 1));
127 right_seg.right = T.left() + 2; // num allowed bowtie mismatches
128
129 seg_pair = make_pair(left_seg, right_seg);
130 }
131 else if (H.antisense_align() == true &&
132 abs((T.right() + 1) - H.left()) < (int)max_segment_intron_length)
133 {
134 RefSeg left_seg(T.ref_id(), POINT_DIR_RIGHT, T.antisense_align(), READ_DONTCARE, 0, 0);
135 left_seg.left = max(0, T.right() - 2);
136 left_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches
137
138 RefSeg right_seg(H.ref_id(), POINT_DIR_LEFT, H.antisense_align(), READ_DONTCARE, 0, 0);
139 right_seg.left = max(0, H.left() - (H.right() - H.left() + 1));
140 right_seg.right = H.left() + 2; // num allowed bowtie mismatches
141
142 seg_pair = make_pair(left_seg, right_seg);
143 }
144 return seg_pair;
145 }
146
147 //static const size_t half_splice_mer_len = 6;
148 //static const size_t splice_mer_len = 2 * half_splice_mer_len;
149
150 struct MerExtension
151 {
152 static const int MAX_EXTENSION_BP = 14;
153 uint32_t left_dna_str : 28; // up to 14bp encoded in 2-bits-per-base
154 uint8_t left_ext_len : 4; // how many bases in this extension
155
156 uint32_t right_dna_str : 28; // up to 14bp encoded in 2-bits-per-base
157 uint8_t right_ext_len : 4; // how many bases in this extension
158
159 MerExtension() : left_dna_str(0), left_ext_len(0), right_dna_str(0), right_ext_len(0) {}
160
161 bool operator<(const MerExtension& rhs) const
162 {
163 if (left_dna_str != rhs.left_dna_str)
164 return left_dna_str < rhs.left_dna_str;
165 if (left_ext_len != rhs.left_ext_len)
166 return left_ext_len < rhs.left_ext_len;
167 if (right_dna_str != rhs.right_dna_str)
168 return right_dna_str < rhs.right_dna_str;
169 if (right_ext_len != rhs.right_ext_len)
170 return right_ext_len < rhs.right_ext_len;
171 return false;
172 }
173
174 bool operator==(const MerExtension& rhs) const
175 {
176 bool eq = left_dna_str == rhs.left_dna_str &&
177 right_dna_str == rhs.right_dna_str &&
178 left_ext_len == rhs.left_ext_len &&
179 right_ext_len == rhs.right_ext_len;
180
181 return eq;
182 }
183 };
184
185
186 /// For converting from ASCII to the Dna5 code where A=0, C=1, G=2,
187 /// T=3, N=4
188 uint8_t charToDna5[] = {
189 /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
191 /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
192 /* 48 */ 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
193 /* 64 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
194 /* A C G N */
195 /* 80 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
196 /* T */
197 /* 96 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
198 /* a c g n */
199 /* 112 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 /* t */
201 /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
203 /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
205 /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
206 /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
207 /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208 /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
209 };
210
211
212 typedef vector<vector<MerExtension> > MerExtensionTable;
213 typedef vector<uint32_t> MerExtensionCounts;
214
215 MerExtensionTable extensions;
216 MerExtensionCounts extension_counts;
217
218 uint64_t dna5str_to_idx(const string& str)
219 {
220 uint64_t idx = 0;
221
222 for (size_t i = 0; i < str.length(); ++i)
223 {
224 idx <<=2;
225 char c = toupper(str[i]);
226 idx |= (0x3 & charToDna5[(size_t)c]);
227 }
228 return idx;
229 }
230
231 uint64_t colorstr_to_idx(const string& str)
232 {
233 uint64_t idx = 0;
234
235 for (size_t i = 0; i < str.length(); ++i)
236 {
237 idx <<=2;
238 char c = str[i];
239 idx |= (0x3 & charToDna5[(size_t)c]);
240 }
241 return idx;
242 }
243
244 void store_read_extensions(MerExtensionTable& ext_table,
245 int seq_key_len,
246 int min_ext_len,
247 const string& seq,
248 bool use_precount_table)
249 {
250 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
251 // this read.
252
253 uint64_t seed = 0;
254 bitset<256> left = 0;
255 bitset<256> right = 0;
256 const char* p = seq.c_str();
257
258 unsigned int seq_len = (int)seq.length();
259 const char* seq_end = p + seq_len;
260
261 // Build the first seed
262 while (p < seq.c_str() + (2 * seq_key_len))
263 {
264 seed <<= 2;
265 seed |= (0x3 & charToDna5[(size_t)*p]);
266 ++p;
267 }
268
269 // Build the rest of them with a sliding window, adding successive bases
270 // to the "right-remainder" word.
271 while (p < seq_end)
272 {
273 right <<= 2;
274 right |= (0x3 & charToDna5[(size_t)*p]);
275 ++p;
276 }
277
278 // This loop will construct successive seed, along with 32-bit words
279 // containing the left and right remainders for each seed
280 uint32_t i = 0;
281 size_t new_hits = 0;
282 do
283 {
284 // How many base pairs exist in the right remainder beyond what we have
285 // space for ?
286 int extra_right_bp = ((int)seq.length() -
287 (i + 2 * seq_key_len)) - MerExtension::MAX_EXTENSION_BP;
288
289 uint32_t hit_right = 0;
290 if (extra_right_bp > 0)
291 {
292 //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1));
293 hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong();
294 }
295 else
296 {
297 hit_right = (uint32_t)right.to_ulong();
298 }
299
300 uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong();
301
302 //size_t prev_cap = (*mer_table)[seed].capacity();
303 //(*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement));
304 //cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit);
305
306 MerExtension ext;
307
308 ext.right_dna_str = hit_right;
309 ext.right_ext_len = min(seq_len - (2 * seq_key_len) - i,
310 (unsigned int)MerExtension::MAX_EXTENSION_BP);
311
312 ext.left_dna_str = hit_left;
313 ext.left_ext_len = min(i, (unsigned int)MerExtension::MAX_EXTENSION_BP);
314 if (use_precount_table)
315 {
316 int curr_seed = --extension_counts[seed];
317 if (curr_seed < 0 || curr_seed > (int)ext_table[seed].size())
318 {
319 fprintf(stderr, "Error: curr_seed is %d, max is %lu\n", curr_seed, (long unsigned int)ext_table[seed].size());
320 }
321
322 ext_table[seed][curr_seed] = ext;
323 }
324 else
325 {
326 ext_table[seed].push_back(ext);
327 }
328 new_hits++;
329
330 // Take the leftmost base of the seed and stick it into bp
331 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
332
333 // Move that base down to the least significant bits of bp
334 bp >>= ((seq_key_len << 2) - 2);
335
336 // And tack it onto the left remainder of the read
337 left <<= 2;
338 left |= bp;
339
340 // Now take the leftmost base of the right remainder and stick it into
341 // the rightmost position of the seed
342 uint32_t right_len = seq_len - (i + seq_key_len * 2);
343 //bp = right & (0x3uLL << ((right_len - 1) << 1));
344
345 seed <<= 2;
346 //cout << right << endl;
347 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
348 //cout <<tmp_right << endl;
349 seed |= tmp_right.to_ulong();
350 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
351
352
353 //Now remove that leftmost base of the right remainder
354 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
355 if (right_len)
356 {
357 right.set(((right_len - 1) << 1), 0);
358 right.set(((right_len - 1) << 1) + 1, 0);
359 }
360 ++i;
361
362 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
363 return;
364 }
365
366 void count_read_extensions(MerExtensionCounts& ext_counts,
367 int seq_key_len,
368 int min_ext_len,
369 const string& seq)
370 {
371 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
372 // this read.
373
374 uint64_t seed = 0;
375 bitset<256> left = 0;
376 bitset<256> right = 0;
377 const char* p = seq.c_str();
378
379 unsigned int seq_len = (int)seq.length();
380 const char* seq_end = p + seq_len;
381
382 // Build the first seed
383 while (p < seq.c_str() + (2 * seq_key_len))
384 {
385 seed <<= 2;
386 seed |= (0x3 & charToDna5[(size_t)*p]);
387 ++p;
388 }
389
390 // Build the rest of them with a sliding window, adding successive bases
391 // to the "right-remainder" word.
392 while (p < seq_end)
393 {
394 right <<= 2;
395 right |= (0x3 & charToDna5[(size_t)*p]);
396 ++p;
397 }
398
399 // This loop will construct successive seed, along with 32-bit words
400 // containing the left and right remainders for each seed
401 uint32_t i = 0;
402 size_t new_hits = 0;
403 do
404 {
405 ext_counts[seed]++;
406
407 new_hits++;
408
409 // Take the leftmost base of the seed and stick it into bp
410 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
411
412 // Move that base down to the least significant bits of bp
413 bp >>= ((seq_key_len << 2) - 2);
414
415 // And tack it onto the left remainder of the read
416 left <<= 2;
417 left |= bp;
418
419 // Now take the leftmost base of the right remainder and stick it into
420 // the rightmost position of the seed
421 uint32_t right_len = seq_len - (i + seq_key_len * 2);
422 //bp = right & (0x3uLL << ((right_len - 1) << 1));
423
424 seed <<= 2;
425 //cout << right << endl;
426 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
427 //cout <<tmp_right << endl;
428 seed |= tmp_right.to_ulong();
429 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
430
431
432 //Now remove that leftmost base of the right remainder
433 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
434 if (right_len)
435 {
436 right.set(((right_len - 1) << 1), 0);
437 right.set(((right_len - 1) << 1) + 1, 0);
438 }
439 ++i;
440
441 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
442 return;
443 }
444
445 //void count_read_mers(FILE* reads_file, size_t half_splice_mer_len)
446 void count_read_mers(string& reads_file, size_t half_splice_mer_len)
447 {
448 Read read;
449 size_t splice_mer_len = 2 * half_splice_mer_len;
450 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
451 extension_counts.resize(mer_table_size);
452 ReadStream readstream(reads_file);
453 //FLineReader fr(reads_file);
454 //while(!feof(reads_file))
455 while (readstream.get_direct(read, reads_format)) {
456 /*
457 while (!fr.isEof())
458 {
459 read.clear();
460
461 // Get the next read from the file
462 if (!next_fasta_record(fr, read.name, read.seq, reads_format))
463 break;
464 if (reads_format == FASTQ)
465 {
466 if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
467 break;
468 }
469 */
470 if (color && !readstream.isBam())
471 // erase the primer and the adjacent color
472 read.seq.erase(0, 2);
473
474 if (read.seq.size() > 32)
475 read.seq.resize(32);
476
477 count_read_extensions(extension_counts,
478 half_splice_mer_len,
479 half_splice_mer_len,
480 read.seq);
481 }
482
483 //reads_file.rewind();
484 }
485
486 void compact_extension_table()
487 {
488 for (size_t i = 0; i < extensions.size(); ++i)
489 {
490 vector<MerExtension>& exts = extensions[i];
491 sort(exts.begin(), exts.end());
492 vector<MerExtension>::iterator new_end = unique(exts.begin(), exts.end());
493 exts.erase(new_end, exts.end());
494 vector<MerExtension>(exts).swap(exts);
495 }
496 }
497
498 void prune_extension_table(uint8_t max_extension_bp)
499 {
500 uint32_t mask = ~(0xFFFFFFFFuLL << (max_extension_bp << 1));
501
502 for (size_t i = 0; i < extensions.size(); ++i)
503 {
504 vector<MerExtension>& exts = extensions[i];
505 for (size_t j = 0; j < exts.size(); ++j)
506 {
507 MerExtension& ex = exts[j];
508 if (ex.left_ext_len > max_extension_bp)
509 {
510 ex.left_ext_len = max_extension_bp;
511 ex.left_dna_str &= mask;
512 }
513
514 if (ex.right_ext_len > max_extension_bp)
515 {
516 ex.right_dna_str >>= ((ex.right_ext_len - max_extension_bp) << 1);
517 ex.right_ext_len = max_extension_bp;
518 }
519 }
520 }
521 }
522
523 //void store_read_mers(FILE* reads_file, size_t half_splice_mer_len)
524 void store_read_mers(string& reads_file, size_t half_splice_mer_len)
525 {
526 Read read;
527 size_t splice_mer_len = 2 * half_splice_mer_len;
528
529 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
530 extensions.resize(mer_table_size);
531
532 size_t num_indexed_reads = 0;
533 ReadStream readstream(reads_file);
534 while (readstream.get_direct(read, reads_format)) {
535 /*
536 FLineReader fr(reads_file);
537 //while(!feof(reads_file))
538 while(!fr.isEof())
539 {
540 read.clear();
541
542 // Get the next read from the file
543 if (!next_fasta_record(fr, read.name, read.seq, reads_format))
544 break;
545 if (reads_format == FASTQ)
546 {
547 if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
548 break;
549 }
550 */
551 if (color && !readstream.isBam())
552 // erase the primer and the adjacent color
553 read.seq.erase(0, 2);
554
555 if (read.seq.size() > 32)
556 read.seq.resize(32);
557
558 store_read_extensions(extensions,
559 half_splice_mer_len,
560 half_splice_mer_len,
561 read.seq,
562 true);
563
564 // Do NOT index the reverse of the reads
565
566 num_indexed_reads++;
567 if (num_indexed_reads % 1000000 == 0)
568 {
569 //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads);
570 //compact_extension_table();
571 }
572 }
573
574 //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads)
575 uint64_t num_extensions = 0;
576 for (size_t i = 0; i < extensions.size(); ++i)
577 {
578 num_extensions += extensions[i].size();
579 }
580 //fprintf (stderr, "Total extensions: %lu\n", (long unsigned int)num_extensions);
581 //reads_file.rewind();
582 }
583
584 //void index_read_mers(vector<FILE*> reads_files,
585 void index_read_mers(vector<string>& reads_files,
586 size_t half_splice_mer_len)
587 {
588 extensions.clear();
589 for (size_t i = 0; i < reads_files.size(); ++i)
590 {
591 count_read_mers(reads_files[i], half_splice_mer_len);
592 }
593
594 extensions.resize(extension_counts.size());
595
596 for (size_t i = 0; i < extension_counts.size(); ++i)
597 {
598 extensions[i].resize(extension_counts[i]);
599 }
600
601 for (size_t i = 0; i < reads_files.size(); ++i)
602 {
603 store_read_mers(reads_files[i], half_splice_mer_len);
604 }
605
606 compact_extension_table();
607 }
608
609 /** Returns the number of characters in strings w1 and w2 that match,
610 * starting from right to left
611 */
612 int get_matching_chars(uint32_t w1, uint32_t w2)
613 {
614 //find the least significant mismatching bit between w1 and w2
615 int mismatch_bit = ffs(w1 ^ w2);
616
617 // If there is no mismatching bit, the words are equal
618 if (!mismatch_bit)
619 return -1;
620
621 // Given the mismatching bit, determine where the mismatching base is
622 mismatch_bit -= 1;
623 mismatch_bit -= ((mismatch_bit) & 1);
624 mismatch_bit >>= 1;
625
626 // Return the number of matching characters.
627 return mismatch_bit;
628 }
629
630 /**
631 * Computes the Hamming distance between two 16bp words, up to a specified
632 * maximum number of mismatches.
633 */
634 uint32_t mismatching_bases(uint32_t w1_word,
635 uint32_t w2_word,
636 int len,
637 uint32_t max_mis)
638 {
639 uint32_t diffs = 0;
640
641 int shift = 0;
642 int L = len;
643 uint32_t misses = 0;
644
645 // While we haven't yet exceeded the maximum allowable mismatches,
646 // and there are still unaligned bases, keep shift-anding
647 while (shift < len && misses <= max_mis)
648 {
649 int match_chars = 0;
650
651 // Get the number of characters matching on the right sides of
652 // both words
653 match_chars = get_matching_chars(w1_word, w2_word);
654
655 // If they are equal for this shift, we are done,
656 // the loop will stop at the next iteration
657 if (match_chars == -1 || match_chars >= len)
658 {
659 match_chars = len;
660 shift = len;
661 }
662 else
663 {
664 // If there is a mismatch in the remaining words
665 // decide how much to shift by and try again
666 match_chars = min(len, match_chars);
667 int shift_chars = (match_chars + 1);
668
669 L -= shift_chars;
670
671 int shift_bits = shift_chars << 1;
672
673 // Shift right past the matching part and the first mismatching base
674 w1_word >>= (shift_bits);
675 w2_word >>= (shift_bits);
676
677 shift += shift_chars;
678 diffs++;
679 misses++;
680 }
681 }
682
683 return diffs;
684 }
685
686 uint64_t rc_dna_str(uint64_t dna_str)
687 {
688 dna_str = ~dna_str;
689 uint64_t rc = 0;
690 for (int i = 0; i < 32; i++)
691 {
692 rc <<= 2;
693 rc |= (dna_str & 0x3);
694 dna_str >>= 2;
695 }
696 return rc;
697 }
698
699 uint64_t rc_color_str(uint64_t color_str)
700 {
701 uint64_t rc = 0;
702 for (int i = 0; i < 32; ++i)
703 {
704 rc <<= 2;
705 rc |= (color_str & 0x3);
706 color_str >>= 2;
707 }
708 return rc;
709 }
710
711 struct DnaSpliceStrings
712 {
713 DnaSpliceStrings(uint64_t f, uint64_t r) : fwd_string(f), rev_string(r), first_in_string('N'), last_in_string('N') {}
714 uint64_t fwd_string;
715 uint64_t rev_string;
716
717 // for color-space purposes
718 char first_in_string;
719 char last_in_string;
720
721 bool operator<(const DnaSpliceStrings& rhs) const
722 {
723 if (fwd_string != rhs.fwd_string)
724 return fwd_string < rhs.fwd_string;
725 if (rev_string != rhs.rev_string)
726 return rev_string < rhs.rev_string;
727 return false;
728 }
729
730 bool operator==(const DnaSpliceStrings& rhs) const
731 {
732 return fwd_string == rhs.fwd_string && rev_string == rhs.rev_string;
733 }
734 };
735
736 struct IntronMotifs
737 {
738 IntronMotifs(uint32_t rid) : ref_id(rid) {}
739 uint32_t ref_id;
740
741 vector<pair<size_t, DnaSpliceStrings> > fwd_donors;
742 vector<pair<size_t, DnaSpliceStrings> > fwd_acceptors;
743 vector<pair<size_t, DnaSpliceStrings> > rev_donors;
744 vector<pair<size_t, DnaSpliceStrings> > rev_acceptors;
745
746 void unique(vector<pair<size_t, DnaSpliceStrings> >& f)
747 {
748 sort(f.begin(), f.end());
749 vector<pair<size_t, DnaSpliceStrings> >::iterator i = std::unique(f.begin(), f.end());
750 f.erase(i, f.end());
751 }
752
753 void unique()
754 {
755 unique(fwd_donors);
756 unique(fwd_acceptors);
757 unique(rev_donors);
758 unique(rev_acceptors);
759 }
760
761 void attach_mers(RefSequenceTable::Sequence& ref_str)
762 {
763 attach_upstream_mers(ref_str, fwd_donors);
764 attach_upstream_mers(ref_str, rev_acceptors);
765
766 attach_downstream_mers(ref_str, rev_donors);
767 attach_downstream_mers(ref_str, fwd_acceptors);
768 }
769
770 void attach_upstream_mers(RefSequenceTable::Sequence& ref_str,
771 vector<pair<size_t, DnaSpliceStrings> >& dinucs)
772 {
773 for (size_t i = 0; i < dinucs.size(); ++i)
774 {
775 size_t pos = dinucs[i].first;
776 int half_splice_mer_len = 32;
777
778 if (color)
779 {
780 if (pos <= (size_t)half_splice_mer_len+1 || pos >= length(ref_str))
781 continue;
782
783 Dna5String seg_str = seqan::infixWithLength(ref_str,
784 pos - half_splice_mer_len - 1,
785 half_splice_mer_len + 1);
786 stringstream ss(stringstream::in | stringstream::out);
787 string s;
788 ss << seg_str;
789 ss >> s;
790
791 string col_seg_str = convert_bp_to_color(s, true);
792 uint64_t idx = colorstr_to_idx(col_seg_str);
793
794 dinucs[i].second.fwd_string = idx;
795 dinucs[i].second.rev_string = rc_color_str(idx);
796 dinucs[i].second.first_in_string = s[1];
797 dinucs[i].second.last_in_string = s[half_splice_mer_len];
798 }
799 else
800 {
801 if (pos <= (size_t)half_splice_mer_len || pos >= length(ref_str))
802 continue;
803
804 Dna5String seg_str = seqan::infixWithLength(ref_str,
805 pos - half_splice_mer_len,
806 half_splice_mer_len);
807
808 stringstream ss(stringstream::in | stringstream::out);
809 string s;
810 ss << seg_str;
811 ss >> s;
812 uint64_t idx = dna5str_to_idx(s);
813 dinucs[i].second.fwd_string = idx;
814 dinucs[i].second.rev_string = rc_dna_str(idx);
815 }
816 }
817 }
818
819
820 void attach_downstream_mers(RefSequenceTable::Sequence& ref_str,
821 vector<pair<size_t, DnaSpliceStrings> >& dinucs)
822 {
823 for (size_t i = 0; i < dinucs.size(); ++i)
824 {
825 size_t pos = dinucs[i].first;
826
827 int half_splice_mer_len = 32;
828
829 if (pos + 2 + half_splice_mer_len >= length(ref_str))
830 continue;
831
832 if (color)
833 {
834 Dna5String seg_str = seqan::infixWithLength(ref_str,
835 pos + 2 - 1,
836 half_splice_mer_len + 1);
837 stringstream ss(stringstream::in | stringstream::out);
838 string s;
839 ss << seg_str;
840 ss >> s;
841
842 string col_seg_str = convert_bp_to_color(s, true);
843 uint64_t idx = colorstr_to_idx(col_seg_str);
844
845 dinucs[i].second.fwd_string = idx;
846 dinucs[i].second.rev_string = rc_color_str(idx);
847 dinucs[i].second.first_in_string = s[1];
848 dinucs[i].second.last_in_string = s[half_splice_mer_len];
849 }
850 else
851 {
852 Dna5String seg_str = seqan::infixWithLength(ref_str,
853 pos + 2,
854 half_splice_mer_len);
855
856 stringstream ss(stringstream::in | stringstream::out);
857 string s;
858 ss << seg_str;
859 ss >> s;
860 uint64_t idx = dna5str_to_idx(s);
861
862 dinucs[i].second.fwd_string = idx;
863 dinucs[i].second.rev_string = rc_dna_str(idx);
864 }
865 }
866 }
867 };
868
869 struct PackedSplice
870 {
871 PackedSplice() : left(0u), seed(0u), right(0u) {}
872 PackedSplice(uint32_t l, uint64_t s, uint32_t r) : left(l), seed(s), right(r) {}
873 uint32_t left;
874 uint64_t seed;
875 uint32_t right;
876 };
877
878 // The second element of these pairs is the left (or right) side of a splice
879 // seed from a possible junction. The first element is the sequence flanking
880 // that seed
881 typedef pair<uint32_t, uint32_t> PackedSpliceHalf;
882
883 static inline std::string u32ToDna(uint32_t a, int len) {
884 char buf[17];
885 assert(len <= 16);
886 for(int i = 0; i < len; i++) {
887 buf[len-i-1] = "ACGT"[a & 3];
888 a >>= 2;
889 }
890 buf[len] = '\0';
891 return std::string(buf);
892 }
893
894
895 PackedSpliceHalf pack_left_splice_half(const string& seq,
896 uint32_t pos_in_l,
897 unsigned int seq_key_len)
898 {
899 const char* l = seq.c_str();
900 l += pos_in_l;
901
902 const char* left_end = l;
903 l -= 16;
904
905 assert (l + seq_key_len < seq.c_str() + seq.length());
906
907 PackedSpliceHalf packed_half = make_pair(0u,0u);
908
909 // Pack up to 32 bits (16 bases) of sequence into left
910 if (l < seq.c_str())
911 l = seq.c_str();
912 while (l < left_end)
913 {
914 packed_half.first <<= 2;
915 packed_half.first |= (0x3 & charToDna5[(size_t)*l]);
916 ++l;
917 }
918
919 // Pack up the seed bits
920 for (unsigned int i = 0; i < seq_key_len; ++i)
921 {
922 packed_half.second <<= 2;
923 packed_half.second |= (0x3 & charToDna5[(size_t)*(l + i)]);
924 }
925 return packed_half;
926 }
927
928
929 PackedSpliceHalf pack_right_splice_half(const string& seq,
930 uint32_t pos,
931 unsigned int seq_key_len)
932 {
933 const char* r = seq.c_str();
934 r += pos - seq_key_len;
935
936 PackedSpliceHalf packed_half = make_pair(0u,0u);
937
938 // Pack the seed bits
939 for (unsigned int i = 0; i < seq_key_len; ++i)
940 {
941 packed_half.second <<= 2;
942 packed_half.second |= (0x3 & charToDna5[(size_t)*(r + i)]);
943 }
944
945 r += seq_key_len;
946 // Now pack 32 bits (16 bases) of sequence into left
947 const char* right_end = r + 16;
948
949 if ((size_t)(right_end - seq.c_str()) > seq.length())
950 right_end = seq.c_str() + seq.length();
951 while (r < right_end)
952 {
953 packed_half.first <<= 2;
954 packed_half.first |= (0x3 & charToDna5[(size_t)*r]);
955 ++r;
956 }
957
958 return packed_half;
959 }
960
961 PackedSplice combine_splice_halves(const PackedSpliceHalf& left_half,
962 const PackedSpliceHalf& right_half,
963 int seq_key_len)
964 {
965 uint64_t seed = left_half.second << (seq_key_len << 1) | right_half.second;
966 return PackedSplice(left_half.first,seed, right_half.first);
967 }
968
969 PackedSplice pack_splice(const string& seq,
970 int l_pos_in_seq,
971 int r_pos_in_seq,
972 unsigned int seq_key_len)
973 {
974 const char* l = seq.c_str(); // l points to beginning of left exon sequence
975 l += l_pos_in_seq;
976
977 assert (l + seq_key_len < seq.c_str() + seq.length());
978
979 const char* r = seq.c_str(); // r points to beginning of right exon sequence
980 r += r_pos_in_seq - seq_key_len;
981 //r += 2; // r points to right side of junction;
982
983 uint64_t seed = 0;
984 uint64_t left = 0;
985 uint64_t right = 0;
986
987 // Pack up the seed bits
988 for (unsigned int i = 0; i < seq_key_len; ++i)
989 {
990 seed <<= 2;
991 seed |= (0x3 & charToDna5[(size_t)*(l + i)]);
992 }
993
994 for (unsigned int i = 0; i < seq_key_len; ++i)
995 {
996 seed <<= 2;
997 seed |= (0x3 & charToDna5[(size_t)*(r + i)]);
998 }
999
1000 // Now pack 32 bits (16 bases) of sequence into left
1001 const char* left_end = l;
1002 l -= 16;
1003 if (l < seq.c_str())
1004 l = seq.c_str();
1005 while (l < left_end)
1006 {
1007 left <<= 2;
1008 left |= (0x3 & charToDna5[(size_t)*l]);
1009 ++l;
1010 }
1011
1012 r += seq_key_len;
1013 // Now pack 32 bits (16 bases) of sequence into left
1014 const char* right_end = r + 16;
1015
1016 if ((size_t)(right_end - seq.c_str()) > seq.length())
1017 right_end = seq.c_str() + seq.length();
1018 while (r < right_end)
1019 {
1020 right <<= 2;
1021 right |= (0x3 & charToDna5[(size_t)*r]);
1022 ++r;
1023 }
1024
1025 return PackedSplice((uint32_t)left,seed,(uint32_t)right);
1026 }
1027
1028
1029
1030 /* Represents a hit between a splice seed and a read. */
1031 // TODO: consider packing pos and meta into a single 32-bit int.
1032 struct ReadHit
1033 {
1034 ReadHit(uint32_t l, uint32_t r, uint32_t p, uint32_t m, bool rc)
1035 : left(l), right(r), pos(p), meta(m), reverse_complement(rc) {}
1036 uint32_t left; // 2-bits per base rep of the left remainder
1037 uint32_t right; //2-bits per base rep of the right remainder
1038 uint32_t pos; // position of the seed within the read
1039 uint32_t meta : 31;
1040 bool reverse_complement : 1;
1041 } __attribute__((packed));
1042
1043 // A MerTable maps k-mers to hits in indexed reads. See the comment for
1044 // mer_table
1045 typedef vector<ReadHit> ReadHitList;
1046 typedef vector<ReadHitList> MerTable;
1047
1048 size_t index_read(MerTable* mer_table,
1049 int seq_key_len,
1050 const string& seq,
1051 unsigned int read_num,
1052 bool reverse_complement,
1053 vector<uint64_t>& seeds)
1054 {
1055 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
1056 // this read.
1057
1058 uint64_t seed = 0;
1059 bitset<256> left = 0;
1060 bitset<256> right = 0;
1061 const char* p = seq.c_str();
1062
1063 unsigned int seq_len = (int)seq.length();
1064 const char* seq_end = p + seq_len;
1065
1066 // Build the first seed
1067 while (p < seq.c_str() + (2 * seq_key_len))
1068 {
1069 seed <<= 2;
1070 seed |= (0x3 & charToDna5[(size_t)*p]);
1071 ++p;
1072 }
1073
1074 seeds.push_back(seed);
1075
1076 // Build the rest of them with a sliding window, adding successive bases
1077 // to the "right-remainder" word.
1078 while (p < seq_end)
1079 {
1080
1081 right <<= 2;
1082 right |= (0x3 & charToDna5[(size_t)*p]);
1083 ++p;
1084 }
1085
1086 size_t cap_increase = 0;
1087
1088 // At this point, seed contains the 5'-most 2*min_anchor_len bases of the
1089 // read, and right contains everthing else on the 3' end.
1090
1091 // This loop will construct successive seed, along with 32-bit words
1092 // containing the left and right remainders for each seed
1093 uint32_t i = 0;
1094 size_t new_hits = 0;
1095 do
1096 {
1097 // Let's not make an out-of-bounds write, if this fails the global
1098 // mer_table is too small
1099 assert (!mer_table || seed < mer_table->size());
1100
1101 // How many base pairs exist in the right remainder beyond what we have
1102 // space for ?
1103 int extra_right_bp = ((int)seq.length() - (i + 2 * seq_key_len)) - 16;
1104
1105 uint32_t hit_right;
1106 if (extra_right_bp > 0)
1107 {
1108 //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1));
1109 hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong();
1110 }
1111 else
1112 {
1113 hit_right = (uint32_t)right.to_ulong();
1114 }
1115
1116 uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong();
1117
1118 if (mer_table)
1119 {
1120 size_t prev_cap = (*mer_table)[seed].capacity();
1121 (*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement));
1122 cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit);
1123 }
1124 new_hits++;
1125
1126 // Take the leftmost base of the seed and stick it into bp
1127 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
1128
1129 // Move that base down to the least significant bits of bp
1130 bp >>= ((seq_key_len << 2) - 2);
1131
1132 // And tack it onto the left remainder of the read
1133 left <<= 2;
1134 left |= bp;
1135
1136 // Now take the leftmost base of the right remainder and stick it into
1137 // the rightmost position of the seed
1138 uint32_t right_len = seq_len - (i + seq_key_len * 2);
1139 //bp = right & (0x3uLL << ((right_len - 1) << 1));
1140
1141 seed <<= 2;
1142 //cout << right << endl;
1143 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
1144 //cout <<tmp_right << endl;
1145 seed |= tmp_right.to_ulong();
1146 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
1147
1148 seeds.push_back(seed);
1149
1150 //Now remove that leftmost base of the right remainder
1151 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
1152 if (right_len)
1153 {
1154 right.set(((right_len - 1) << 1), 0);
1155 right.set(((right_len - 1) << 1) + 1, 0);
1156 }
1157 ++i;
1158
1159 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
1160 return cap_increase;
1161 }
1162
1163
1164
1165 struct SeedExtension
1166 {
1167 SeedExtension(int lp,
1168 int rp,
1169 int rd_p,
1170 int le,
1171 int re,
1172 int mm) :
1173 l_pos_in_ref(lp),
1174 r_pos_in_ref(rp),
1175 read_pos(rd_p),
1176 left_extent(le),
1177 right_extent(re),
1178 mismatches(mm){}
1179
1180 int l_pos_in_ref;
1181 int r_pos_in_ref;
1182 int read_pos;
1183 int left_extent;
1184 int right_extent;
1185 int mismatches;
1186 };
1187
1188 pair<string::size_type, int> left_extend(const string& ref,
1189 const string& read,
1190 int ref_pos,
1191 int read_pos,
1192 int num_mismatches)
1193 {
1194 string::size_type ext = 0;
1195 int mm_encountered = 0;
1196 while(ref_pos >= 0 &&
1197 read_pos >= 0)
1198 {
1199 //char ref_char = ref[ref_pos];
1200 //char read_char = read[read_pos];
1201 if (ref[ref_pos] != read[read_pos])
1202 {
1203 if (mm_encountered + 1 > num_mismatches)
1204 return make_pair(ext, mm_encountered);
1205 mm_encountered++;
1206 }
1207 ext++;
1208
1209 --ref_pos;
1210 --read_pos;
1211 }
1212
1213 return make_pair(ext, mm_encountered);
1214 }
1215
1216 pair<string::size_type, int> right_extend(const string& ref,
1217 const string& read,
1218 int ref_pos,
1219 int read_pos,
1220 int num_mismatches)
1221 {
1222 string::size_type ext = 0;
1223 int mm_encountered = 0;
1224 while(ref_pos < (int)ref.size() &&
1225 read_pos < (int)read.size())
1226 {
1227 if (ref[ref_pos] != read[read_pos])
1228 {
1229 if (mm_encountered + 1 > num_mismatches)
1230 return make_pair(ext, mm_encountered);
1231 mm_encountered++;
1232 }
1233
1234 ext++;
1235
1236 ++ref_pos;
1237 ++read_pos;
1238 }
1239
1240 return make_pair(ext, mm_encountered);
1241 }
1242
1243
1244 void extend_from_seeds(vector<SeedExtension>& extensions,
1245 const PackedSplice& p,
1246 const MerTable& mer_table,
1247 const string& ref,
1248 const string& read,
1249 size_t l_pos_in_ref,
1250 size_t r_pos_in_ref,
1251 int seq_key_len)
1252 {
1253 assert(p.seed < mer_table.size());
1254 const ReadHitList& hl = mer_table[p.seed];
1255
1256 for (size_t hit = 0; hit < hl.size(); ++hit)
1257 {
1258 const ReadHit& rh = hl[hit];
1259 uint32_t pos = rh.pos;
1260
1261 pair<string::size_type, int> left_extension;
1262 pair<string::size_type, int> right_extension;
1263 left_extension = left_extend(ref, read, l_pos_in_ref - seq_key_len + 1, pos, 2);
1264 right_extension = right_extend(ref, read, r_pos_in_ref + seq_key_len, pos + 2 * seq_key_len, 2);
1265 extensions.push_back(SeedExtension(l_pos_in_ref,
1266 r_pos_in_ref,
1267 pos + seq_key_len,
1268 left_extension.first,
1269 right_extension.first,
1270 left_extension.second + right_extension.second));
1271
1272 }
1273 }
1274
1275 typedef pair<size_t, PackedSpliceHalf> SpliceHalf;
1276
1277 void get_seed_extensions(const string& ref,
1278 const string& read,
1279 int seq_key_len,
1280 MerTable& mer_table,
1281 vector<SpliceHalf>& donors,
1282 vector<SpliceHalf>& acceptors,
1283 vector<SeedExtension>& extensions)
1284 {
1285 for (size_t d = 0; d < donors.size(); ++d)
1286 {
1287 bool broke_out = false;
1288
1289 // start pos is a lower bound on downstream acceptor positions
1290 // to consider
1291 size_t start_pos = donors[d].first + min_report_intron_length;
1292
1293 SpliceHalf dummy = make_pair(start_pos,PackedSpliceHalf());
1294 vector<SpliceHalf>::iterator lb = upper_bound(acceptors.begin(),
1295 acceptors.end(),
1296 dummy);
1297
1298 if (lb == acceptors.end())
1299 break;
1300 for (size_t a = lb - acceptors.begin();
1301 a < acceptors.size();
1302 ++a)
1303 {
1304 if (acceptors[a].first - donors[d].first > (size_t)max_microexon_stretch)
1305 {
1306 broke_out = true;
1307 break;
1308 }
1309
1310 size_t l_pos_in_ref = donors[d].first - 1;
1311 size_t r_pos_in_ref = acceptors[a].first + 2;
1312 PackedSplice p = combine_splice_halves(donors[d].second,
1313 acceptors[a].second,
1314 seq_key_len);
1315 extend_from_seeds(extensions,
1316 p,
1317 mer_table,
1318 ref,
1319 read,
1320 l_pos_in_ref,
1321 r_pos_in_ref,
1322 seq_key_len);
1323 }
1324 if (broke_out)
1325 continue;
1326 }
1327
1328 }
1329
1330 void hits_from_seed_extension(uint32_t ref_id,
1331 int ref_offset,
1332 uint64_t insert_id,
1333 bool antisense,
1334 vector<SeedExtension>& extensions,
1335 vector<BowtieHit>& hits_out,
1336 int left_read_edge,
1337 int right_read_edge,
1338 int seq_key_len)
1339 {
1340 for (size_t i = 0; i < extensions.size(); ++i)
1341 {
1342 SeedExtension& s = extensions[i];
1343 if (s.read_pos >= right_read_edge ||
1344 s.read_pos < left_read_edge)
1345 continue;
1346 if (s.read_pos - seq_key_len - s.left_extent <= left_read_edge &&
1347 s.read_pos + seq_key_len + s.right_extent >= right_read_edge && s.mismatches <= 2 )
1348 {
1349 vector<CigarOp> cigar;
1350 int off_adjust;
1351 if (antisense)
1352 {
1353 CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge);
1354 CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref);
1355 CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos);
1356 cigar.push_back(m1);
1357 cigar.push_back(skip);
1358 cigar.push_back(m2);
1359 off_adjust = m1.length;
1360 }
1361 else
1362 {
1363 CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge + 1);
1364 CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref);
1365 CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos - 1);
1366 cigar.push_back(m1);
1367 cigar.push_back(skip);
1368 cigar.push_back(m2);
1369 off_adjust = m1.length;
1370 }
1371 // daehwan - check this
1372 bool end = false;
1373 BowtieHit bh(ref_id,
1374 ref_id,
1375 insert_id,
1376 ref_offset + s.l_pos_in_ref - off_adjust + 1,
1377 cigar,
1378 antisense,
1379 false,
1380 s.mismatches,
1381 0,
1382 end);
1383 hits_out.push_back(bh);
1384 }
1385 }
1386 }
1387
1388 void align(uint32_t ref_id,
1389 uint64_t insert_id,
1390 bool antisense,
1391 const string& ref,
1392 const string& read,
1393 int ref_offset,
1394 int left_read_edge,
1395 int right_read_edge,
1396 MerTable& mer_table,
1397 int seq_key_len,
1398 vector<BowtieHit>& hits_out)
1399 {
1400 // Reserve an entry for each k-mer we might see
1401 size_t mer_table_size = 1 << ((seq_key_len << 1)<<1);
1402
1403 mer_table.resize(mer_table_size);
1404
1405 vector<uint64_t> seeds;
1406 index_read(&mer_table, seq_key_len, read, 0, antisense, seeds);
1407
1408 vector<SpliceHalf> forward_donors;
1409 vector<SpliceHalf> forward_acceptors;
1410 vector<SpliceHalf> reverse_donors;
1411 vector<SpliceHalf> reverse_acceptors;
1412
1413 const string& seq = ref;
1414
1415 unsigned int pos = 0;
1416
1417
1418 for (size_t z = seq_key_len + 1; z < seq.length() - seq_key_len - 2; ++z)
1419 {
1420 char l = seq[z - 1];
1421 char r = seq[z];
1422 if (l == 'G' && r == 'T')
1423 {
1424 size_t donor_pos = pos + z - 1;
1425 size_t s = donor_pos - seq_key_len;
1426 PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len);
1427 forward_donors.push_back(make_pair(donor_pos,p));
1428 }
1429 if (l == 'A' && r == 'G')
1430 {
1431 size_t acceptor_pos = pos + z - 1;
1432 size_t s = acceptor_pos + 2 + seq_key_len;
1433 PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len);
1434 forward_acceptors.push_back(make_pair(acceptor_pos,p));
1435 }
1436 if (l == 'C' && r == 'T')
1437 {
1438 size_t acceptor_pos = pos + z - 1;
1439 size_t s = acceptor_pos - seq_key_len;
1440 PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len);
1441 reverse_acceptors.push_back(make_pair(pos + z - 1,p));
1442 }
1443 if (l == 'A' && r == 'C')
1444 {
1445 size_t donor_pos = pos + z - 1;
1446 size_t s = donor_pos + 2 + seq_key_len;
1447 PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len);
1448 reverse_donors.push_back(make_pair(donor_pos,p));
1449 }
1450 }
1451
1452 vector<SeedExtension> fwd_extensions;
1453 get_seed_extensions(seq,
1454 read,
1455 seq_key_len,
1456 mer_table,
1457 forward_donors,
1458 forward_acceptors,
1459 fwd_extensions);
1460
1461 hits_from_seed_extension(ref_id,
1462 ref_offset,
1463 insert_id,
1464 antisense,
1465 fwd_extensions,
1466 hits_out,
1467 left_read_edge,
1468 right_read_edge,
1469 seq_key_len);
1470
1471 //fprintf(stderr, "Found %d seed hits\n", fwd_extensions.size());
1472
1473 vector<SeedExtension> rev_extensions;
1474 get_seed_extensions(seq,
1475 read,
1476 seq_key_len,
1477 mer_table,
1478 reverse_donors,
1479 reverse_acceptors,
1480 rev_extensions);
1481
1482 hits_from_seed_extension(ref_id,
1483 ref_offset,
1484 insert_id,
1485 antisense,
1486 rev_extensions,
1487 hits_out,
1488 left_read_edge,
1489 right_read_edge,
1490 seq_key_len);
1491
1492 for (size_t i = 0; i < seeds.size(); ++i)
1493 mer_table[seeds[i]].clear();
1494 }
1495
1496 int extension_mismatches = 0;
1497
1498
1499 bool left_extendable_junction(uint64_t upstream_dna_str,
1500 size_t key,
1501 size_t splice_mer_len,
1502 size_t min_ext_len)
1503 {
1504 vector<MerExtension>& exts = extensions[key];
1505 for (size_t i = 0; i < exts.size(); ++i)
1506 {
1507 const MerExtension& ext = exts[i];
1508 if (ext.left_ext_len < min_ext_len)
1509 continue;
1510 uint64_t upstream = upstream_dna_str & ~(0xFFFFFFFFFFFFFFFFull << (ext.left_ext_len << 1));
1511 int mism = mismatching_bases(ext.left_dna_str, upstream, ext.left_ext_len, extension_mismatches);
1512 if (mism <= extension_mismatches)
1513 return true;
1514 }
1515 return false;
1516 }
1517
1518 bool right_extendable_junction(uint64_t downstream_dna_str,
1519 size_t key,
1520 size_t splice_mer_len,
1521 size_t min_ext_len)
1522 {
1523 vector<MerExtension>& exts = extensions[key];
1524 for (size_t i = 0; i < exts.size(); ++i)
1525 {
1526 const MerExtension& ext = exts[i];
1527 if (ext.right_ext_len < min_ext_len)
1528 continue;
1529 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull >> (ext.right_ext_len << 1));
1530 uint64_t downstream = downstream_dna_str & mask;
1531 downstream >>= ((32 - ext.right_ext_len) << 1);
1532 int mism = mismatching_bases(ext.right_dna_str, downstream, ext.right_ext_len, extension_mismatches);
1533
1534 if (mism <= extension_mismatches)
1535 return true;
1536 }
1537 return false;
1538 }
1539
1540 uint32_t junction_key(uint64_t upstream_dna_str,
1541 uint64_t downstream_dna_str,
1542 size_t splice_mer_len)
1543 {
1544 uint64_t upstream_mask = ~(0xFFFFFFFFFFFFFFFFull << (splice_mer_len));
1545 uint64_t upstream_key_half = upstream_dna_str & upstream_mask;
1546 uint64_t downstream_mask = ~(0xFFFFFFFFFFFFFFFFull >> (splice_mer_len));
1547 uint64_t downstream_key_half = (downstream_dna_str & downstream_mask) >> (64 - splice_mer_len);
1548 uint32_t key = ((uint32_t)upstream_key_half << splice_mer_len) | (uint32_t)downstream_key_half;
1549 return key;
1550 }
1551
1552 bool extendable_junction(uint64_t upstream_dna_str,
1553 uint64_t downstream_dna_str,
1554 size_t splice_mer_len,
1555 size_t min_ext_len,
1556 bool reverse,
1557 char last_in_upstream = 'N',
1558 char first_in_downstream = 'N')
1559 {
1560 if (color)
1561 {
1562 string two_bp; two_bp.push_back(last_in_upstream); two_bp.push_back(first_in_downstream);
1563 string color = convert_bp_to_color(two_bp, true);
1564 char num = (color[0] - '0') & 0x3;
1565
1566 if (reverse)
1567 {
1568 upstream_dna_str = (upstream_dna_str >> 2) << 2;
1569 upstream_dna_str |= (uint64_t)num;
1570 }
1571 else
1572 {
1573 downstream_dna_str = (downstream_dna_str << 2) >> 2;
1574 downstream_dna_str |= ((uint64_t)num << 62);
1575 }
1576 }
1577
1578 uint32_t key = junction_key(upstream_dna_str,
1579 downstream_dna_str,
1580 splice_mer_len);
1581
1582 upstream_dna_str >>= splice_mer_len;
1583 downstream_dna_str <<= splice_mer_len;
1584
1585 bool extendable = (left_extendable_junction(upstream_dna_str,
1586 key, splice_mer_len, min_ext_len) ||
1587 right_extendable_junction(downstream_dna_str,
1588 key, splice_mer_len, min_ext_len));
1589 return extendable;
1590 }
1591
1592 typedef std::set<Junction, skip_count_lt> PotentialJuncs;
1593
1594 struct RecordExtendableJuncs
1595 {
1596 void record(uint32_t ref_id,
1597 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1598 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1599 bool antisense,
1600 PotentialJuncs& juncs,
1601 int min_intron,
1602 int max_intron,
1603 size_t max_juncs,
1604 size_t half_splice_mer_len)
1605 {
1606
1607 size_t splice_mer_len = 2 * half_splice_mer_len;
1608
1609 size_t curr_R = 0;
1610 for (size_t L = 0; L < left_sites.size(); ++L)
1611 {
1612 while (curr_R < right_sites.size() &&
1613 right_sites[curr_R].first < left_sites[L].first + min_intron)
1614 {
1615 curr_R++;
1616 }
1617
1618 size_t left_pos = left_sites[L].first;
1619 size_t max_right_pos = left_pos + max_intron;
1620 for (size_t R = curr_R;
1621 R < right_sites.size() && right_sites[R].first < max_right_pos; ++R)
1622 {
1623 uint64_t upstream_dna_str = left_sites[L].second.fwd_string;
1624 char last_in_upstream = left_sites[L].second.last_in_string;
1625 uint64_t downstream_dna_str = right_sites[R].second.fwd_string;
1626 char first_in_downstream = right_sites[R].second.first_in_string;
1627 uint64_t rc_upstream_dna_str = left_sites[L].second.rev_string;
1628 uint64_t rc_downstream_dna_str = right_sites[R].second.rev_string;
1629
1630 if (extendable_junction(upstream_dna_str,
1631 downstream_dna_str, splice_mer_len, 7, false,
1632 last_in_upstream, first_in_downstream) ||
1633 extendable_junction(rc_downstream_dna_str,
1634 rc_upstream_dna_str, splice_mer_len, 7, true,
1635 last_in_upstream, first_in_downstream))
1636 {
1637 juncs.insert(Junction(ref_id,
1638 left_sites[L].first - 1,
1639 right_sites[R].first + 2,
1640 antisense,
1641 R - curr_R));
1642 }
1643 if (juncs.size() > max_juncs)
1644 juncs.erase(*(juncs.rbegin()));
1645 }
1646 }
1647 }
1648 };
1649
1650 struct RecordAllJuncs
1651 {
1652 void record(uint32_t ref_id,
1653 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1654 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1655 bool antisense,
1656 PotentialJuncs& juncs,
1657 int min_intron,
1658 int max_intron,
1659 size_t max_juncs,
1660 size_t half_splice_mer_len)
1661 {
1662 size_t curr_R = 0;
1663 for (size_t L = 0; L < left_sites.size(); ++L)
1664 {
1665 while (curr_R < right_sites.size() &&
1666 right_sites[curr_R].first < left_sites[L].first + min_intron)
1667 {
1668 curr_R++;
1669 }
1670
1671 size_t left_pos = left_sites[L].first;
1672 size_t max_right_pos = left_pos + max_intron;
1673 for (size_t R = curr_R;
1674 R < right_sites.size() && right_sites[R].first < max_right_pos; ++R)
1675 {
1676 Junction j(ref_id,
1677 left_sites[L].first - 1,
1678 right_sites[R].first + 2,
1679 antisense,
1680 R - curr_R);
1681
1682 juncs.insert(j);
1683
1684 if (juncs.size() > max_juncs)
1685 juncs.erase(*(juncs.rbegin()));
1686 }
1687 }
1688 }
1689 };
1690
1691 struct RecordSegmentJuncs
1692 {
1693 void record(uint32_t ref_id,
1694 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1695 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1696 bool antisense,
1697 PotentialJuncs& juncs,
1698 int min_intron,
1699 int max_intron,
1700 size_t max_juncs,
1701 size_t half_splice_mer_len)
1702 {
1703 if (left_sites.size() != right_sites.size())
1704 return;
1705
1706 for (size_t i = 0; i < left_sites.size(); ++i)
1707 {
1708 Junction j(ref_id,
1709 left_sites[i].first - 1,
1710 right_sites[i].first + 2,
1711 antisense);
1712
1713 juncs.insert(j);
1714 if (juncs.size() > max_juncs)
1715 juncs.erase(*(juncs.rbegin()));
1716 }
1717 }
1718 };
1719
1720 struct ButterflyKey
1721 {
1722 uint32_t pos;
1723 uint32_t key;
1724
1725 ButterflyKey(uint32_t p, uint32_t k) : pos(p), key(k) {}
1726
1727 bool operator<(const ButterflyKey& rhs) const
1728 {
1729 if (key != rhs.key)
1730 return key < rhs.key;
1731 if (pos != rhs.pos)
1732 return pos < rhs.pos;
1733 return false;
1734 }
1735
1736 bool operator==(const ButterflyKey& rhs) const
1737 {
1738 return pos == rhs.pos && key == rhs.key;
1739 }
1740 };
1741
1742 uint32_t get_left_butterfly_key(uint64_t upstream_key,
1743 const MerExtension& ext,
1744 size_t half_splice_mer_len)
1745 {
1746 uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - half_splice_mer_len) << 1);
1747 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1));
1748 uint64_t top_half = upstream_key & mask;
1749 key |= (top_half << (half_splice_mer_len << 1));
1750 return (uint32_t)key;
1751 }
1752
1753 uint32_t get_right_butterfly_key(uint64_t downstream_key,
1754 const MerExtension& ext,
1755 size_t half_splice_mer_len)
1756 {
1757 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1));
1758 uint64_t key = (ext.left_dna_str & mask) << (half_splice_mer_len << 1);
1759 uint64_t bottom_half = (downstream_key >> (half_splice_mer_len << 1));
1760 key |= bottom_half;
1761 return (uint32_t)key;
1762 }
1763
1764 struct RecordButterflyJuncs
1765 {
1766 void record(uint32_t ref_id,
1767 const vector<pair<size_t, DnaSpliceStrings> >& all_left_sites,
1768 const vector<pair<size_t, DnaSpliceStrings> >& all_right_sites,
1769 bool antisense,
1770 PotentialJuncs& juncs,
1771 int min_intron,
1772 int max_intron,
1773 size_t max_juncs,
1774 size_t half_splice_mer_len)
1775 {
1776 size_t key_length = 2 * half_splice_mer_len;
1777 size_t extension_length = butterfly_overhang;
1778 uint64_t bottom_bit_mask = ~(0xFFFFFFFFFFFFFFFFull << (key_length<<1));
1779 uint64_t top_bit_mask = ~(0xFFFFFFFFFFFFFFFFull >> (key_length<<1));
1780
1781 if (all_left_sites.empty() || all_right_sites.empty())
1782 return;
1783
1784 size_t last_site = max(all_left_sites.back().first,
1785 all_right_sites.back().first);
1786
1787 size_t curr_left_site = 0;
1788 size_t curr_right_site = 0;
1789 for (size_t window_left_edge = 0;
1790 window_left_edge < last_site;
1791 window_left_edge += max_intron)
1792 {
1793 //fprintf(stderr, "\twindow %lu - %lu\n", window_left_edge, window_left_edge + 2 * max_intron);
1794 vector<pair<size_t, DnaSpliceStrings> > left_sites;
1795 vector<pair<size_t, DnaSpliceStrings> > right_sites;
1796
1797 while(curr_left_site < all_left_sites.size() &&
1798 all_left_sites[curr_left_site].first < window_left_edge)
1799 {
1800 curr_left_site++;
1801 }
1802
1803 while(curr_right_site < all_right_sites.size() &&
1804 all_right_sites[curr_right_site].first < window_left_edge)
1805 {
1806 curr_right_site++;
1807 }
1808
1809 for (size_t ls = curr_left_site; ls < all_left_sites.size(); ++ls)
1810 {
1811 if (all_left_sites[ls].first < window_left_edge + 2 * max_intron)
1812 {
1813 left_sites.push_back(all_left_sites[ls]);
1814 }
1815 }
1816
1817 for (size_t rs = curr_right_site; rs < all_right_sites.size(); ++rs)
1818 {
1819 if (all_right_sites[rs].first < window_left_edge + 2 * max_intron)
1820 {
1821 right_sites.push_back(all_right_sites[rs]);
1822 }
1823 }
1824
1825 vector<ButterflyKey> left_keys;
1826 for (size_t L = 0; L < left_sites.size(); ++L)
1827 {
1828 uint64_t fwd_upstream_dna_str = left_sites[L].second.fwd_string;
1829 uint64_t fwd_upstream_key = fwd_upstream_dna_str & bottom_bit_mask;
1830
1831 assert (fwd_upstream_key < extensions.size());
1832
1833 vector<MerExtension>& fwd_exts = extensions[fwd_upstream_key];
1834 for (size_t i = 0; i < fwd_exts.size(); ++i)
1835 {
1836 const MerExtension& ext = fwd_exts[i];
1837 if (ext.right_ext_len < extension_length)
1838 continue;
1839
1840 /*
1841 < f_u_key ><ext.right>
1842 NNNNNNNNNN GT
1843 */
1844
1845 // take the top bits of the right extension
1846 uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - extension_length) << 1);
1847
1848 // and the bottom bits of the site key
1849 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1850 uint64_t top_half = fwd_upstream_key & mask;
1851
1852 // and cat them together
1853 key |= (top_half << (extension_length << 1));
1854 left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key));
1855 }
1856
1857 uint64_t rev_upstream_dna_str = left_sites[L].second.rev_string;
1858 uint64_t rev_upstream_key = (rev_upstream_dna_str & top_bit_mask) >> (64 - (key_length<<1));
1859
1860 assert (rev_upstream_key < extensions.size());
1861
1862 vector<MerExtension>& rev_exts = extensions[rev_upstream_key];
1863 for (size_t i = 0; i < rev_exts.size(); ++i)
1864 {
1865 const MerExtension& ext = rev_exts[i];
1866 if (ext.left_ext_len < extension_length)
1867 continue;
1868
1869 /*
1870 < r_u_key ><ext.left>
1871 NNNNNNNNNN GT
1872 */
1873
1874 // reverse complement the left extension, and we will need
1875 // what were the bottom bits. these become the top bits in the
1876 // rc.
1877 uint64_t ext_str = color ? rc_color_str(ext.left_dna_str) : rc_dna_str(ext.left_dna_str);
1878 ext_str >>= 64 - (ext.left_ext_len << 1);
1879
1880 // now take the top bits of the rc, make them the bottom of
1881 // the key
1882 uint64_t key = ext_str >> ((ext.left_ext_len - extension_length) << 1);
1883
1884 // now add in the seed key bottom bits, making them the top of
1885 // the key
1886 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1887 uint64_t top_half = fwd_upstream_key & mask;
1888 key |= (top_half << (extension_length << 1));
1889 left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key));
1890 }
1891 }
1892 sort (left_keys.begin(), left_keys.end());
1893 vector<ButterflyKey>::iterator new_end = unique(left_keys.begin(), left_keys.end());
1894 left_keys.erase(new_end, left_keys.end());
1895
1896 vector<ButterflyKey> right_keys;
1897 for (size_t R = 0; R < right_sites.size(); ++R)
1898 {
1899 uint64_t fwd_downstream_dna_str = right_sites[R].second.fwd_string;
1900 uint64_t fwd_downstream_key = (fwd_downstream_dna_str & top_bit_mask) >> (64 - (key_length<<1));
1901
1902 assert (fwd_downstream_key < extensions.size());
1903
1904 vector<uint64_t> fwd_downstream_keys;
1905 if (color)
1906 {
1907 for(size_t color_value = 0; color_value < 4; ++color_value)
1908 {
1909 uint64_t tmp_key = (fwd_downstream_key << 2) >> 2 | (color_value << ((key_length - 1) << 1));
1910 fwd_downstream_keys.push_back(tmp_key);
1911 }
1912 }
1913 else
1914 {
1915 fwd_downstream_keys.push_back(fwd_downstream_key);
1916 }
1917
1918 for(size_t key = 0; key < fwd_downstream_keys.size(); ++key)
1919 {
1920 uint64_t tmp_fwd_downstream_key = fwd_downstream_keys[key];
1921 vector<MerExtension>& fwd_exts = extensions[tmp_fwd_downstream_key];
1922 for (size_t i = 0; i < fwd_exts.size(); ++i)
1923 {
1924 const MerExtension& ext = fwd_exts[i];
1925 if (ext.left_ext_len < extension_length)
1926 continue;
1927
1928 /*
1929 <ext.left>< f_d_key >
1930 AG NNNNNNNNNN
1931 */
1932
1933 // take the bottom bits of the left extension, making them the
1934 // top of the key.
1935 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1936 uint64_t key = (ext.left_dna_str & mask) << (extension_length << 1);
1937
1938 // add in the top bits of the seed key, making them the bottom bits
1939 // of the key.
1940 uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1));
1941 key |= bottom_half;
1942 right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key));
1943 }
1944 }
1945
1946 uint64_t rev_downstream_dna_str = right_sites[R].second.rev_string;
1947 uint64_t rev_downstream_key = rev_downstream_dna_str & bottom_bit_mask;
1948
1949 assert (rev_downstream_key < extensions.size());
1950
1951 vector<uint64_t> rev_downstream_keys;
1952 if (color)
1953 {
1954 for(size_t color_value = 0; color_value < 4; ++color_value)
1955 {
1956 uint64_t tmp_key = (rev_downstream_key >> 2) << 2 | color_value;
1957 rev_downstream_keys.push_back(tmp_key);
1958 }
1959 }
1960 else
1961 {
1962 rev_downstream_keys.push_back(rev_downstream_key);
1963 }
1964
1965 for(size_t key = 0; key < rev_downstream_keys.size(); ++key)
1966 {
1967 uint64_t tmp_rev_downstream_key = rev_downstream_keys[key];
1968 uint64_t tmp_fwd_downstream_key = fwd_downstream_key;
1969 if (color)
1970 {
1971 tmp_fwd_downstream_key = rc_color_str(tmp_rev_downstream_key) >> (64 - (key_length << 1));
1972 }
1973
1974 vector<MerExtension>& rev_exts = extensions[tmp_rev_downstream_key];
1975 for (size_t i = 0; i < rev_exts.size(); ++i)
1976 {
1977 const MerExtension& ext = rev_exts[i];
1978 if (ext.right_ext_len < extension_length)
1979 continue;
1980
1981 /*
1982 <ext.right>< r_d_key >
1983 AG NNNNNNNNNN
1984 */
1985
1986 // reverse complement the right_extension. we want the
1987 // top bits of the extension, but these become the bottom bits
1988 // under the rc.
1989 uint64_t ext_str = color ? rc_color_str(ext.right_dna_str) : rc_dna_str(ext.right_dna_str);
1990 ext_str >>= 64 - (ext.right_ext_len << 1);
1991
1992 // take the bottom bits of the rc and make it the top of the key
1993 uint64_t key = ext_str << (extension_length << 1);
1994
1995 // take the top bits of the seed key and make them the bottom
1996 // of the key.
1997 uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1));
1998 key |= bottom_half;
1999
2000 right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key));
2001 }
2002 }
2003 }
2004
2005 sort (right_keys.begin(), right_keys.end());
2006 new_end = unique(right_keys.begin(), right_keys.end());
2007 right_keys.erase(new_end, right_keys.end());
2008
2009 size_t lk = 0;
2010 size_t rk = 0;
2011
2012 while (lk < left_keys.size() && rk < right_keys.size())
2013 {
2014 while (lk < left_keys.size() &&
2015 left_keys[lk].key < right_keys[rk].key) { ++lk; }
2016
2017 if (lk == left_keys.size())
2018 break;
2019
2020 while (rk < right_keys.size() &&
2021 right_keys[rk].key < left_keys[lk].key) { ++rk; }
2022
2023 if (rk == right_keys.size())
2024 break;
2025
2026 if (lk < left_keys.size() && rk < right_keys.size() &&
2027 right_keys[rk].key == left_keys[lk].key)
2028 {
2029
2030 size_t k = right_keys[rk].key;
2031 size_t lk_end = lk;
2032 size_t rk_end = rk;
2033 while (rk_end < right_keys.size() && right_keys[rk_end].key == k) {++rk_end;}
2034 while (lk_end < left_keys.size() && left_keys[lk_end].key == k) {++lk_end;}
2035
2036 size_t tmp_lk = lk;
2037
2038 while (tmp_lk < lk_end)
2039 {
2040 size_t tmp_rk = rk;
2041 while (tmp_rk < rk_end)
2042 {
2043 int donor = (int)left_keys[tmp_lk].pos - 1;
2044 int acceptor = (int)right_keys[tmp_rk].pos + 2;
2045
2046 if (acceptor - donor > min_intron && acceptor - donor < max_intron)
2047 {
2048 Junction j(ref_id,
2049 donor,
2050 acceptor,
2051 antisense,
2052 acceptor - donor); // just prefer shorter introns
2053 juncs.insert(j);
2054 if (juncs.size() > max_juncs)
2055 {
2056 juncs.erase(*(juncs.rbegin()));
2057 }
2058 }
2059 ++tmp_rk;
2060 }
2061
2062 ++tmp_lk;
2063 }
2064
2065 lk = lk_end;
2066 rk = rk_end;
2067 }
2068 }
2069 }
2070 }
2071 };
2072
2073 template <class JunctionRecorder>
2074 void juncs_from_ref_segs(RefSequenceTable& rt,
2075 vector<RefSeg>& expected_don_acc_windows,
2076 PotentialJuncs& juncs,
2077 const DnaString& donor_dinuc,
2078 const DnaString& acceptor_dinuc,
2079 int max_intron,
2080 int min_intron,
2081 size_t max_juncs,
2082 bool talkative,
2083 size_t half_splice_mer_len)
2084 {
2085
2086 typedef map<uint32_t, IntronMotifs> MotifMap;
2087
2088 MotifMap ims;
2089
2090 seqan::DnaStringReverseComplement rev_donor_dinuc(donor_dinuc);
2091 seqan::DnaStringReverseComplement rev_acceptor_dinuc(acceptor_dinuc);
2092
2093 if (talkative)
2094 fprintf(stderr, "Collecting potential splice sites in islands\n");
2095
2096 //
2097 bool all_both = true;
2098 for (size_t r = 0; r < expected_don_acc_windows.size(); ++r)
2099 {
2100 const RefSeg& seg = expected_don_acc_windows[r];
2101
2102 if (seg.points_where != POINT_DIR_BOTH)
2103 all_both = false;
2104
2105 RefSequenceTable::Sequence* ref_str = rt.get_seq(seg.ref_id);
2106
2107 if (!ref_str)
2108 continue;
2109
2110 bool skip_fwd = false;
2111 bool skip_rev = false;
2112
2113 if (library_type == FR_FIRSTSTRAND)
2114 {
2115 if (seg.read == READ_LEFT)
2116 {
2117 if (seg.antisense) skip_rev = true;
2118 else skip_fwd = true;
2119 }
2120 else if(seg.read == READ_RIGHT)
2121 {
2122 if (seg.antisense) skip_fwd = true;
2123 else skip_rev = true;
2124 }
2125 }
2126
2127 if (library_type == FR_SECONDSTRAND)
2128 {
2129 if (seg.read == READ_LEFT)
2130 {
2131 if (seg.antisense) skip_fwd = true;
2132 else skip_rev = true;
2133 }
2134 else if(seg.read == READ_RIGHT)
2135 {
2136 if (seg.antisense) skip_rev = true;
2137 else skip_fwd = true;
2138 }
2139 }
2140
2141 pair<map<uint32_t, IntronMotifs>::iterator, bool> ret =
2142 ims.insert(make_pair(seg.ref_id, IntronMotifs(seg.ref_id)));
2143
2144 IntronMotifs& motifs = ret.first->second;
2145
2146 int left_color_offset = 0, right_color_offset = 0;
2147 if (color)
2148 {
2149 if (seg.antisense)
2150 right_color_offset = 1;
2151 else
2152 left_color_offset = -1;
2153 }
2154
2155 if (seg.left + left_color_offset < 0 || seg.right + right_color_offset >= (int)length(*ref_str) - 1)
2156 continue;
2157
2158 DnaString org_seg_str = seqan::infix(*ref_str, seg.left + left_color_offset, seg.right + right_color_offset);
2159 String<char> seg_str;
2160 assign(seg_str, org_seg_str);
2161
2162 #ifdef B_DEBUG2
2163 cout << "coord: " << seg.left << " " << seg.right << endl;
2164 //<< "seg_str: " << seg_str << endl;
2165 #endif
2166 if (color)
2167 {
2168 bool remove_primer = true;
2169 seg_str = convert_bp_to_color(org_seg_str, remove_primer);
2170 }
2171
2172 size_t to = 0;
2173 size_t seg_len = length(seg_str);
2174 size_t read_len = seg.support_read.size();
2175 if (read_len <= 0)
2176 to = seg_len - 2;
2177 else
2178 to = read_len - 2;
2179
2180 const size_t max_segment_len = 128;
2181 uint8_t left_mismatches[max_segment_len] = {0,};
2182 uint8_t right_mismatches[max_segment_len] = {0,};
2183
2184 if (max_segment_len < read_len)
2185 {
2186 fprintf(stderr, "Error: read len(%d) is greater than %d\n", (int)read_len, (int)max_segment_len);
2187 exit(-1);
2188 }
2189
2190 if (read_len == seg_len || seg.points_where == POINT_DIR_BOTH)
2191 {
2192 if (seg.points_where == POINT_DIR_RIGHT || seg.points_where == POINT_DIR_BOTH)
2193 {
2194 size_t num_mismatches = 0;
2195 for (size_t i = 0; i < read_len - 1; ++i)
2196 {
2197 if (seg_str[i] != seg.support_read[i])
2198 ++num_mismatches;
2199
2200 left_mismatches[i] = num_mismatches;
2201 if (num_mismatches > 2)
2202 {
2203 to = i;
2204 break;
2205 }
2206 }
2207 }
2208
2209 if (seg.points_where == POINT_DIR_LEFT || seg.points_where == POINT_DIR_BOTH)
2210 {
2211 size_t num_mismatches = 0;
2212 for (int i = read_len - 1; i >= 0; --i)
2213 {
2214 if (seg_str[i + (seg_len - read_len)] != seg.support_read[i])
2215 ++num_mismatches;
2216
2217 right_mismatches[i] = num_mismatches;
2218 if (num_mismatches > 2)
2219 break;
2220 }
2221 }
2222
2223 // daehwan
2224 #ifdef B_DEBUG2
2225 cout << "antisense: " << (seg.antisense ? "-" : "+") << endl
2226 << seqan::infix(seg_str, 0, segment_length) << " "
2227 << seqan::infix(seg_str, length(seg_str) - segment_length, length(seg_str)) << endl
2228 << seg.support_read << endl
2229 << 0 << " - " << to << endl;
2230
2231 for (unsigned int i = 0; i < read_len; ++i)
2232 cout << (int)left_mismatches[i];
2233 cout << "\t";
2234
2235 for (unsigned int i = 0; i < read_len; ++i)
2236 cout << (int)right_mismatches[i];
2237 cout << endl;
2238 #endif
2239 }
2240
2241 if (seg.points_where == POINT_DIR_BOTH)
2242 {
2243 for (size_t i = 0; i <= to; ++i)
2244 {
2245 // Look at a slice of the reference without creating a copy.
2246 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2247
2248 if ((!skip_fwd && curr == donor_dinuc) || (!skip_rev && curr == rev_acceptor_dinuc))
2249 {
2250 DnaString partner;
2251 if (curr == donor_dinuc)
2252 partner = acceptor_dinuc;
2253 else
2254 partner = rev_donor_dinuc;
2255
2256 uint8_t left_mismatch = 0;
2257 if (i > 0)
2258 left_mismatch = left_mismatches[i-1];
2259
2260 // daehwan
2261 #ifdef B_DEBUG2
2262 cout << "i: " << i << endl
2263 << "mismatches: " << (int)left_mismatch
2264 << " - " << (int)right_mismatches[i] << endl;
2265 #endif
2266 if (left_mismatch + right_mismatches[i] <= 2)
2267 {
2268 size_t pos = length(seg_str) - (read_len - i) - 2;
2269 if (partner == seqan::infix(org_seg_str, pos - left_color_offset, pos + 2 - left_color_offset))
2270 {
2271 if (curr == donor_dinuc)
2272 {
2273 motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2274 motifs.fwd_acceptors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0)));
2275 }
2276 else
2277 {
2278 motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2279 motifs.rev_donors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0)));
2280 }
2281
2282 // daehwan
2283 #ifdef B_DEBUG2
2284 cout << curr << ":" << partner << " added" << endl;
2285 #endif
2286 }
2287 }
2288 }
2289 }
2290 }
2291
2292 else if (seg.points_where == POINT_DIR_LEFT)
2293 {
2294 // A ref segment that "points left" is one that was flanked
2295 // on the right by a partial bowtie hit, indicating that we
2296 // should be looking for an intron to the left of the hit
2297 // In this seg, that means either an "AG" or an "AC"
2298 for (size_t i = 0; i <= to; ++i)
2299 {
2300 // Look at a slice of the reference without creating a copy.
2301 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2302 if (curr == acceptor_dinuc && !skip_fwd)
2303 motifs.fwd_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2304 else if (curr == rev_donor_dinuc && !skip_rev)
2305 motifs.rev_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2306 }
2307 }
2308 else
2309 {
2310 // A right pointing ref seg wants either a "GT" or a "CT"
2311 for (size_t i = 0; i <= to; ++i)
2312 {
2313 // Look at a slice of the reference without creating a copy.
2314 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2315 if (curr == donor_dinuc && !skip_fwd)
2316 motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2317 else if (curr == rev_acceptor_dinuc && !skip_rev)
2318 motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2319 }
2320 }
2321 }
2322
2323 if (talkative)
2324 {
2325 fprintf(stderr, "reporting synthetic splice junctions...\n");
2326 }
2327 for (MotifMap::iterator motif_itr = ims.begin(); motif_itr != ims.end(); ++motif_itr)
2328 {
2329 uint32_t ref_id = motif_itr->first;
2330
2331 RefSequenceTable::Sequence* ref_str = rt.get_seq(ref_id);
2332
2333 if (!ref_str)
2334 err_die("Error: couldn't get ref string for %u\n", ref_id);
2335
2336 if (talkative)
2337 fprintf(stderr, "Examining donor-acceptor pairings in %s\n", rt.get_name(ref_id));
2338 IntronMotifs& motifs = motif_itr->second;
2339
2340 if (!all_both)
2341 motifs.unique();
2342
2343 //motifs.attach_mer_counts(*ref_str);
2344 motifs.attach_mers(*ref_str);
2345
2346 vector<pair<size_t, DnaSpliceStrings> >& fwd_donors = motifs.fwd_donors;
2347 vector<pair<size_t, DnaSpliceStrings> >& fwd_acceptors = motifs.fwd_acceptors;
2348 vector<pair<size_t, DnaSpliceStrings> >& rev_acceptors = motifs.rev_acceptors;
2349 vector<pair<size_t, DnaSpliceStrings> >& rev_donors = motifs.rev_donors;
2350
2351 //const char* ref_name = rt.get_name(motif_itr->second.ref_id);
2352
2353 JunctionRecorder recorder;
2354 recorder.record(ref_id,
2355 fwd_donors,
2356 fwd_acceptors,
2357 false,
2358 juncs,
2359 min_intron,
2360 max_intron,
2361 max_juncs,
2362 half_splice_mer_len);
2363
2364 recorder.record(ref_id,
2365 rev_acceptors,
2366 rev_donors,
2367 true,
2368 juncs,
2369 min_intron,
2370 max_intron,
2371 max_juncs,
2372 half_splice_mer_len);
2373 }
2374 //fprintf(stderr, "Found %d total splices\n", num_juncs);
2375 }
2376
2377
2378 /**
2379 * Performs a simple global alignment.
2380 * This function will perform a restricted global alignment. The restriction is that only one insertion/deletion
2381 * is allowed in the final alignment.
2382 * @param shortSequence The short sequence to be aligned.
2383 * @param leftReference The left end of the reference to be aligned, must be exactly as long as the short sequence
2384 * @param leftReference The right end of the reference to be aligned, must be exactly as long as the short sequenc
2385 * @param insertPosition This will contain the 0-based index of the first position in the shorter sequence after the insertion/deletion. A value of -1 indicates that the alignment could not be performed.
2386 * @param mismatchCount This will contain the number of mismatches in the optimal restricted global alignment. The number and length of insertions/deletions is fixed. A value of -1 indicates that the alignment could not be performed.
2387 */
2388 void simpleSplitAlignment(seqan::String<char>& shorterSequence,
2389 seqan::String<char>& leftReference,
2390 seqan::String<char>& rightReference,
2391 vector<int>& insertPositions,
2392 int& mismatchCount)
2393 {
2394 /*
2395 * In this restricted alignment, we already know the length and number (1) of insertions/deletions.
2396 * We simply need to know where to put it. Do a linear scan through sequence counting the number of induced
2397 * errors before and after putting the insertion at each sequence.
2398 */
2399
2400 /*
2401 * Note that we could have a case, where both the alignment and the read have the unknown
2402 * nucleotide ('N') and we don't want to reward cases where these characters match
2403 */
2404 vector<unsigned short> beforeErrors(seqan::length(shorterSequence));
2405 for(int idx = seqan::length(shorterSequence) - 1; idx >= 0; idx -= 1){
2406 unsigned short prevCount = 0;
2407 /*
2408 * We guarentee idx >= 0, so cast to hide the compiler
2409 * warning here
2410 */
2411 if(((size_t)idx) < seqan::length(shorterSequence) - 1){
2412 prevCount = beforeErrors.at(idx + 1);
2413 }
2414 unsigned short currentMismatch = 0;
2415 if(rightReference[idx] == 'N' || shorterSequence[idx] == 'N' || rightReference[idx] != shorterSequence[idx]){
2416 currentMismatch = 1;
2417 }
2418 beforeErrors.at(idx) = prevCount + currentMismatch;
2419 }
2420
2421 vector<unsigned short> afterErrors(seqan::length(shorterSequence));
2422 for(size_t idx = 0; idx < seqan::length(shorterSequence) ; idx += 1){
2423 unsigned short prevCount = 0;
2424 if(idx > 0){
2425 prevCount = afterErrors.at(idx - 1);
2426 }
2427 unsigned short currentMismatch = 0;
2428 if(leftReference[idx] == 'N' || shorterSequence[idx] == 'N' || leftReference[idx] != shorterSequence[idx]){
2429 currentMismatch = 1;
2430 }
2431 afterErrors.at(idx) = prevCount + currentMismatch;
2432 }
2433
2434 mismatchCount = seqan::length(shorterSequence) + 1;
2435 insertPositions.clear();
2436
2437 /*
2438 * Technically, we could allow the insert position to be at the end or beginning of the sequence,
2439 * but we are disallowing it here
2440 */
2441 for(size_t currentInsertPosition = 1; currentInsertPosition < seqan::length(shorterSequence); currentInsertPosition += 1){
2442 size_t errorCount = beforeErrors.at(currentInsertPosition) + afterErrors.at(currentInsertPosition - 1);
2443
2444 if(((int)errorCount) < mismatchCount){
2445 mismatchCount = (int)errorCount;
2446 insertPositions.clear();
2447 insertPositions.push_back(currentInsertPosition);
2448 }
2449 else if ((int)errorCount == mismatchCount) {
2450 insertPositions.push_back(currentInsertPosition);
2451 }
2452 }
2453 return;
2454 }
2455
2456
2457 /**
2458 * Try to detect a small insertion.
2459 * This code will try to identify a small insertion based on the ungapped alignment of two neighboring
2460 * segments. The general idea is to try to realign the local region, and see if we can reduce the
2461 * number of errors. Note that the function makes use of the global parameter "max_insertion_length" to limit the maximum
2462 * size of a detected insertion.
2463 * @param rt Sequence table used to lookup sequence information
2464 * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit.
2465 * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit.
2466 * @param insertions If an insertion is sucessfully detected, it will be added to this set
2467 */
2468 void detect_small_insertion(RefSequenceTable& rt,
2469 seqan::String<char>& read_sequence,
2470 BowtieHit& leftHit,
2471 BowtieHit& rightHit,
2472 std::set<Insertion>& insertions)
2473 {
2474
2475 RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id());
2476 if(!ref_str){
2477 fprintf(stderr, "Error accessing sequence record\n");
2478 }else{
2479 size_t read_length = seqan::length(read_sequence);
2480 int begin_offset = 0;
2481 int end_offset = 0;
2482
2483 if(color){
2484 if(leftHit.antisense_align())
2485 end_offset = 1;
2486 else
2487 begin_offset = -1;
2488 }
2489
2490 if(leftHit.left() + begin_offset < 0)
2491 return;
2492
2493 /*
2494 * If there is in fact a deletion, we are expecting the genomic sequence to be shorter than
2495 * the actual read sequence
2496 */
2497 int discrepancy = read_length - (rightHit.right() - leftHit.left());
2498 DnaString genomic_sequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, rightHit.right() + end_offset);
2499 String<char> genomic_sequence;
2500 assign(genomic_sequence, genomic_sequence_temp);
2501
2502 if(color)
2503 genomic_sequence = convert_bp_to_color(genomic_sequence, true);
2504
2505 String<char> left_read_sequence = seqan::infix(read_sequence, 0, 0 + seqan::length(genomic_sequence));
2506 String<char> right_read_sequence = seqan::infix(read_sequence, read_length - seqan::length(genomic_sequence), read_length);
2507
2508 vector<int> bestInsertPositions;
2509 int minErrors = -1;
2510 simpleSplitAlignment(genomic_sequence, left_read_sequence, right_read_sequence, bestInsertPositions, minErrors);
2511
2512 assert (bestInsertPositions.size() > 0);
2513 int bestInsertPosition = bestInsertPositions[0];
2514
2515
2516 /*
2517 * Need to decide if the insertion is suitably improves the alignment
2518 */
2519 /*
2520 * If these two segments anchors constitue the entire read, then we require
2521 * that this alignment actually improve the number of errors observed in the alignment
2522 * Otherwise, it is OK as long as the number of errors doesn't increase.
2523 */
2524 int adjustment = 0;
2525 if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){
2526 adjustment = -1;
2527 }
2528 if(minErrors <= (leftHit.edit_dist()+rightHit.edit_dist()+adjustment)){
2529 String<char> insertedSequence = seqan::infix(left_read_sequence, bestInsertPosition, bestInsertPosition + discrepancy);
2530 if(color)
2531 insertedSequence = convert_color_to_bp((char)(genomic_sequence_temp[bestInsertPosition - begin_offset + end_offset - 1]), insertedSequence);
2532
2533 insertions.insert(Insertion(leftHit.ref_id(),
2534 leftHit.left() + bestInsertPosition - 1 + end_offset,
2535 seqan::toCString(insertedSequence)));
2536 }
2537 }
2538 return;
2539 }
2540
2541 /**
2542 * Try to detect a small deletion.
2543 * This code will try to identify a small deletion based on the ungapped alignment of two neighboring
2544 * segments. The general idea is to try to realign the local region, and see if we can reduce the
2545 * number of errors. Note that the function makes use of the global parameter "max_deletion_length" to limit the maximum
2546 * size of a detected deletion.
2547 * @param rt Sequence table used to lookup sequence information
2548 * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit.
2549 * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit.
2550 * @param deletion_juncs If a deletion is sucessfully detected, it will be added to this set
2551 */
2552 void detect_small_deletion(RefSequenceTable& rt,
2553 seqan::String<char>& read_sequence,
2554 BowtieHit& leftHit,
2555 BowtieHit& rightHit,
2556 std::set<Deletion>& deletions)
2557 {
2558 RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id());
2559 if(!ref_str){
2560 fprintf(stderr, "Error accessing sequence record\n");
2561 }else{
2562 int begin_offset = 0;
2563 int end_offset = 0;
2564
2565 if(color){
2566 if(leftHit.antisense_align())
2567 end_offset = 1;
2568 else
2569 begin_offset = -1;
2570 }
2571
2572 if(leftHit.left() + begin_offset < 0)
2573 return;
2574
2575 size_t read_length = seqan::length(read_sequence);
2576 if(rightHit.right() + read_length + begin_offset < 0 )
2577 return;
2578
2579 int discrepancy = (rightHit.right() - leftHit.left()) - read_length;
2580 Dna5String leftGenomicSequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, leftHit.left() + read_length + end_offset);
2581 Dna5String rightGenomicSequence_temp = seqan::infix(*ref_str, rightHit.right() - read_length + begin_offset, rightHit.right() + end_offset);
2582
2583 if (length(leftGenomicSequence_temp) < read_length || length(rightGenomicSequence_temp) < read_length)
2584 return;
2585
2586 String<char> leftGenomicSequence;
2587 assign(leftGenomicSequence, leftGenomicSequence_temp);
2588
2589 String<char> rightGenomicSequence;
2590 assign(rightGenomicSequence, rightGenomicSequence_temp);
2591
2592 if(color){
2593 leftGenomicSequence = convert_bp_to_color(leftGenomicSequence, true);
2594 rightGenomicSequence = convert_bp_to_color(rightGenomicSequence, true);
2595 }
2596
2597 vector<int> bestInsertPositions;
2598 int minErrors = -1;
2599 simpleSplitAlignment(read_sequence, leftGenomicSequence, rightGenomicSequence, bestInsertPositions, minErrors);
2600
2601 assert (bestInsertPositions.size() > 0);
2602 int bestInsertPosition = bestInsertPositions[0];
2603
2604 /*
2605 * Need to decide if the deletion is suitably improves the alignment
2606 */
2607 int adjustment = 0;
2608
2609 /*
2610 * If these two segments anchors constitue the entire read, then we require
2611 * that this alignment actually improve the number of errors observed in the alignment
2612 * Otherwise, it is OK as long as the number of errors doesn't increase.
2613 */
2614 if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){
2615 adjustment = -1;
2616 }
2617 if(minErrors <= (leftHit.edit_dist()+rightHit.edit_dist()+adjustment)){
2618 deletions.insert(Deletion(leftHit.ref_id(),
2619 leftHit.left() + bestInsertPosition - 1 + end_offset,
2620 leftHit.left() + bestInsertPosition + discrepancy + end_offset,
2621 false));
2622 }
2623 }
2624 return;
2625 }
2626
2627 void gappedAlignment(const seqan::String<char>& read,
2628 const seqan::String<char>& leftReference,
2629 const seqan::String<char>& rightReference,
2630 vector<int>& insertLeftPositions,
2631 vector<int>& insertRightPositions,
2632 int& mismatchCount)
2633 {
2634 const Score<int> globalScore(0, -5, -1, -10); // (match, mismatch, gapextend, gapopen)
2635 Align<String<char> > align;
2636 appendValue(rows(align), read);
2637
2638 String<char> genomicSequence;
2639 assign(genomicSequence, leftReference);
2640 append(genomicSequence, rightReference);
2641 appendValue(rows(align), genomicSequence);
2642 int score = globalAlignment(align, globalScore);
2643
2644 Row<Align<String<char> > >::Type& row0 = row(align, 0);
2645 Row<Align<String<char> > >::Type& row1 = row(align, 1);
2646
2647 // find gap whose length >= read_len - 10
2648 int start_in_align = -1, end_in_align = -1;
2649 int start_in_ref = -1, end_in_ref = -1;
2650
2651 int temp_start = -1;
2652 int ref_pos = 0;
2653
2654 int gap = 0;
2655 mismatchCount = 0;
2656
2657 int len = length(row0);
2658 for (int i = 0; i < len; ++i)
2659 {
2660 if (row0[i] == '-')
2661 {
2662 if (temp_start < 0)
2663 temp_start = i;
2664 }
2665 else if (row1[i] != '-')
2666 {
2667 if (temp_start >= 0)
2668 {
2669 if (i - temp_start > end_in_align - start_in_align)
2670 {
2671 end_in_align = i;
2672 start_in_align = temp_start;
2673
2674 end_in_ref = ref_pos;
2675 start_in_ref = ref_pos - (i - temp_start);
2676 temp_start = -1;
2677 }
2678 }
2679
2680 if (row0[i] != row1[i])
2681 ++mismatchCount;
2682 }
2683
2684 if (row0[i] == '-' || row1[i] == '-')
2685 ++gap;
2686 if (row1[i] != '-')
2687 ++ref_pos;
2688 }
2689
2690 // assume the lengths of read, leftReference, and rightReference are all equal.
2691 const int max_gap = end_in_align - start_in_align;
2692 if (max_gap < length(read) - 10)
2693 return;
2694
2695 if (start_in_ref < 0)
2696 return;
2697
2698 insertLeftPositions.push_back(start_in_ref);
2699 insertRightPositions.push_back(end_in_ref - length(leftReference));
2700
2701 #if B_DEBUG
2702 if (gap - max_gap >= 0)
2703 {
2704 cerr << "Score = " << score << endl;
2705 cerr << row(align, 0) << endl
2706 << row(align, 1) << endl;
2707
2708 cerr << "len: " << len
2709 << ", gap: " << gap
2710 << ", max_gap: " << max_gap
2711 << "(" << start_in_align << ", " << end_in_align
2712 << "), ref (" << start_in_ref << ", " << end_in_ref
2713 << "), mismatch: " << mismatchCount << endl;
2714
2715 if (gap - max_gap > 0)
2716 cerr << "daehwan" << endl;
2717 }
2718 #endif
2719 }
2720
2721 void detect_fusion(RefSequenceTable& rt,
2722 seqan::String<char>& read_sequence,
2723 BowtieHit& leftHit,
2724 BowtieHit& rightHit,
2725 FusionSimpleSet& fusions,
2726 uint32_t dir)
2727 {
2728 RefSequenceTable::Sequence* left_ref_str = rt.get_seq(leftHit.ref_id());
2729 RefSequenceTable::Sequence* right_ref_str = rt.get_seq(rightHit.ref_id());
2730
2731 size_t read_length = seqan::length(read_sequence);
2732
2733 Dna5String leftGenomicSequence_temp;
2734 Dna5String rightGenomicSequence_temp;
2735
2736 if (dir == FUSION_FF || dir == FUSION_FR)
2737 {
2738 if (leftHit.left() + read_length > seqan::length(*left_ref_str))
2739 return;
2740
2741 leftGenomicSequence_temp = seqan::infix(*left_ref_str, leftHit.left(), leftHit.left() + read_length);
2742 }
2743 else
2744 {
2745 if (leftHit.right() < read_length)
2746 return;
2747
2748 leftGenomicSequence_temp = seqan::infix(*left_ref_str, leftHit.right() - read_length, leftHit.right());
2749 seqan::reverseComplement(leftGenomicSequence_temp);
2750 }
2751
2752 if (dir == FUSION_FF || dir == FUSION_RF)
2753 {
2754 if (rightHit.right() < read_length)
2755 return;
2756
2757 rightGenomicSequence_temp = seqan::infix(*right_ref_str, rightHit.right() - read_length, rightHit.right());
2758 }
2759 else
2760 {
2761 if (rightHit.left() + read_length > seqan::length(*right_ref_str))
2762 return;
2763
2764 rightGenomicSequence_temp = seqan::infix(*right_ref_str, rightHit.left(), rightHit.left() + read_length);
2765 seqan::reverseComplement(rightGenomicSequence_temp);
2766 }
2767
2768 String<char> leftGenomicSequence;
2769 assign(leftGenomicSequence, leftGenomicSequence_temp);
2770
2771 String<char> rightGenomicSequence;
2772 assign(rightGenomicSequence, rightGenomicSequence_temp);
2773
2774 vector<int> bestLeftInsertPositions;
2775 vector<int> bestRightInsertPositions;
2776 int minErrors = -1;
2777
2778 // todo - we need to do (efficient) Smith-Waterman Alignment using SIMD like the way Bowtie2 does!
2779 // too slow and too many false positives
2780 // daehwan - turn off this for now.
2781 if (bowtie2 && false)
2782 gappedAlignment(read_sequence,
2783 leftGenomicSequence,
2784 rightGenomicSequence,
2785 bestLeftInsertPositions,
2786 bestRightInsertPositions,
2787 minErrors);
2788 else
2789 simpleSplitAlignment(read_sequence,
2790 leftGenomicSequence,
2791 rightGenomicSequence,
2792 bestLeftInsertPositions,
2793 minErrors);
2794
2795 uint32_t total_edit_dist = leftHit.edit_dist() + rightHit.edit_dist();
2796 if (minErrors > total_edit_dist)
2797 return;
2798
2799 #if 1
2800 if (minErrors > 2)
2801 return;
2802
2803 for (size_t i = 0; i < bestLeftInsertPositions.size(); ++i)
2804 {
2805 const int left = bestLeftInsertPositions[i];
2806 if (left < fusion_anchor_length)
2807 return;
2808
2809 const int right = bestRightInsertPositions.size() > i ? bestRightInsertPositions[i] : left;
2810 if (length(rightGenomicSequence) - right < fusion_anchor_length)
2811 return;
2812 }
2813
2814 // daehwan - this is very slow - the older version of "difference" is way faster
2815 #else
2816 if (read_length <= 60)
2817 {
2818 /*
2819 * check if the two contig from two different chromosome are different enough
2820 */
2821 const Score<int> globalScore(0, -1, -2, -2);
2822 Align<String<char> > align;
2823 appendValue(rows(align), read_sequence);
2824 appendValue(rows(align), leftGenomicSequence);
2825
2826 int score = globalAlignment(align, globalScore);
2827 assignSource(row(align, 0), read_sequence);
2828 assignSource(row(align, 1), rightGenomicSequence);
2829
2830 score = max(score, globalAlignment(align, globalScore));
2831 if (abs(score) < read_length / 6)
2832 return;
2833 }
2834 #endif
2835
2836 for (size_t i = 0; i < bestLeftInsertPositions.size(); ++i)
2837 {
2838 int bestLeftInsertPosition = bestLeftInsertPositions[i];
2839 int bestRightInsertPosition = bestLeftInsertPosition;
2840
2841 if (bestRightInsertPositions.size() > i)
2842 bestRightInsertPosition = bestRightInsertPositions[i];
2843
2844 uint32_t left, right;
2845 if (dir == FUSION_FF || dir == FUSION_FR)
2846 left = leftHit.left() + bestLeftInsertPosition - 1;
2847 else
2848 left = leftHit.right() - bestLeftInsertPosition;
2849
2850 if (dir == FUSION_FF || dir == FUSION_RF)
2851 right = rightHit.right() - (read_length - bestRightInsertPosition);
2852 else
2853 right = rightHit.left() + (read_length - bestRightInsertPosition) - 1;
2854
2855 uint32_t ref_id1 = leftHit.ref_id();
2856 uint32_t ref_id2 = rightHit.ref_id();
2857
2858
2859 #if B_DEBUG
2860 cerr << endl << endl
2861 << "read id: " << leftHit.insert_id() << endl
2862 << "dir: " << dir << ", sense: " << (leftHit.antisense_align() ? "-" : "+") << endl
2863 << "left ref_id: " << rt.get_name(leftHit.ref_id()) << "\tright ref_id: " << rt.get_name(rightHit.ref_id()) << endl
2864 << read_sequence << endl
2865 << leftGenomicSequence << "\t" << rightGenomicSequence << endl
2866 << "insertion pos: " << bestLeftInsertPosition << endl;
2867
2868 if (bowtie2)
2869 {
2870 Dna5String left_sequence;
2871 if (dir == FUSION_FF || dir == FUSION_FR)
2872 left_sequence = seqan::infix(*left_ref_str, leftHit.left(), left + 1);
2873 else
2874 {
2875 left_sequence = seqan::infix(*left_ref_str, left, leftHit.right());
2876 seqan::reverseComplement(left_sequence);
2877 }
2878
2879 Dna5String right_sequence;
2880
2881 if (dir == FUSION_FF || dir == FUSION_RF)
2882 right_sequence = seqan::infix(*right_ref_str, right, rightHit.right());
2883 else
2884 {
2885 right_sequence = seqan::infix(*right_ref_str, rightHit.left(), right + 1);
2886 seqan::reverseComplement(right_sequence);
2887 }
2888
2889 cerr << "right insertion pos: " << bestRightInsertPosition << endl
2890 << left_sequence << "\t" << right_sequence << endl;
2891 }
2892
2893 cerr << left << ":" << right << endl
2894 << "errors: " << minErrors << endl;
2895 #endif
2896
2897 uint32_t temp_dir = dir;
2898 if ((ref_id2 < ref_id1) ||
2899 (ref_id1 == ref_id2 && left > right))
2900 {
2901 uint32_t temp = ref_id1;
2902 ref_id1 = ref_id2;
2903 ref_id2 = temp;
2904
2905 temp = left;
2906 left = right;
2907 right = temp;
2908
2909 if (dir == FUSION_FF)
2910 temp_dir = FUSION_RR;
2911 }
2912
2913 Fusion fusion(ref_id1, ref_id2, left, right, temp_dir);
2914 FusionSimpleSet::iterator itr = fusions.find(fusion);
2915 if (itr == fusions.end())
2916 {
2917 FusionSimpleStat simpleStat;
2918 simpleStat.count = 1;
2919 simpleStat.edit_dist = total_edit_dist;
2920 simpleStat.skip = false;
2921 simpleStat.left_coincide_with_splice_junction = false;
2922 simpleStat.right_coincide_with_splice_junction = false;
2923 fusions[fusion] = simpleStat;
2924 }
2925 else
2926 {
2927 itr->second.count += 1;
2928 itr->second.edit_dist = min(itr->second.edit_dist, total_edit_dist);
2929 }
2930 }
2931 }
2932
2933 void find_insertions_and_deletions(RefSequenceTable& rt,
2934 ReadStream& reads_file,
2935 vector<HitsForRead>& hits_for_read,
2936 std::set<Deletion>& deletions,
2937 std::set<Insertion>& insertions){
2938
2939 if(hits_for_read.empty()){
2940 return;
2941 }
2942
2943 size_t last_segment = hits_for_read.size()-1;
2944 size_t first_segment = 0;
2945 if(last_segment == first_segment){
2946 return;
2947 }
2948
2949 /*
2950 * We can check up front whether the first or last element is empty
2951 * and avoid doing any more work. Note that the following code requires
2952 * that there be at least one elment in each
2953 */
2954 if(hits_for_read[first_segment].hits.empty() || hits_for_read[last_segment].hits.empty()){
2955 return;
2956 }
2957
2958 /*
2959 * Need to identify the appropriate insert id for this group of reads
2960 */
2961 Read read;
2962 bool got_read = reads_file.getRead(hits_for_read.back().insert_id, read);
2963 if(!got_read){
2964 err_die("Error: could not get read# %d from stream!",
2965 (int)hits_for_read.back().insert_id);
2966 //return;
2967 }
2968
2969 /*
2970 * Work through all combinations of mappings for the first and last segment to see if any are indicative
2971 * of a small insertions or deletion
2972 */
2973 HitsForRead& left_segment_hits = hits_for_read[first_segment];
2974 HitsForRead& right_segment_hits = hits_for_read[last_segment];
2975
2976 /*
2977 * If either of the segment match lists is empty, we could try
2978 * to be smarter and work our way in until we find good a segment
2979 * match; however, won't do that for noe.
2980 */
2981 if(left_segment_hits.hits.empty() || right_segment_hits.hits.empty()){
2982 return;
2983 }
2984
2985 seqan::String<char> fullRead, rcRead;
2986 if(color){
2987 fullRead = read.seq.c_str() + 1;
2988 rcRead = fullRead;
2989 seqan::reverse(rcRead);
2990 }else{
2991 fullRead = read.seq;
2992 rcRead = read.seq;
2993 seqan::reverseComplement(rcRead);
2994 }
2995
2996 size_t read_length = seqan::length(fullRead);
2997 for(size_t left_segment_index = 0; left_segment_index < left_segment_hits.hits.size(); left_segment_index++){
2998 for(size_t right_segment_index = 0; right_segment_index < right_segment_hits.hits.size(); right_segment_index++){
2999 BowtieHit* leftHit = &left_segment_hits.hits[left_segment_index];
3000 BowtieHit* rightHit = &right_segment_hits.hits[right_segment_index];
3001 /*
3002 * Now we have found a pair of segment hits to investigate. Need to ensure
3003 * that
3004 * 1. the alignment orientation is consistent
3005 * 2. the distance separation is in the appropriate range
3006 * 3. Both hits are aligned to the same contig
3007 */
3008 if(leftHit->ref_id() != rightHit->ref_id()){
3009 continue;
3010 }
3011
3012 if(leftHit->antisense_align() != rightHit->antisense_align()){
3013 continue;
3014 }
3015
3016 seqan::String<char>* modifiedRead = &fullRead;
3017 /*
3018 * If we are dealing with an antisense alignment, then the left
3019 * read will actually be on the right, fix this now, to simplify
3020 * the rest of the logic, in addition, we will need to use the reverse
3021 * complement of the read sequence
3022 */
3023 if(leftHit->antisense_align()){
3024 BowtieHit * tmp = leftHit;
3025 leftHit = rightHit;
3026 rightHit = tmp;
3027 modifiedRead = &rcRead;
3028 }
3029
3030 size_t apparent_length = rightHit->right() - leftHit->left();
3031 int length_discrepancy = apparent_length - read_length;
3032 if(length_discrepancy > 0 && length_discrepancy <= (int)max_deletion_length){
3033 /*
3034 * Search for a deletion
3035 */
3036 detect_small_deletion(rt, *modifiedRead, *leftHit, *rightHit, deletions);
3037 }
3038 if(length_discrepancy < 0 && length_discrepancy >= -(int)max_insertion_length){
3039 /*
3040 * Search for an insertion
3041 */
3042 detect_small_insertion(rt, *modifiedRead, *leftHit, *rightHit, insertions);
3043 }
3044 }
3045 }
3046 }
3047
3048 /*
3049 */
3050 int map_read_to_contig(const String<char>& contig, const String<char>& read)
3051 {
3052 int contig_len = length(contig);
3053 int read_len = length(read);
3054 int pos = -1;
3055 int mismatch = 3;
3056
3057 for (int i = 0; i < contig_len - read_len; ++i)
3058 {
3059 int temp_mismatch = 0;
3060 for (int j = 0; j < read_len; ++j)
3061 {
3062 if (contig[i+j] != read[j])
3063 ++temp_mismatch;
3064
3065 if (temp_mismatch >= mismatch)
3066 break;
3067 }
3068
3069 if (temp_mismatch < mismatch)
3070 {
3071 pos = i;
3072 mismatch = temp_mismatch;
3073 }
3074 }
3075
3076 return pos;
3077 }
3078
3079
3080 void find_fusions(RefSequenceTable& rt,
3081 ReadStream& reads_file,
3082 vector<HitsForRead>& hits_for_read,
3083 HitStream& partner_hit_stream,
3084 HitStream& seg_partner_hit_stream,
3085 FusionSimpleSet& fusions,
3086 eREAD read_side)
3087 {
3088 if (hits_for_read.empty())
3089 return;
3090
3091 size_t last_segment = hits_for_read.size() - 1;
3092 while (last_segment > 0)
3093 {
3094 if (!hits_for_read[last_segment].hits.empty())
3095 break;
3096
3097 --last_segment;
3098 }
3099
3100 // daehwan
3101 #if 0
3102 if (last_segment == 0 || hits_for_read[0].hits.empty())
3103 {
3104 vector<BowtieHit>& hits = last_segment == 0 ? hits_for_read[0].hits : hits_for_read[1].hits;
3105
3106 for (size_t i = 0; i < hits.size(); ++i)
3107 {
3108 BowtieHit* hit = &hits[i];
3109
3110 static const uint32_t chr_id1 = rt.get_id("chr2");
3111 static const uint32_t chr_id2 = rt.get_id("chr3");
3112
3113 // KPL-4 PPP1R12A 12:80167343-80329235:-1 SEPT10 2:110300380-110371783:-1
3114 // const uint32_t left1 = 80167343, right1 = 80329235, left2 = 110300380, right2 = 110371783;
3115
3116 // VCaP TIA1 2:70436576-70475792:-1 DIRC2 3:122513642-122599986:1
3117 const uint32_t left1 = 70436576, right1 = 70475792, left2 = 122513642, right2 = 122599986;
3118
3119 if ((hit->ref_id() == chr_id1 && hit->left() >= left1 && hit->left() <= right1) ||
3120 (hit->ref_id() == chr_id2 && hit->left() >= left2 && hit->left() <= right2))
3121 {
3122 cout << hit->insert_id() << endl;
3123 break;
3124 #if 0
3125 cout << "insert id: " << hit->insert_id() << "\t num hits: " << hits.size() << endl
3126 << hit->ref_id() << ":" << (hit->antisense_align() ? "-" : "+") << " " << (int)hit->edit_dist() << endl
3127 << hit->left() << "-" << hit->right() << endl
3128 << hit->seq() << endl << endl;
3129 #endif
3130 }
3131 }
3132 }
3133 #endif
3134
3135
3136 size_t first_segment = 0;
3137
3138 if (last_segment == first_segment &&
3139 (hits_for_read[first_segment].hits.empty() || hits_for_read[first_segment].hits[0].end()))
3140 return;
3141
3142 uint32_t insert_id = hits_for_read[last_segment].insert_id;
3143
3144 HitsForRead partner_hit_group;
3145 uint32_t next_order = partner_hit_stream.next_group_id();
3146
3147 bool has_partner = false;
3148 while (insert_id >= next_order && next_order != 0)
3149 {
3150 partner_hit_stream.next_read_hits(partner_hit_group);
3151 next_order = partner_hit_stream.next_group_id();
3152 }
3153
3154 has_partner = insert_id == partner_hit_group.insert_id;
3155
3156 if (!has_partner)
3157 {
3158 next_order = seg_partner_hit_stream.next_group_id();
3159 while (insert_id >= next_order && next_order != 0)
3160 {
3161 seg_partner_hit_stream.next_read_hits(partner_hit_group);
3162 next_order = seg_partner_hit_stream.next_group_id();
3163 }
3164
3165 has_partner = insert_id == partner_hit_group.insert_id;
3166 }
3167
3168 /*
3169 * Need to identify the appropriate insert id for this group of reads
3170 */
3171
3172
3173 Read read;
3174 bool got_read = reads_file.getRead(hits_for_read[last_segment].insert_id, read);
3175 if (!got_read)
3176 return;
3177
3178 HitsForRead partner_hits_for_read;
3179 if (first_segment != last_segment)
3180 partner_hits_for_read = hits_for_read[last_segment];
3181
3182 HitsForRead& left_segment_hits = hits_for_read[first_segment];
3183 HitsForRead& right_segment_hits = partner_hits_for_read;
3184
3185 seqan::String<char> fullRead, rcRead;
3186 fullRead = read.seq;
3187 rcRead = read.seq;
3188 seqan::reverseComplement(rcRead);
3189
3190 size_t read_length = seqan::length(fullRead);
3191
3192 bool check_partner = true;
3193 if (first_segment != last_segment)
3194 {
3195 for (size_t i = 0; i < left_segment_hits.hits.size(); ++i)
3196 {
3197 BowtieHit& leftHit = left_segment_hits.hits[i];
3198 for (size_t j = 0; j < right_segment_hits.hits.size(); ++j)
3199 {
3200 BowtieHit& rightHit = right_segment_hits.hits[j];
3201
3202 if (leftHit.ref_id() == rightHit.ref_id() && leftHit.antisense_align() == rightHit.antisense_align())
3203 {
3204 int dist = 0;
3205 if (leftHit.antisense_align())
3206 dist = leftHit.left() - rightHit.right();
3207 else
3208 dist = rightHit.left() - leftHit.right();
3209
3210 if (dist > -max_insertion_length && dist <= (int)fusion_min_dist)
3211 {
3212 check_partner = false;
3213 break;
3214 }
3215 }
3216 }
3217
3218 if (!check_partner)
3219 break;
3220 }
3221 }
3222
3223 const int minus_dist = -(int)max_insertion_length * 2;
3224
3225 if (check_partner && has_partner)
3226 {
3227 for (size_t l = 0; l < left_segment_hits.hits.size(); ++l)
3228 {
3229 BowtieHit& leftHit = left_segment_hits.hits[l];
3230 for (size_t r = 0; r < partner_hit_group.hits.size(); ++r)
3231 {
3232 BowtieHit& rightHit = partner_hit_group.hits[r];
3233
3234 if (leftHit.ref_id() == rightHit.ref_id())
3235 {
3236 if (leftHit.antisense_align() != rightHit.antisense_align())
3237 {
3238 int dist = 0;
3239 if (leftHit.antisense_align())
3240 dist = leftHit.left() - rightHit.right();
3241 else
3242 dist = rightHit.left() - leftHit.right();
3243
3244 if (dist > minus_dist && dist <= (int)fusion_min_dist)
3245 continue;
3246 }
3247 }
3248
3249 RefSequenceTable::Sequence* ref_str = rt.get_seq(rightHit.ref_id());
3250
3251 const size_t part_seq_len = inner_dist_std_dev > inner_dist_mean ? inner_dist_std_dev - inner_dist_mean : 0;
3252 const size_t flanking_seq_len = inner_dist_mean + inner_dist_std_dev;
3253
3254 Dna5String right_flanking_seq;
3255 size_t left = 0;
3256 if (rightHit.antisense_align())
3257 {
3258 if (flanking_seq_len <= rightHit.left())
3259 {
3260 left = rightHit.left() - flanking_seq_len;
3261 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3262 }
3263 else
3264 break;
3265 }
3266 else
3267 {
3268 if (part_seq_len <= rightHit.right())
3269 {
3270 left = rightHit.right() - part_seq_len;
3271 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3272 }
3273 else
3274 break;
3275 }
3276
3277 const size_t check_read_len = min(15, segment_length - segment_mismatches - 3);
3278 seqan::String<char> fwd_read = infix(fullRead, read_length - check_read_len, read_length);
3279 seqan::String<char> rev_read = infix(rcRead, 0, check_read_len);
3280
3281 int fwd_pos = map_read_to_contig(right_flanking_seq, fwd_read);
3282
3283 if (fwd_pos >= 0)
3284 {
3285 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id(), rightHit.insert_id(),
3286 left + fwd_pos, check_read_len, false, 0, true);
3287
3288 right_segment_hits.hits.push_back(hit);
3289 }
3290
3291 int rev_pos = map_read_to_contig(right_flanking_seq, rev_read);
3292
3293 if (rev_pos >= 0)
3294 {
3295 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id(), rightHit.insert_id(),
3296 left + rev_pos, check_read_len, true, 0, true);
3297
3298 right_segment_hits.hits.push_back(hit);
3299 }
3300
3301 // daehwan
3302 #if 0
3303 if (fwd_pos >= 0 || rev_pos >= 0)
3304 {
3305 // if (leftHit.insert_id() == 409048 || leftHit.insert_id() == 4341516)
3306 {
3307 cout << "insert id: " << leftHit.insert_id() << endl
3308 << "fwd: " << fwd_read << " " << fwd_pos << endl
3309 << "rev: " << rev_read << " " << rev_pos << endl
3310 << "ref: " << right_flanking_seq << endl;
3311 }
3312 }
3313 #endif
3314 }
3315 }
3316 }
3317
3318 static std::set<RefID> ignore_chromosomes;
3319 if (ignore_chromosomes.size() <= 0 && fusion_ignore_chromosomes.size() > 0)
3320 {
3321 for (size_t i = 0; i < fusion_ignore_chromosomes.size(); ++i)
3322 ignore_chromosomes.insert(rt.get_id(fusion_ignore_chromosomes[i]));
3323 }
3324
3325 for (size_t left_segment_index = 0; left_segment_index < left_segment_hits.hits.size(); ++left_segment_index)
3326 {
3327 for (size_t right_segment_index = 0; right_segment_index < right_segment_hits.hits.size(); ++right_segment_index)
3328 {
3329 BowtieHit* leftHit = &left_segment_hits.hits[left_segment_index];
3330 BowtieHit* rightHit = &right_segment_hits.hits[right_segment_index];
3331
3332 if (ignore_chromosomes.find(leftHit->ref_id()) != ignore_chromosomes.end() ||
3333 ignore_chromosomes.find(rightHit->ref_id()) != ignore_chromosomes.end())
3334 continue;
3335
3336 if (bowtie2)
3337 {
3338 if (leftHit->edit_dist() + rightHit->edit_dist() > (segment_mismatches << 1))
3339 continue;
3340 }
3341
3342 // daehwan
3343 #if 0
3344 if (leftHit->ref_id() == rightHit->ref_id() && leftHit->ref_id() == 1119738090)
3345 {
3346 const uint32_t left1 = 113951556, right1 = 113977987, left2 = 113548692, right2 = 113754053;
3347 if ((leftHit->left() >= left1 && leftHit->left() <= right1 && rightHit->left() >= left2 && rightHit->left() <= right2) ||
3348 (leftHit->left() >= left2 && leftHit->left() <= right2 && rightHit->left() >= left1 && rightHit->left() <= right1))
3349 {
3350 cout << "insert id: " << leftHit->insert_id() << "\t num hits: " << left_segment_hits.hits.size() << ":" << right_segment_hits.hits.size() << endl
3351 << leftHit->ref_id() << ":" << (leftHit->antisense_align() ? "-" : "+") << " " << (int)leftHit->edit_dist() <<endl
3352 << leftHit->left() << "-" << leftHit->right() << endl
3353 << rightHit->ref_id() << ":" << (rightHit->antisense_align() ? "-" : "+") << " " << (int)rightHit->edit_dist() << endl
3354 << rightHit->left() << "-" << rightHit->right() << endl << endl;
3355 }
3356 }
3357 #endif
3358
3359 if (leftHit->ref_id() == rightHit->ref_id())
3360 {
3361 if (leftHit->antisense_align() == rightHit->antisense_align())
3362 {
3363 int dist = 0;
3364 if (leftHit->antisense_align())
3365 dist = leftHit->left() - rightHit->right();
3366 else
3367 dist = rightHit->left() - leftHit->right();
3368
3369 if (dist > minus_dist && dist <= (int)fusion_min_dist)
3370 continue;
3371 }
3372 }
3373
3374 uint32_t dir = FUSION_FF;
3375 seqan::String<char>* modifiedRead = &fullRead;
3376
3377 if (leftHit->antisense_align() == rightHit->antisense_align())
3378 {
3379 if (leftHit->antisense_align())
3380 {
3381 BowtieHit * tmp = leftHit;
3382 leftHit = rightHit;
3383 rightHit = tmp;
3384 modifiedRead = &rcRead;
3385 }
3386 }
3387 else if (leftHit->antisense_align() == false && rightHit->antisense_align() == true)
3388 dir = FUSION_FR;
3389 else
3390 dir = FUSION_RF;
3391
3392 detect_fusion(rt, *modifiedRead, *leftHit, *rightHit, fusions, dir);
3393 }
3394 }
3395 }
3396
3397 void find_gaps(RefSequenceTable& rt,
3398 ReadStream& reads_file,
3399 vector<HitsForRead>& hits_for_read,
3400 HitStream& partner_hit_stream,
3401 HitStream& seg_partner_hit_stream,
3402 std::set<Junction, skip_count_lt>& seg_juncs,
3403 eREAD read_side)
3404 {
3405 if (hits_for_read.empty())
3406 return;
3407
3408 size_t last_segment = hits_for_read.size() - 1;
3409 while (last_segment > 0)
3410 {
3411 if (!hits_for_read[last_segment].hits.empty())
3412 break;
3413
3414 --last_segment;
3415 }
3416
3417 hits_for_read.resize(last_segment + 1);
3418
3419 size_t first_segment = 0;
3420 if (last_segment == first_segment &&
3421 (hits_for_read[first_segment].hits.empty() || hits_for_read[first_segment].hits[0].end()))
3422 return;
3423
3424 uint32_t insert_id = hits_for_read[last_segment].insert_id;
3425
3426 HitsForRead partner_hit_group;
3427 uint32_t next_order = partner_hit_stream.next_group_id();
3428
3429 bool has_partner = false;
3430 while (insert_id >= next_order && next_order != 0)
3431 {
3432 partner_hit_stream.next_read_hits(partner_hit_group);
3433 next_order = partner_hit_stream.next_group_id();
3434 }
3435
3436 has_partner = insert_id == partner_hit_group.insert_id;
3437
3438 if (!has_partner)
3439 {
3440 next_order = seg_partner_hit_stream.next_group_id();
3441 while (insert_id >= next_order && next_order != 0)
3442 {
3443 seg_partner_hit_stream.next_read_hits(partner_hit_group);
3444 next_order = seg_partner_hit_stream.next_group_id();
3445 }
3446
3447 has_partner = insert_id == partner_hit_group.insert_id;
3448 }
3449
3450 Read read;
3451 bool got_read = reads_file.getRead(hits_for_read[last_segment].insert_id, read);
3452 if (!got_read) {
3453 err_die("Error: could not get read# %d from stream!",
3454 (int)hits_for_read[last_segment].insert_id);
3455 return;
3456 }
3457
3458 HitsForRead partner_hits_for_read;
3459 if (first_segment != last_segment)
3460 partner_hits_for_read = hits_for_read[last_segment];
3461
3462 HitsForRead& left_segment_hits = hits_for_read[first_segment];
3463 HitsForRead& right_segment_hits = partner_hits_for_read;
3464
3465 bool check_partner = true;
3466 if (first_segment != last_segment)
3467 {
3468 for (size_t i = 0; i < left_segment_hits.hits.size(); ++i)
3469 {
3470 BowtieHit& leftHit = left_segment_hits.hits[i];
3471 for (size_t j = 0; j < right_segment_hits.hits.size(); ++j)
3472 {
3473 BowtieHit& rightHit = right_segment_hits.hits[j];
3474
3475 if (leftHit.ref_id() == rightHit.ref_id() && leftHit.antisense_align() == rightHit.antisense_align())
3476 {
3477 int dist = 0;
3478 if (leftHit.antisense_align())
3479 dist = leftHit.left() - rightHit.right();
3480 else
3481 dist = rightHit.left() - leftHit.right();
3482
3483 if (dist >= min_segment_intron_length && dist < (int)max_segment_intron_length)
3484 {
3485 check_partner = false;
3486 break;
3487 }
3488 }
3489 }
3490
3491 if (!check_partner)
3492 break;
3493 }
3494 }
3495
3496 if (check_partner && has_partner)
3497 {
3498 // empty hits from 1 to num_segments - 1
3499 for (size_t i = first_segment + 1; i < hits_for_read.size(); ++i)
3500 {
3501 hits_for_read[i].hits.clear();
3502 }
3503
3504 seqan::String<char> fullRead, rcRead;
3505 fullRead = read.seq;
3506 rcRead = read.seq;
3507 seqan::reverseComplement(rcRead);
3508 size_t read_length = read.seq.length();
3509
3510 for (size_t l = 0; l < left_segment_hits.hits.size(); ++l)
3511 {
3512 BowtieHit& leftHit = left_segment_hits.hits[l];
3513 for (size_t r = 0; r < partner_hit_group.hits.size(); ++r)
3514 {
3515 BowtieHit& rightHit = partner_hit_group.hits[r];
3516 if (leftHit.ref_id() != rightHit.ref_id() || leftHit.antisense_align() == rightHit.antisense_align())
3517 continue;
3518
3519 int dist = 0;
3520 if (leftHit.antisense_align())
3521 dist = leftHit.left() - rightHit.right();
3522 else
3523 dist = rightHit.left() - leftHit.right();
3524
3525 if (dist < min_segment_intron_length && dist >= (int)max_segment_intron_length)
3526 continue;
3527
3528 RefSequenceTable::Sequence* ref_str = rt.get_seq(rightHit.ref_id());
3529 const size_t part_seq_len = inner_dist_std_dev > inner_dist_mean ? inner_dist_std_dev - inner_dist_mean : 0;
3530 const size_t flanking_seq_len = inner_dist_mean + inner_dist_std_dev;
3531
3532 Dna5String right_flanking_seq;
3533 size_t left = 0;
3534 if (rightHit.antisense_align())
3535 {
3536 if (flanking_seq_len <= rightHit.left())
3537 {
3538 left = rightHit.left() - flanking_seq_len;
3539 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3540 }
3541 else
3542 break;
3543 }
3544 else
3545 {
3546 if (part_seq_len <= rightHit.right())
3547 {
3548 left = rightHit.right() - part_seq_len;
3549 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3550 }
3551 else
3552 break;
3553 }
3554
3555 const size_t check_read_len = min(15, segment_length - segment_mismatches - 3);
3556 seqan::String<char> fwd_read = infix(fullRead, read_length - check_read_len, read_length);
3557 seqan::String<char> rev_read = infix(rcRead, 0, check_read_len);
3558
3559 int fwd_pos = map_read_to_contig(right_flanking_seq, fwd_read);
3560 if (fwd_pos >= 0)
3561 {
3562 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id2(), rightHit.insert_id(),
3563 left + fwd_pos, check_read_len, false, 0, true);
3564
3565 hits_for_read[last_segment].hits.push_back(hit);
3566 }
3567
3568 int rev_pos = map_read_to_contig(right_flanking_seq, rev_read);
3569
3570 if (rev_pos >= 0)
3571 {
3572 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id2(), rightHit.insert_id(),
3573 left + rev_pos, check_read_len, true, 0, true);
3574
3575 hits_for_read[last_segment].hits.push_back(hit);
3576 }
3577
3578 // daehwan - for debug purposes
3579 #if B_DEBUG
3580 cerr << "daehwan!!!" << endl
3581 << "insert id: " << rightHit.insert_id() << endl
3582 << "first segment: " << first_segment << ", last_segment: " << last_segment << endl
3583 << right_flanking_seq << " : " << seqan::length(right_flanking_seq) << endl
3584 << fwd_read << " : " << fwd_pos << endl
3585 << rev_read << " : " << rev_pos << endl
3586 << "left: " << leftHit.left() << "-" << leftHit.right() << (leftHit.antisense_align() ? " -" : " +") << endl
3587 << "right: " << rightHit.left() << "-" << rightHit.right() << (rightHit.antisense_align() ? " -" : " +") << endl;
3588 if (fwd_pos >= 0 || rev_pos >= 0)
3589 {
3590 const BowtieHit& hit = hits_for_read[last_segment].hits.back();
3591 cerr << "back: " << hit.left() << "-" << hit.right() << (hit.antisense_align() ? " -" : " +") << endl;
3592 }
3593 #endif
3594 }
3595 }
3596 }
3597
3598 vector<RefSeg> expected_don_acc_windows;
3599 string seq(read.seq);
3600
3601 // ignore segments that map to more than this many places that would otherwise produce
3602 // many false splice junctions.
3603 if (bowtie2)
3604 {
3605 for (size_t s = 0; s < hits_for_read.size(); ++s)
3606 {
3607 if (hits_for_read[s].hits.size() > max_seg_multihits)
3608 return;
3609 }
3610 }
3611
3612 for (size_t s = 0; s < hits_for_read.size(); ++s)
3613 {
3614 HitsForRead& curr = hits_for_read[s];
3615 for (size_t h = 0; h < curr.hits.size(); ++h)
3616 {
3617 bool found_right_seg_partner = s == hits_for_read.size() - 1;
3618 BowtieHit& bh = curr.hits[h];
3619
3620 // "drs" is distant seg right partner
3621 // "rrs" is right of right seg partner
3622 vector<BowtieHit*> drs_bhs;
3623 vector<BowtieHit*> rrs_bhs;
3624
3625 if (s < hits_for_read.size() - 1)
3626 {
3627 // Look for a right partner for the current hit
3628 HitsForRead& right = hits_for_read[s + 1];
3629
3630 for (size_t r = 0; r < right.hits.size(); ++r)
3631 {
3632 BowtieHit& rh = right.hits[r];
3633 if (bh.antisense_align() != rh.antisense_align() || bh.ref_id() != rh.ref_id())
3634 continue;
3635
3636 if ((bh.antisense_align() && rh.right() == bh.left()) ||
3637 (!bh.antisense_align() && bh.right() == rh.left() ))
3638 {
3639 found_right_seg_partner = true;
3640 break;
3641 }
3642
3643 int dist = 0;
3644 if (bh.antisense_align())
3645 dist = bh.left() - rh.right();
3646 else
3647 dist = rh.left() - bh.right();
3648
3649 if (dist >= min_segment_intron_length && dist < (int)max_segment_intron_length)
3650 drs_bhs.push_back(&rh);
3651 }
3652 }
3653
3654 if (!found_right_seg_partner && s < hits_for_read.size() - 2)
3655 {
3656 // Look for a right of right partner for the current hit
3657 HitsForRead& right_right = hits_for_read[s + 2];
3658
3659 for (size_t r = 0; r < right_right.hits.size(); ++r)
3660 {
3661 BowtieHit& rrh = right_right.hits[r];
3662 if (bh.antisense_align() != rrh.antisense_align() || bh.ref_id() != rrh.ref_id())
3663 continue;
3664
3665 int dist = 0;
3666 if (bh.antisense_align())
3667 dist = bh.left() - rrh.right();
3668 else
3669 dist = rrh.left() - bh.right();
3670
3671 if (dist >= min_segment_intron_length + segment_length && dist < (int)max_segment_intron_length + segment_length)
3672 rrs_bhs.push_back(&rrh);
3673 }
3674 }
3675
3676 if (!found_right_seg_partner && (drs_bhs.size() > 0 || rrs_bhs.size() > 0))
3677 {
3678 const int look_bp = 8;
3679 const size_t color_offset = color ? 1 : 0;
3680
3681 vector<BowtieHit*> d_bhs = rrs_bhs.size() > 0 ? rrs_bhs : drs_bhs;
3682 for (size_t r = 0; r < d_bhs.size(); ++r)
3683 {
3684 string support_read;
3685 if (rrs_bhs.size() <= 0)
3686 support_read = seq.substr(color_offset + (s+1) * segment_length - look_bp, look_bp * 2);
3687 else
3688 support_read = seq.substr(color_offset + (s+1) * segment_length - look_bp, segment_length + look_bp * 2);
3689
3690 BowtieHit& d_bh = *(d_bhs[r]);
3691 if (!bh.antisense_align())
3692 {
3693 RefSeg right_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read_side, 0, 0, support_read);
3694 right_seg.left = max(0, bh.right() - look_bp);
3695 right_seg.right = d_bh.left() + look_bp;
3696 expected_don_acc_windows.push_back(right_seg);
3697 }
3698 else
3699 {
3700 if (color)
3701 reverse(support_read.begin(), support_read.end());
3702 else
3703 reverse_complement(support_read);
3704
3705 RefSeg left_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read_side, 0, 0, support_read);
3706 left_seg.left = d_bh.right() - look_bp;
3707 left_seg.right = bh.left() + look_bp;
3708 expected_don_acc_windows.push_back(left_seg);
3709 }
3710
3711 // daehwan
3712 #ifdef B_DEBUG2
3713 cout << "insert id: " << bh.insert_id() << endl
3714 << (bh.antisense_align() ? "-" : "+") << endl
3715 << seq << endl
3716 << "(" << s << ") - " << support_read << endl;
3717 #endif
3718 }
3719 }
3720 }
3721 } //for each hits_for_read
3722
3723 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
3724 expected_don_acc_windows,
3725 seg_juncs,
3726 "GT",
3727 "AG",
3728 max_segment_intron_length,
3729 min_segment_intron_length,
3730 max_seg_juncs,
3731 false,
3732 0);
3733
3734 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
3735 expected_don_acc_windows,
3736 seg_juncs,
3737 "GC",
3738 "AG",
3739 max_segment_intron_length,
3740 min_segment_intron_length,
3741 max_seg_juncs,
3742 false,
3743 0);
3744
3745 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
3746 expected_don_acc_windows,
3747 seg_juncs,
3748 "AT",
3749 "AC",
3750 max_segment_intron_length,
3751 min_segment_intron_length,
3752 max_seg_juncs,
3753 false,
3754 0);
3755 }
3756
3757 MerTable mer_table;
3758
3759 int seed_alignments = 0;
3760 int microaligned_segs = 0;
3761
3762 map<RefSeg, vector<string>* > microexon_windows;
3763
3764 bool overlap_in_genome(int ll, int lr, int rl, int rr)
3765 {
3766 if (ll >= rl && ll < rr)
3767 return true;
3768 if (lr > rl && lr < rr)
3769 return true;
3770 if (rl >= ll && rl < lr)
3771 return true;
3772 if (rr > ll && rr < lr)
3773 return true;
3774 return false;
3775 }
3776
3777 void add_to_microexon_windows(uint32_t ref_id,
3778 int left_boundary,
3779 int right_boundary,
3780 const string& dna_str,
3781 eREAD read)
3782 {
3783 RefSeg left_dummy(ref_id, POINT_DIR_DONTCARE, false, read, left_boundary, right_boundary);
3784 RefSeg right_dummy(ref_id, POINT_DIR_DONTCARE, false, read, right_boundary, right_boundary + 1);
3785
3786 map<RefSeg, vector<string>* >::iterator lb = microexon_windows.lower_bound(left_dummy);
3787 map<RefSeg, vector<string>* >::iterator ub = microexon_windows.lower_bound(right_dummy);
3788 vector<string>* new_vec = NULL;
3789 if (lb == microexon_windows.end())
3790 {
3791 microexon_windows.insert(make_pair(left_dummy, new vector<string>(1, dna_str)));
3792 return;
3793 }
3794
3795 map<RefSeg, vector<string>* >::iterator first_to_be_erased = microexon_windows.end();
3796 map<RefSeg, vector<string>* >::iterator last_to_be_erased = ub;
3797
3798 while (lb != ub)
3799 {
3800 // everyone in this range that overlaps with the new interval needs to
3801 // be merged together.
3802 if (overlap_in_genome(lb->first.left, lb->first.right, left_boundary, right_boundary))
3803 {
3804 if (!new_vec)
3805 new_vec = new vector<string>();
3806 if (first_to_be_erased == microexon_windows.end())
3807 first_to_be_erased = lb;
3808 left_dummy.left = min(lb->first.left, left_boundary);
3809 left_dummy.right = max(lb->first.right, right_boundary);
3810
3811 new_vec->insert(new_vec->end(), lb->second->begin(), lb->second->end());
3812 delete lb->second;
3813 }
3814 else if (first_to_be_erased != microexon_windows.end())
3815 {
3816 last_to_be_erased = lb;
3817 }
3818
3819 ++lb;
3820 }
3821
3822 if (first_to_be_erased != microexon_windows.end())
3823 {
3824 microexon_windows.erase(first_to_be_erased, last_to_be_erased);
3825 }
3826
3827 if (!new_vec)
3828 {
3829 // never found an overlapping window, so just add this one and bail
3830 microexon_windows.insert(make_pair(left_dummy, new vector<string>(1, dna_str)));
3831 return;
3832 }
3833 else
3834 {
3835 new_vec->push_back(dna_str);
3836 microexon_windows.insert(make_pair(left_dummy, new_vec));
3837 return;
3838 }
3839
3840 }
3841
3842 void align_microexon_segs(RefSequenceTable& rt,
3843 std::set<Junction, skip_count_lt>& juncs,
3844 int max_juncs,
3845 int half_splice_mer_len)
3846 {
3847 int num_segments = 0;
3848 for (map<RefSeg, vector<string>* >::iterator itr = microexon_windows.begin();
3849 itr != microexon_windows.end(); ++itr)
3850 {
3851 vector<string>& unaligned_segments = *itr->second;
3852 num_segments += unaligned_segments.size();
3853 }
3854
3855 fprintf(stderr, "Aligning %d microexon segments in %lu windows\n",
3856 num_segments, (long unsigned int)microexon_windows.size());
3857
3858 extensions.clear();
3859
3860 size_t splice_mer_len = 2 * half_splice_mer_len;
3861 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
3862
3863 extensions.resize(mer_table_size);
3864
3865 int window_num = 0;
3866 for (map<RefSeg, vector<string>* >::iterator itr = microexon_windows.begin();
3867 itr != microexon_windows.end(); ++itr)
3868 {
3869 window_num++;
3870 if ((window_num % 100) == 0)
3871 fprintf(stderr, "\twindow %d\n",window_num);
3872
3873
3874 stringstream ss(stringstream::in | stringstream::out);
3875
3876 for (size_t j = 0; j < extensions.size(); ++j)
3877 {
3878 extensions[j].clear();
3879 }
3880
3881 vector<string>& unaligned_segments = *itr->second;
3882
3883 for (size_t j = 0; j < unaligned_segments.size(); ++j)
3884 {
3885 stringstream ss(stringstream::in | stringstream::out);
3886 string s;
3887 //cerr << w.unaligned_segments[j];
3888 ss << unaligned_segments[j];
3889 ss >> s;
3890
3891 store_read_extensions(extensions,
3892 half_splice_mer_len,
3893 half_splice_mer_len,
3894 s,
3895 false);
3896 }
3897
3898 vector<RefSeg> segs;
3899 segs.push_back(itr->first);
3900 RefSeg r = itr->first;
3901 r.points_where = POINT_DIR_LEFT;
3902 segs.push_back(r);
3903
3904 juncs_from_ref_segs<RecordExtendableJuncs>(rt,
3905 segs,
3906 juncs,
3907 "GT",
3908 "AG",
3909 max_microexon_stretch,
3910 min_coverage_intron_length,
3911 max_juncs,
3912 false,
3913 half_splice_mer_len);
3914 num_segments += unaligned_segments.size();
3915 delete itr->second;
3916 }
3917 fprintf(stderr, "Checked %d segments against %lu windows for microexon junctions\n",
3918 num_segments, (long unsigned int)microexon_windows.size());
3919 fprintf(stderr, "Found %ld potential microexon junctions\n", (long int)juncs.size());
3920 }
3921
3922 /*
3923 * Easy guys ... this function puts its pants on just like the rest of you -- one leg
3924 * at a time. Except, once its pants are on, it makes gold records. Alright, here we
3925 * go.
3926 */
3927
3928 void look_for_hit_group(RefSequenceTable& rt,
3929 ReadStream& readstream,
3930 ReadStream& readstream_for_segment_search,
3931 ReadStream& readstream_for_indel_discovery,
3932 ReadStream& readstream_for_fusion_discovery,
3933 ReadTable& unmapped_reads,
3934 vector<HitStream>& seg_files,
3935 int curr_file,
3936 uint64_t insert_id,
3937 vector<HitsForRead>& hits_for_read, //<-- collecting segment hits for this read
3938 HitStream& partner_hit_stream_for_segment_search,
3939 HitStream& seg_partner_hit_stream_for_segment_search,
3940 HitStream& partner_hit_stream_for_fusion_discovery,
3941 HitStream& seg_partner_hit_stream_for_fusion_discovery,
3942 std::set<Junction, skip_count_lt>& juncs,
3943 std::set<Deletion>& deletions,
3944 std::set<Insertion>& insertions,
3945 FusionSimpleSet& fusions,
3946 eREAD read_side,
3947 uint32_t begin_id,
3948 uint32_t end_id)
3949 {
3950 HitStream& hit_stream = seg_files[curr_file];
3951 HitsForRead hit_group;
3952 uint32_t order = unmapped_reads.observation_order(insert_id);
3953 int seq_key_len = min((int)min_anchor_len, 6);
3954 while(true) {
3955 uint64_t next_group_id = hit_stream.next_group_id();
3956 uint32_t next_order = unmapped_reads.observation_order(next_group_id);
3957 // If we would have seen the hits by now, stop looking in this stream,
3958 // but forward the search to the next (lower) segment if possible.
3959 if (order < next_order || next_group_id == 0) {
3960 if (curr_file > 0) {
3961 //look for next (lower) segment mappings for this read
3962 look_for_hit_group(rt,
3963 readstream,
3964 readstream_for_segment_search,
3965 readstream_for_indel_discovery,
3966 readstream_for_fusion_discovery,
3967 unmapped_reads,
3968 seg_files,
3969 curr_file - 1,
3970 insert_id,
3971 hits_for_read,
3972 partner_hit_stream_for_segment_search,
3973 seg_partner_hit_stream_for_segment_search,
3974 partner_hit_stream_for_fusion_discovery,
3975 seg_partner_hit_stream_for_fusion_discovery,
3976 juncs,
3977 deletions,
3978 insertions,
3979 fusions,
3980 read_side,
3981 begin_id,
3982 end_id);
3983 }
3984 else
3985 if (insert_id && !no_microexon_search) {
3986 //microexon search
3987 Read read;
3988 // The hits are missing for the leftmost segment, which means
3989 // we should try looking for junctions via seed and extend
3990 // using it (helps find junctions to microexons).
3991 bool got_read = readstream.getRead(insert_id, read);
3992 if (!got_read) {
3993 //fprintf(stderr, "Warning: could not get read with insert_id=%d\n", (int)insert_id);
3994 //break; //exit loop
3995 err_die("Error: could not get read with insert_id=%d from file %s\n",
3996 (int)insert_id, readstream.filename());
3997 }
3998 string fwd_read = read.seq;
3999 if (color) // remove the primer and the adjacent color
4000 fwd_read.erase(0, 2);
4001 // make sure there are hits for all the other segs, all the
4002 // way to the root (right-most) one.
4003 int empty_seg = 0;
4004 for (size_t h = 1; h < hits_for_read.size(); ++h) {
4005 if (hits_for_read[h].hits.empty())
4006 empty_seg = h;
4007 }
4008 // if not, no microexon alignment for this segment
4009 if (empty_seg != 0)
4010 break;
4011 fwd_read = fwd_read.substr(0, segment_length);
4012 string rev_read = fwd_read;
4013 //check the reverse
4014 if (color) reverse(rev_read.begin(), rev_read.end());
4015 else reverse_complement(rev_read);
4016 for (size_t h = 0; h < hits_for_read[empty_seg + 1].hits.size(); ++h) {
4017 const BowtieHit& bh = hits_for_read[empty_seg + 1].hits[h];
4018 RefSequenceTable::Sequence* ref_str = rt.get_seq(bh.ref_id());
4019 if (ref_str == NULL)
4020 continue;
4021 int ref_len = length(*ref_str);
4022 int left_boundary;
4023 int right_boundary;
4024 bool antisense = bh.antisense_align();
4025 vector<BowtieHit> empty_seg_hits;
4026 seed_alignments++;
4027 if (antisense) {
4028 left_boundary = max(0, bh.right() - (int)min_anchor_len);
4029 right_boundary = min(ref_len - 2,
4030 left_boundary + max_microexon_stretch);
4031 if (right_boundary - left_boundary < 2 * seq_key_len)
4032 continue;
4033 microaligned_segs++;
4034 add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, rev_read, read_side);
4035 }
4036 else {
4037 right_boundary = min(ref_len - 2,
4038 bh.left() + (int)min_anchor_len);
4039 left_boundary = max(0, right_boundary - max_microexon_stretch);
4040 if (right_boundary - left_boundary < 2 * seq_key_len)
4041 continue;
4042 microaligned_segs++;
4043 add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, fwd_read, read_side);
4044 }
4045 } //for h
4046 } // !no_microexon_search
4047 break;
4048 }
4049 else if (hit_stream.next_read_hits(hit_group)) {
4050 // if we found hits for the target group in the left stream,
4051 // add them to the accumulating vector and continue the search
4052 if (hit_group.insert_id == insert_id) {
4053 hits_for_read[curr_file] = hit_group;
4054 if (curr_file > 0)
4055 // we need to look left (recursively) for the group we
4056 // just read for this stream.
4057 look_for_hit_group(rt,
4058 readstream,
4059 readstream_for_segment_search,
4060 readstream_for_indel_discovery,
4061 readstream_for_fusion_discovery,
4062 unmapped_reads,
4063 seg_files,
4064 curr_file - 1,
4065 insert_id,
4066 hits_for_read,
4067 partner_hit_stream_for_segment_search,
4068 seg_partner_hit_stream_for_segment_search,
4069 partner_hit_stream_for_fusion_discovery,
4070 seg_partner_hit_stream_for_fusion_discovery,
4071 juncs,
4072 deletions,
4073 insertions,
4074 fusions,
4075 read_side,
4076 begin_id,
4077 end_id);
4078 break;
4079 } //same insert_id (group)
4080 else if (curr_file >= 0) {
4081 // different group, we need to start a whole new
4082 // search for it, with a whole new vector of hits.
4083 vector<HitsForRead> hits_for_new_read(seg_files.size());
4084 hits_for_new_read[curr_file] = hit_group;
4085
4086 if (curr_file > 0)
4087 {
4088 look_for_hit_group(rt,
4089 readstream,
4090 readstream_for_segment_search,
4091 readstream_for_indel_discovery,
4092 readstream_for_fusion_discovery,
4093 unmapped_reads,
4094 seg_files,
4095 curr_file - 1,
4096 hit_group.insert_id,
4097 hits_for_new_read,
4098 partner_hit_stream_for_segment_search,
4099 seg_partner_hit_stream_for_segment_search,
4100 partner_hit_stream_for_fusion_discovery,
4101 seg_partner_hit_stream_for_fusion_discovery,
4102 juncs,
4103 deletions,
4104 insertions,
4105 fusions,
4106 read_side,
4107 begin_id,
4108 end_id);
4109
4110 if (hit_group.insert_id >= begin_id && hit_group.insert_id < end_id)
4111 {
4112 find_insertions_and_deletions(rt,
4113 readstream_for_indel_discovery,
4114 hits_for_new_read,
4115 deletions,
4116 insertions);
4117 find_gaps(rt,
4118 readstream_for_segment_search,
4119 hits_for_new_read,
4120 partner_hit_stream_for_segment_search,
4121 seg_partner_hit_stream_for_segment_search,
4122 juncs,
4123 read_side);
4124 }
4125 }
4126
4127 if (hit_group.insert_id >= begin_id && hit_group.insert_id < end_id)
4128 {
4129 if (fusion_search)
4130 {
4131 find_fusions(rt,
4132 readstream_for_fusion_discovery,
4133 hits_for_new_read,
4134 partner_hit_stream_for_fusion_discovery,
4135 seg_partner_hit_stream_for_fusion_discovery,
4136 fusions,
4137 read_side);
4138 }
4139 }
4140 } //different group
4141 }//got next group
4142 } //while loop
4143 }
4144
4145
4146 uint64_t process_next_hit_group(RefSequenceTable& rt,
4147 ReadStream& readstream,
4148 ReadStream& readstream_for_segment_search,
4149 ReadStream& readstream_for_indel_discovery,
4150 ReadStream& readstream_for_fusion_discovery,
4151 ReadTable& unmapped_reads,
4152 vector<HitStream>& seg_files,
4153 size_t last_file_idx,
4154 HitStream& partner_hit_stream_for_segment_search,
4155 HitStream& seg_partner_hit_stream_for_segment_search,
4156 HitStream& partner_hit_stream_for_fusion_discovery,
4157 HitStream& seg_partner_hit_stream_for_fusion_discovery,
4158 std::set<Junction, skip_count_lt>& juncs,
4159 std::set<Deletion>& deletions,
4160 std::set<Insertion>& insertions,
4161 FusionSimpleSet& fusions,
4162 eREAD read,
4163 uint32_t begin_id = 0,
4164 uint32_t end_id = VMAXINT32)
4165 {
4166 HitStream& last_segmap_hitstream = seg_files[last_file_idx];
4167 HitsForRead hit_group;
4168 bool result = last_segmap_hitstream.next_read_hits(hit_group);
4169 vector<HitsForRead> hits_for_read(seg_files.size());
4170 hits_for_read.back() = hit_group;
4171
4172 if (result && hit_group.insert_id >= end_id)
4173 return 0;
4174
4175 look_for_hit_group(rt,
4176 readstream,
4177 readstream_for_segment_search,
4178 readstream_for_indel_discovery,
4179 readstream_for_fusion_discovery,
4180 unmapped_reads,
4181 seg_files,
4182 (int)last_file_idx - 1,
4183 hit_group.insert_id,
4184 hits_for_read,
4185 partner_hit_stream_for_segment_search,
4186 seg_partner_hit_stream_for_segment_search,
4187 partner_hit_stream_for_fusion_discovery,
4188 seg_partner_hit_stream_for_fusion_discovery,
4189 juncs,
4190 deletions,
4191 insertions,
4192 fusions,
4193 read,
4194 begin_id,
4195 end_id);
4196
4197 if (result)
4198 {
4199 find_insertions_and_deletions(rt,
4200 readstream_for_indel_discovery,
4201 hits_for_read,
4202 deletions,
4203 insertions);
4204
4205 if (fusion_search)
4206 {
4207 find_fusions(rt,
4208 readstream_for_fusion_discovery,
4209 hits_for_read,
4210 partner_hit_stream_for_fusion_discovery,
4211 seg_partner_hit_stream_for_fusion_discovery,
4212 fusions,
4213 read);
4214 }
4215
4216 find_gaps(rt,
4217 readstream_for_segment_search,
4218 hits_for_read,
4219 partner_hit_stream_for_segment_search,
4220 seg_partner_hit_stream_for_segment_search,
4221 juncs,
4222 read);
4223
4224 return hit_group.insert_id;
4225 }
4226
4227 return 0;
4228 }
4229
4230 static const int UNCOVERED = 0;
4231 static const int LOOK_LEFT = 1;
4232 static const int LOOK_RIGHT = 2;
4233
4234 uint8_t get_cov(const vector<uint8_t>& cov, uint32_t c)
4235 {
4236 uint32_t b = c >> 2;
4237 uint32_t r = c & 0x3;
4238 uint8_t s = (r << 1);
4239 uint8_t v = cov[b];
4240 v &= (0x3 << s);
4241 v >>= s;
4242 return v;
4243 }
4244
4245 void build_coverage_map(ReadTable& it, RefSequenceTable& rt,
4246 vector<string>& seg_files, map<uint32_t, vector<bool> >& coverage_map) {
4247 if (!coverage_map.empty()) return;
4248 BAMHitFactory hit_factory(it,rt);
4249
4250 for (size_t f = 0; f < seg_files.size(); ++f)
4251 {
4252 //fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f);
4253 //seg_files[f]->rewind();
4254 HitStream hs(seg_files[f], &hit_factory, false, false, false);
4255 HitsForRead hit_group;
4256 while (hs.next_read_hits(hit_group))
4257 {
4258 for (size_t h = 0; h < hit_group.hits.size(); ++h)
4259 {
4260 BowtieHit& bh = hit_group.hits[h];
4261
4262 pair<map<uint32_t, vector<bool> >::iterator, bool> ret =
4263 coverage_map.insert(make_pair(bh.ref_id(), vector<bool>()));
4264 vector<bool>& ref_cov = ret.first->second;
4265
4266 size_t right_extent = bh.right();
4267
4268 if (right_extent >= ref_cov.size())
4269 {
4270 ref_cov.resize(right_extent + 1, 0);
4271 }
4272
4273 for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c)
4274 {
4275 ref_cov[c] = true;
4276 //if (ref_cov[c]<255) ref_cov[c]++;
4277 }
4278 }
4279 } //while next_read_hits
4280 }
4281 }
4282
4283 void pair_covered_sites(ReadTable& it,
4284 RefSequenceTable& rt,
4285 vector<string>& segmap_fnames,
4286 std::set<Junction, skip_count_lt>& cov_juncs,
4287 map<uint32_t, vector<bool> >& coverage_map,
4288 size_t half_splice_mer_len)
4289 {
4290 vector<RefSeg> expected_look_left_windows;
4291 vector<RefSeg> expected_look_right_windows;
4292 build_coverage_map(it,rt, segmap_fnames, coverage_map);
4293
4294 static const int extend = 45;
4295 int num_islands = 0;
4296
4297 vector<RefSeg> expected_don_acc_windows;
4298
4299 fprintf(stderr, "Recording coverage islands\n");
4300 size_t cov_bases = 0;
4301 for (map<uint32_t, vector<bool> >::iterator itr = coverage_map.begin();
4302 itr != coverage_map.end();
4303 ++itr)
4304 {
4305 vector<bool>& cov = itr->second;
4306
4307 size_t island_left_edge = 0;
4308 for (size_t c = 1; c < cov.size(); ++c)
4309 {
4310 if (cov[c])
4311 {
4312 cov_bases++;
4313 if (!cov[c - 1])
4314 {
4315 num_islands += 1;
4316 int edge = (int)c - extend;
4317 edge = max(edge, 0);
4318 island_left_edge = edge;
4319 }
4320 }
4321 else
4322 {
4323 if (cov[c - 1])
4324 {
4325 expected_don_acc_windows.push_back(RefSeg(itr->first,
4326 POINT_DIR_LEFT,
4327 false, /* not important */
4328 READ_DONTCARE,
4329 island_left_edge,
4330 c + extend));
4331 expected_don_acc_windows.push_back(RefSeg(itr->first,
4332 POINT_DIR_RIGHT,
4333 false, /* not important */
4334 READ_DONTCARE,
4335 island_left_edge,
4336 c + extend));
4337 }
4338 }
4339 }
4340 }
4341 fprintf(stderr, "Found %d islands covering %ld bases\n", num_islands, (long int)cov_bases);
4342
4343 juncs_from_ref_segs<RecordButterflyJuncs>(rt,
4344 expected_don_acc_windows,
4345 cov_juncs,
4346 "GT",
4347 "AG",
4348 max_coverage_intron_length,
4349 min_coverage_intron_length,
4350 max_cov_juncs,
4351 true,
4352 half_splice_mer_len);
4353 fprintf(stderr, "Found %ld potential intra-island junctions\n", (long int)cov_juncs.size());
4354 }
4355
4356 // daehwan
4357 struct ReadInfo
4358 {
4359 ReadID read_id;
4360 uint32_t left;
4361 uint32_t right;
4362
4363 bool operator<(const ReadInfo& rhs) const
4364 {
4365 if (left != rhs.left)
4366 return left < rhs.left;
4367 if (right != rhs.right)
4368 return right < rhs.right;
4369
4370 return false;
4371 }
4372 };
4373
4374 void capture_island_ends(ReadTable& it,
4375 RefSequenceTable& rt,
4376 vector<string>& segmap_fnames,
4377 std::set<Junction, skip_count_lt>& cov_juncs,
4378 map<uint32_t, vector<bool> >& coverage_map,
4379 size_t half_splice_mer_len)
4380 {
4381 //static int island_repeat_tolerance = 10;
4382 vector<RefSeg> expected_look_left_windows;
4383 vector<RefSeg> expected_look_right_windows;
4384
4385 // daehwan
4386 //#define DEBUG_CHECK_EXONS 1
4387 //#define DEBUG_RANGE_ONLY 1
4388
4389 #ifndef DEBUG_CHECK_EXONS
4390 build_coverage_map(it, rt, segmap_fnames, coverage_map);
4391 #else
4392 //build coverage map here, so we can debug it
4393 #ifdef DEBUG_RANGE_ONLY
4394 static const uint32_t chr14_id = rt.get_id("chr14");
4395 #endif
4396 vector<ReadInfo> hits;
4397 BAMHitFactory hit_factory(it,rt);
4398
4399 for (size_t f = 0; f < seg_files.size(); ++f)
4400 {
4401 fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f);
4402 seg_files[f]->rewind();
4403 FILE* fp = seg_files[f]->file;
4404 //rewind(fp);
4405 HitStream hs(fp, &hit_factory, false, false, false);
4406
4407 HitsForRead hit_group;
4408 while (hs.next_read_hits(hit_group))
4409 {
4410 for (size_t h = 0; h < hit_group.hits.size(); ++h)
4411 {
4412 BowtieHit& bh = hit_group.hits[h];
4413 // daehwan
4414 //if (check_exons) <-- DEBUG_CHECK_EXONS
4415 #ifdef DEBUG_RANGE_ONLY
4416 if (bh.ref_id() != chr14_id)
4417 continue;
4418
4419 // if (bh.left() < 66567028 && bh.right() > 66604392)
4420 if (bh.left() < 66400000 || bh.right() > 66700000)
4421 continue;
4422
4423 ReadInfo read_info;
4424 read_info.read_id = bh.insert_id();
4425 read_info.left = bh.left();
4426 read_info.right = bh.right();
4427 hits.push_back(read_info);
4428 #endif
4429
4430 pair<map<uint32_t, vector<bool> >::iterator, bool> ret =
4431 coverage_map.insert(make_pair(bh.ref_id(), vector<bool>()));
4432 vector<bool>& ref_cov = ret.first->second;
4433
4434 size_t right_extent = bh.right();
4435
4436 if (right_extent >= ref_cov.size())
4437 {
4438 ref_cov.resize(right_extent + 1, 0);
4439 }
4440 for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c)
4441 {
4442 ref_cov[c] = true;
4443 }
4444 }
4445 }
4446 }
4447
4448 sort(hits.begin(), hits.end());
4449 #endif
4450 // static const int min_cov_length = segment_length + 2;
4451 long covered_bases = 0;
4452 int long_enough_bases = 0;
4453 int left_looking = 0;
4454 int right_looking = 0;
4455 static const int extend = 45;
4456 static const int repeat_tol = 5;
4457
4458 int num_islands = 0;
4459
4460 for (map<uint32_t, vector<bool> >::iterator itr = coverage_map.begin();
4461 itr != coverage_map.end();
4462 ++itr)
4463 {
4464 #ifdef B_DEBUG
4465 fprintf (stderr, "Finding pairings in ref seq %s\n", rt.get_name(itr->first));
4466 #endif
4467 vector<bool>& cov = itr->second;
4468 vector<bool> long_enough(cov.size());
4469
4470 size_t last_uncovered = 0;
4471
4472 static const uint8_t min_cov = 1;
4473
4474 for (size_t c = 1; c < cov.size(); ++c)
4475 {
4476 uint8_t c_cov = cov[c]; //get_cov(cov, c);
4477 if (c_cov < min_cov || c == (cov.size()) - 1)
4478 {
4479 int putative_exon_length = (int)c - (int)last_uncovered;
4480 uint32_t last_pos_cov = cov[c - 1]; //get_cov(cov,c - 1);
4481 if (last_pos_cov >= min_cov && putative_exon_length >= min_cov_length)
4482 {
4483 #ifdef B_DEBUG
4484 fprintf(stderr, "cov. island: %d-%d\n", (int)(last_uncovered + 1), (int)c);
4485 fprintf(stderr, "\t(putative exon length = %d, min_cov_length=%d)\n",putative_exon_length, min_cov_length);
4486 #endif
4487 covered_bases += (c + 1 - last_uncovered);
4488 for (int l = (int)c; l > (int)last_uncovered; --l)
4489 {
4490 long_enough[l] = true;
4491 }
4492 }
4493 last_uncovered = c;
4494 }
4495 }
4496 vector<bool>& ref_cov = long_enough;
4497 vector<uint8_t> cov_state(ref_cov.size(), UNCOVERED);
4498
4499 // daehwan - print islands (exons)
4500 //if (check_exons)
4501 #ifdef DEBUG_CHECK_EXONS
4502 //{
4503 uint32_t left = 0, right = 0;
4504 for (size_t c = 1; c < ref_cov.size(); ++c)
4505 {
4506 if (ref_cov[c] && !ref_cov[c-1])
4507 {
4508 left = c;
4509 cout << "Exon: " << left << "-";
4510 }
4511 else if (!ref_cov[c] && ref_cov[c-1])
4512 {
4513 right = c - 1;
4514 cout << right << endl;
4515 for (size_t k = 0; k < hits.size(); ++k)
4516 {
4517 const ReadInfo& hit = hits[k];
4518 if (hit.right < left)
4519 continue;
4520
4521 if (hit.left > right)
4522 break;
4523
4524 cout << "\t" << hit.read_id << " " << hit.left << "-" << hit.right << endl;
4525 }
4526 }
4527 }
4528 //}
4529 #endif
4530 for (size_t c = 1; c < ref_cov.size(); ++c)
4531 {
4532 if (ref_cov[c])
4533 {
4534 long_enough_bases++;
4535 if (!ref_cov[c - 1])
4536 {
4537 num_islands += 1;
4538
4539 for (int r = (int)c - extend;
4540 r >= 0 && r < (int)c + repeat_tol && r < (int)cov_state.size();
4541 ++r)
4542 {
4543 cov_state[r] |= LOOK_LEFT;
4544 }
4545 }
4546 }
4547 else
4548 {
4549 if (ref_cov[c - 1])
4550 {
4551 for (int l = (int)c - repeat_tol;
4552 l >= 0 && l < (int)c + extend && l < (int)cov_state.size();
4553 ++l)
4554 {
4555 cov_state[l] |= LOOK_RIGHT;
4556 }
4557 }
4558 }
4559 }
4560
4561 RefSeg* curr_look_left = NULL;
4562 RefSeg* curr_look_right = NULL;
4563
4564 for (size_t c = 1; c < cov_state.size(); ++c)
4565 {
4566 if (cov_state[c] & LOOK_LEFT)
4567 {
4568 left_looking++;
4569 if (!(cov_state[c-1] & LOOK_LEFT))
4570 {
4571 expected_look_left_windows.push_back(RefSeg(itr->first,
4572 POINT_DIR_LEFT,
4573 false, /* not important */
4574 READ_DONTCARE,
4575 c,
4576 c + 1));
4577 curr_look_left = &(expected_look_left_windows.back());
4578 }
4579 else if (curr_look_left)
4580 {
4581 curr_look_left->right++;
4582 }
4583 }
4584 else
4585 {
4586 if ((cov_state[c-1] & LOOK_LEFT))
4587 {
4588 curr_look_left = NULL;
4589 }
4590 }
4591
4592 if (cov_state[c] & LOOK_RIGHT)
4593 {
4594 right_looking++;
4595 if (!(cov_state[c-1] & LOOK_RIGHT))
4596 {
4597 expected_look_right_windows.push_back(RefSeg(itr->first,
4598 POINT_DIR_RIGHT,
4599 false, /* not important */
4600 READ_DONTCARE,
4601 c,
4602 c + 1));
4603
4604 curr_look_right = &(expected_look_right_windows.back());
4605 }
4606 else if (curr_look_right)
4607 {
4608 curr_look_right->right++;
4609 }
4610 }
4611 else
4612 {
4613 if ((cov_state[c-1] & LOOK_RIGHT))
4614 {
4615 curr_look_right = NULL;
4616 }
4617 }
4618 }
4619 }
4620
4621 fprintf(stderr, " Map covers %ld bases\n", covered_bases);
4622 fprintf(stderr, " Map covers %d bases in sufficiently long segments\n", long_enough_bases);
4623 fprintf(stderr, " Map contains %d good islands\n", num_islands + 1);
4624 fprintf(stderr, " %d are left looking bases\n", left_looking);
4625 fprintf(stderr, " %d are right looking bases\n", right_looking);
4626
4627 vector<RefSeg> expected_don_acc_windows;
4628 expected_don_acc_windows.insert(expected_don_acc_windows.end(),
4629 expected_look_right_windows.begin(),
4630 expected_look_right_windows.end());
4631
4632 expected_don_acc_windows.insert(expected_don_acc_windows.end(),
4633 expected_look_left_windows.begin(),
4634 expected_look_left_windows.end());
4635
4636 if (!butterfly_search) coverage_map.clear(); //free some memory
4637 juncs_from_ref_segs<RecordExtendableJuncs>(rt,
4638 expected_don_acc_windows,
4639 cov_juncs,
4640 "GT",
4641 "AG",
4642 max_coverage_intron_length,
4643 min_coverage_intron_length,
4644 max_cov_juncs,
4645 true,
4646 half_splice_mer_len);
4647 //fprintf(stderr, "Found %ld potential island-end pairing junctions\n", (long int)cov_juncs.size());
4648 }
4649
4650 void print_juncs(RefSequenceTable& rt, std::set<Junction, skip_count_lt>& juncs, const char* str)
4651 {
4652 fprintf (stderr, "-- %s --\n", str);
4653 for(std::set<Junction>::iterator itr = juncs.begin();
4654 itr != juncs.end();
4655 ++itr)
4656 {
4657 const char* ref_name = rt.get_name(itr->refid);
4658
4659 fprintf(stderr, "%s\t%d\t%d\t%c\n",
4660 ref_name,
4661 itr->left,
4662 itr->right,
4663 itr->antisense ? '-' : '+');
4664 }
4665 fprintf (stderr, "-- done --\n");
4666 }
4667
4668 struct SegmentSearchWorker
4669 {
4670 void operator()()
4671 {
4672 ReadTable it;
4673
4674 ReadStream readstream(reads_fname);
4675 ReadStream readstream_for_segment_search(reads_fname);
4676 ReadStream readstream_for_indel_discovery(reads_fname);
4677 ReadStream readstream_for_fusion_discovery(reads_fname);
4678
4679 if (readstream.file() == NULL ||
4680 readstream_for_segment_search.file() == NULL ||
4681 readstream_for_indel_discovery.file() == NULL ||
4682 readstream_for_fusion_discovery.file() == NULL)
4683 {
4684 fprintf(stderr, "Error: cannot open %s for reading\n",
4685 reads_fname.c_str());
4686 exit(1);
4687 }
4688
4689 if (read_offset > 0)
4690 {
4691 readstream.seek(read_offset);
4692 readstream_for_segment_search.seek(read_offset);
4693 readstream_for_indel_discovery.seek(read_offset);
4694 readstream_for_fusion_discovery.seek(read_offset);
4695 }
4696
4697 vector<BAMHitFactory*> hit_factories;
4698 hit_factories.push_back(new BAMHitFactory(it, *rt));
4699 HitStream partner_hit_stream_for_segment_search(partner_reads_map_fname, hit_factories.back(), false, false, false);
4700 hit_factories.push_back(new BAMHitFactory(it, *rt));
4701 HitStream partner_hit_stream_for_fusion_discovery(partner_reads_map_fname, hit_factories.back(), false, false, false);
4702 if (partner_hit_offset > 0)
4703 {
4704 partner_hit_stream_for_segment_search.seek(partner_hit_offset);
4705 partner_hit_stream_for_fusion_discovery.seek(partner_hit_offset);
4706 }
4707
4708 hit_factories.push_back(new BAMHitFactory(it, *rt));
4709 HitStream seg_partner_hit_stream_for_segment_search(seg_partner_reads_map_fname, hit_factories.back(), false, false, false);
4710 hit_factories.push_back(new BAMHitFactory(it, *rt));
4711 HitStream seg_partner_hit_stream_for_fusion_discovery(seg_partner_reads_map_fname, hit_factories.back(), false, false, false);
4712 if (seg_partner_hit_offset > 0)
4713 {
4714 seg_partner_hit_stream_for_segment_search.seek(seg_partner_hit_offset);
4715 seg_partner_hit_stream_for_fusion_discovery.seek(seg_partner_hit_offset);
4716 }
4717
4718 vector<HitStream> hit_streams;
4719 for (size_t i = 0; i < segmap_fnames->size(); ++i)
4720 {
4721 hit_factories.push_back(new BAMHitFactory(it, *rt));
4722 HitStream hs((*segmap_fnames)[i], hit_factories.back(), false, false, false);
4723 if (seg_offsets[i] > 0)
4724 hs.seek(seg_offsets[i]);
4725
4726 hit_streams.push_back(hs);
4727 }
4728
4729 int num_group = 0;
4730 uint64_t read_id = 0, last_read_id = 0;
4731 while ((read_id = process_next_hit_group(*rt,
4732 readstream,
4733 readstream_for_segment_search,
4734 readstream_for_indel_discovery,
4735 readstream_for_fusion_discovery,
4736 it,
4737 hit_streams,
4738 hit_streams.size() - 1,
4739 partner_hit_stream_for_segment_search,
4740 seg_partner_hit_stream_for_segment_search,
4741 partner_hit_stream_for_fusion_discovery,
4742 seg_partner_hit_stream_for_fusion_discovery,
4743 *juncs, *deletions, *insertions, *fusions,
4744 read,
4745 begin_id, end_id)) != 0)
4746 {
4747 num_group++;
4748 //if (num_group % 500000 == 0)
4749 // fprintf(stderr, "\tProcessed %d root segment groups\n", num_group);
4750
4751 last_read_id = read_id;
4752 }
4753
4754 // "microaligned_segs" is not protected against multi-threading
4755 // fprintf(stderr, "Microaligned %d segments\n", microaligned_segs);
4756
4757 for (size_t i = 0; i < hit_factories.size(); ++i)
4758 delete hit_factories[i];
4759
4760 hit_factories.clear();
4761 }
4762
4763 RefSequenceTable* rt;
4764 string reads_fname;
4765 vector<string>* segmap_fnames;
4766 string partner_reads_map_fname;
4767 string seg_partner_reads_map_fname;
4768 std::set<Junction, skip_count_lt>* juncs;
4769 std::set<Deletion>* deletions;
4770 std::set<Insertion>* insertions;
4771 FusionSimpleSet* fusions;
4772 eREAD read;
4773
4774 uint64_t begin_id;
4775 uint64_t end_id;
4776 int64_t read_offset;
4777 vector<int64_t> seg_offsets;
4778
4779 int64_t partner_hit_offset;
4780 int64_t seg_partner_hit_offset;
4781 };
4782
4783 struct SpliceJunctionCoord
4784 {
4785 uint32_t refid;
4786 int coord;
4787
4788 SpliceJunctionCoord(uint32_t r, int c) :
4789 refid(r),
4790 coord(c)
4791 {}
4792
4793 bool operator< (const SpliceJunctionCoord& r) const
4794 {
4795 if (refid < r.refid)
4796 return true;
4797 else if (refid == r.refid && coord < r.coord)
4798 return true;
4799 else
4800 return false;
4801 }
4802 };
4803
4804 void driver(istream& ref_stream,
4805 FILE* juncs_out,
4806 FILE* insertions_out,
4807 FILE* deletions_out,
4808 FILE* fusions_out,
4809 string& left_reads_fname,
4810 string& left_reads_map_fname,
4811 vector<string>& left_segmap_fnames,
4812 string& right_reads_fname,
4813 string& right_reads_map_fname,
4814 vector<string>& right_segmap_fnames)
4815 {
4816 if (!parallel)
4817 num_threads = 1;
4818
4819 // turn off parallelization in case of the following search methods
4820 if (!no_coverage_search || !no_microexon_search || butterfly_search)
4821 num_threads = 1;
4822 //fprintf(stderr, ">>>>>>>>>> num_threads = %d\n",num_threads);
4823 assert (num_threads > 0);
4824 if (left_segmap_fnames.size() == 0)
4825 {
4826 fprintf(stderr, "No hits to process, exiting\n");
4827 exit(0);
4828 }
4829
4830 std::set<Junction, skip_count_lt> vseg_juncs[num_threads];
4831 std::set<Junction, skip_count_lt> cov_juncs;
4832 std::set<Junction, skip_count_lt> butterfly_juncs;
4833
4834 std::set<Junction> juncs;
4835
4836 std::set<Deletion> vdeletions[num_threads];
4837 std::set<Insertion> vinsertions[num_threads];
4838 FusionSimpleSet vfusions[num_threads];
4839
4840 RefSequenceTable rt(sam_header, true);
4841
4842 fprintf (stderr, "Loading reference sequences...\n");
4843 get_seqs(ref_stream, rt, true);
4844
4845 string left_seg_fname_for_segment_search = left_segmap_fnames.back();
4846 string right_seg_fname_for_segment_search;
4847 if (right_segmap_fnames.size() > 0)
4848 right_seg_fname_for_segment_search = right_segmap_fnames.back();
4849
4850 fprintf(stderr, ">> Performing segment-search:\n");
4851
4852 if (left_segmap_fnames.size() > 1)
4853 {
4854 fprintf( stderr, "Loading left segment hits...\n");
4855
4856 vector<uint64_t> read_ids;
4857 vector<vector<int64_t> > offsets;
4858 vector<int64_t> partner_offsets;
4859 vector<int64_t> seg_partner_offsets;
4860 if (num_threads > 1)
4861 {
4862 vector<string> fnames;
4863 fnames.push_back(left_reads_fname);
4864 fnames.insert(fnames.end(), left_segmap_fnames.begin(), left_segmap_fnames.end());
4865 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
4866 if (!enough_data)
4867 num_threads = 1;
4868
4869 if (enough_data && right_reads_map_fname != "")
4870 calculate_offsets_from_ids(right_reads_map_fname, read_ids, partner_offsets);
4871
4872 if (enough_data && right_seg_fname_for_segment_search != "")
4873 calculate_offsets_from_ids(right_seg_fname_for_segment_search, read_ids, seg_partner_offsets);
4874 }
4875
4876 vector<boost::thread*> threads;
4877 for (int i = 0; i < num_threads; ++i)
4878 {
4879 SegmentSearchWorker worker;
4880 worker.rt = &rt;
4881 worker.reads_fname = left_reads_fname;
4882 worker.segmap_fnames = &left_segmap_fnames;
4883 worker.partner_reads_map_fname = right_reads_map_fname;
4884 worker.seg_partner_reads_map_fname = right_seg_fname_for_segment_search;
4885 worker.juncs = &vseg_juncs[i];
4886 worker.deletions = &vdeletions[i];
4887 worker.insertions = &vinsertions[i];
4888 worker.fusions = &vfusions[i];
4889 worker.read = READ_LEFT;
4890 worker.partner_hit_offset = 0;
4891 worker.seg_partner_hit_offset = 0;
4892
4893 if (i == 0)
4894 {
4895 worker.begin_id = 0;
4896 worker.seg_offsets = vector<int64_t>(left_segmap_fnames.size(), 0);
4897 worker.read_offset = 0;
4898 }
4899 else
4900 {
4901 worker.begin_id = read_ids[i-1];
4902 worker.seg_offsets.insert(worker.seg_offsets.end(), offsets[i-1].begin()+1, offsets[i-1].end());
4903 worker.read_offset = offsets[i-1][0];
4904 if (partner_offsets.size() > 0)
4905 worker.partner_hit_offset = partner_offsets[i-1];
4906 if (seg_partner_offsets.size() > 0)
4907 worker.seg_partner_hit_offset = seg_partner_offsets[i-1];
4908 }
4909
4910 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
4911 //Geo debug:
4912 //fprintf(stderr, "Worker %d: begin_id=%lu, end_id=%lu\n", i, worker.begin_id, worker.end_id);
4913
4914 if (num_threads > 1)
4915 threads.push_back(new boost::thread(worker));
4916 else
4917 worker();
4918 }
4919
4920 for (size_t i = 0; i < threads.size(); ++i)
4921 {
4922 threads[i]->join();
4923 delete threads[i];
4924 threads[i] = NULL;
4925 }
4926 threads.clear();
4927
4928 fprintf( stderr, "done.\n");
4929 }
4930
4931 if (right_segmap_fnames.size() > 1)
4932 {
4933 fprintf( stderr, "Loading right segment hits...\n");
4934
4935 vector<uint64_t> read_ids;
4936 vector<vector<int64_t> > offsets;
4937 vector<int64_t> partner_offsets;
4938 vector<int64_t> seg_partner_offsets;
4939 if (num_threads > 1)
4940 {
4941 vector<string> fnames;
4942 fnames.push_back(right_reads_fname);
4943 fnames.insert(fnames.end(), right_segmap_fnames.begin(), right_segmap_fnames.end());
4944 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
4945 if (!enough_data)
4946 num_threads = 1;
4947
4948 if (enough_data)
4949 calculate_offsets_from_ids(left_reads_map_fname, read_ids, partner_offsets);
4950
4951 if (enough_data)
4952 calculate_offsets_from_ids(left_seg_fname_for_segment_search, read_ids, seg_partner_offsets);
4953 }
4954
4955 vector<boost::thread*> threads;
4956 for (int i = 0; i < num_threads; ++i)
4957 {
4958 SegmentSearchWorker worker;
4959 worker.rt = &rt;
4960 worker.reads_fname = right_reads_fname;
4961 worker.segmap_fnames = &right_segmap_fnames;
4962 worker.partner_reads_map_fname = left_reads_map_fname;
4963 worker.seg_partner_reads_map_fname = left_seg_fname_for_segment_search;
4964 worker.juncs = &vseg_juncs[i];
4965 worker.deletions = &vdeletions[i];
4966 worker.insertions = &vinsertions[i];
4967 worker.fusions = &vfusions[i];
4968 worker.read = READ_RIGHT;
4969 worker.partner_hit_offset = 0;
4970 worker.seg_partner_hit_offset = 0;
4971
4972 if (i == 0)
4973 {
4974 worker.begin_id = 0;
4975 worker.seg_offsets = vector<int64_t>(left_segmap_fnames.size(), 0);
4976 worker.read_offset = 0;
4977 }
4978 else
4979 {
4980 worker.begin_id = read_ids[i-1];
4981 worker.seg_offsets.insert(worker.seg_offsets.end(), offsets[i-1].begin() + 1, offsets[i-1].end());
4982 worker.read_offset = offsets[i-1][0];
4983 if (partner_offsets.size() > 0)
4984 worker.partner_hit_offset = partner_offsets[i-1];
4985 if (seg_partner_offsets.size() > 0)
4986 worker.seg_partner_hit_offset = seg_partner_offsets[i-1];
4987 }
4988
4989 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
4990
4991 if (num_threads > 1)
4992 threads.push_back(new boost::thread(worker));
4993 else
4994 worker();
4995 }
4996
4997 for (size_t i = 0; i < threads.size(); ++i)
4998 {
4999 threads[i]->join();
5000 delete threads[i];
5001 threads[i] = NULL;
5002 }
5003 threads.clear();
5004 fprintf( stderr, "done.\n");
5005 }
5006
5007 std::set<Junction, skip_count_lt> seg_juncs;
5008 std::set<Deletion> deletions;
5009 std::set<Insertion> insertions;
5010
5011 for (int i = 0; i < num_threads; ++i)
5012 {
5013 seg_juncs.insert(vseg_juncs[i].begin(), vseg_juncs[i].end());
5014 deletions.insert(vdeletions[i].begin(), vdeletions[i].end());
5015 insertions.insert(vinsertions[i].begin(), vinsertions[i].end());
5016 }
5017
5018 FusionSimpleSet fusions = vfusions[0];
5019 for (int i = 1; i < num_threads; ++i)
5020 {
5021 merge_with(fusions, vfusions[i]);
5022 }
5023
5024
5025 fprintf(stderr, "\tfound %ld potential split-segment junctions\n", (long int)seg_juncs.size());
5026 fprintf(stderr, "\tfound %ld potential small deletions\n", (long int)deletions.size());
5027 fprintf(stderr, "\tfound %ld potential small insertions\n", (long int)insertions.size());
5028
5029 vector<string> all_segmap_fnames;
5030 for (vector<string>::size_type i = 0; i != left_segmap_fnames.size();i++) {
5031 all_segmap_fnames.push_back( left_segmap_fnames[i] );
5032 }
5033 for (vector<string>::size_type i = 0; i != right_segmap_fnames.size();i++) {
5034 all_segmap_fnames.push_back( right_segmap_fnames[i] );
5035 }
5036
5037 #if 0
5038 // daehwan - check this out as Cole insists on using segments gives better results.
5039 vector<FZPipe*> all_map_files;
5040 if (left_seg_files.size() > 1)
5041 {
5042 all_map_files.push_back(&left_reads_map_file);
5043 }
5044
5045 if (right_seg_files.size() > 1)
5046 {
5047 all_map_files.push_back(&right_reads_map_file);
5048 }
5049
5050 copy(all_seg_files.begin(), all_seg_files.end(), back_inserter(all_map_files));
5051 #endif
5052
5053 ReadTable it;
5054 map<uint32_t, vector<bool> > coverage_map;
5055 if (!no_coverage_search || butterfly_search)
5056 {
5057 if (ium_reads != "")
5058 {
5059 vector<string> ium_read_files;
5060 tokenize(ium_reads,",", ium_read_files);
5061 /*
5062 vector<FZPipe> iums;
5063 string unzcmd=getUnpackCmd(ium_read_files[0],false); //could be BAM file
5064 for (size_t ium = 0; ium < ium_read_files.size(); ++ium)
5065 {
5066 //fprintf (stderr, "Indexing extensions in %s\n", ium_read_files[ium].c_str());
5067 FZPipe ium_file(ium_read_files[ium],unzcmd);
5068 if (ium_file.file==NULL)
5069 {
5070 fprintf (stderr, "Can't open file %s for reading, skipping...\n",ium_read_files[ium].c_str());
5071 continue;
5072 }
5073 iums.push_back(ium_file);
5074 }
5075 */
5076 index_read_mers(ium_read_files, 5);
5077 }
5078 else
5079 { //no unmapped reads
5080 no_coverage_search = true;
5081 butterfly_search = false;
5082 }
5083
5084 if (!no_coverage_search)
5085 { //coverage search
5086 // looking for junctions by island end pairings
5087 fprintf(stderr, ">> Performing coverage-search:\n");
5088 capture_island_ends(it,
5089 rt,
5090 all_segmap_fnames,
5091 cov_juncs,
5092 coverage_map,
5093 5);
5094 fprintf(stderr, "\tfound %d potential junctions\n",(int)cov_juncs.size());
5095 }
5096 } //coverage search or butterfly search
5097
5098 if (butterfly_search)
5099 {
5100 //looking for junctions between and within islands
5101 fprintf(stderr, ">> Performing butterfly-search: \n");
5102 prune_extension_table(butterfly_overhang);
5103 compact_extension_table();
5104 pair_covered_sites(it,
5105 rt,
5106 all_segmap_fnames,
5107 butterfly_juncs,
5108 coverage_map,
5109 5);
5110
5111 fprintf(stderr, "\tfound %d potential junctions\n",(int)butterfly_juncs.size());
5112 }
5113
5114 coverage_map.clear();
5115
5116 std::set<Junction, skip_count_lt> microexon_juncs;
5117 if (!no_microexon_search)
5118 {
5119 fprintf(stderr, ">> Performing microexon-search: \n");
5120 std::set<Junction, skip_count_lt> microexon_juncs;
5121 align_microexon_segs(rt,
5122 microexon_juncs,
5123 max_cov_juncs,
5124 5);
5125 fprintf(stderr, "\tfound %d potential junctions\n",(int)microexon_juncs.size());
5126 juncs.insert(microexon_juncs.begin(), microexon_juncs.end());
5127 }
5128
5129 juncs.insert(cov_juncs.begin(), cov_juncs.end());
5130 juncs.insert(seg_juncs.begin(), seg_juncs.end());
5131 juncs.insert(butterfly_juncs.begin(), butterfly_juncs.end());
5132
5133 //fprintf(stderr, "Reporting potential splice junctions...");
5134 vector<SpliceJunctionCoord> splice_junction_coords;
5135 for(std::set<Junction>::iterator itr = juncs.begin();
5136 itr != juncs.end();
5137 ++itr)
5138 {
5139 const char* ref_name = rt.get_name(itr->refid);
5140
5141 fprintf(juncs_out,
5142 "%s\t%d\t%d\t%c\n",
5143 ref_name,
5144 itr->left,
5145 itr->right,
5146 itr->antisense ? '-' : '+');
5147
5148 if (fusion_search)
5149 {
5150 splice_junction_coords.push_back(SpliceJunctionCoord(itr->refid, itr->left));
5151 splice_junction_coords.push_back(SpliceJunctionCoord(itr->refid, itr->right));
5152 }
5153 }
5154 //close all reading pipes, just to exit cleanly
5155 /*
5156 for (vector<FZPipe*>::size_type i = 0; i != all_segmap_fnames.size();i++) {
5157 all_segmap_fnames[i]->close();
5158 }
5159 */
5160 fprintf(stderr, "Reported %d total potential splices\n", (int)juncs.size());
5161 sort(splice_junction_coords.begin(), splice_junction_coords.end());
5162
5163 fprintf(stderr, "Reporting %u potential deletions...\n", deletions.size());
5164 if(deletions_out){
5165 for(std::set<Deletion>::iterator itr = deletions.begin(); itr != deletions.end(); ++itr){
5166 const char* ref_name = rt.get_name(itr->refid);
5167 /*
5168 * We fix up the left co-ordinate to reference the first deleted base
5169 */
5170 fprintf(deletions_out,
5171 "%s\t%d\t%d\n",
5172 ref_name,
5173 itr->left + 1,
5174 itr->right);
5175 }
5176 fclose(deletions_out);
5177 }else{
5178 fprintf(stderr, "Failed to open deletions file for writing, no deletions reported\n");
5179 }
5180
5181 fprintf(stderr, "Reporting %u potential insertions...\n", insertions.size());
5182 if(insertions_out){
5183 for(std::set<Insertion>::iterator itr = insertions.begin(); itr != insertions.end(); ++itr){
5184 const char* ref_name = rt.get_name(itr->refid);
5185 fprintf(insertions_out,
5186 "%s\t%d\t%d\t%s\n",
5187 ref_name,
5188 itr->left,
5189 itr->left,
5190 itr->sequence.c_str());
5191 }
5192 fclose(insertions_out);
5193 }else{
5194 fprintf(stderr, "Failed to open insertions file for writing, no insertions reported\n");
5195 }
5196 if (fusions_out)
5197 {
5198 // check if a fusion point coincides with splice junctions.
5199 for(FusionSimpleSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr)
5200 {
5201 const Fusion& fusion = itr->first;
5202 FusionSimpleStat& fusion_stat = itr->second;
5203
5204 bool found = binary_search(splice_junction_coords.begin(),
5205 splice_junction_coords.end(),
5206 SpliceJunctionCoord(fusion.refid1, fusion.left));
5207 if (found)
5208 fusion_stat.left_coincide_with_splice_junction = true;
5209
5210 found = binary_search(splice_junction_coords.begin(),
5211 splice_junction_coords.end(),
5212 SpliceJunctionCoord(fusion.refid2, fusion.right));
5213
5214 if (found)
5215 fusion_stat.right_coincide_with_splice_junction = true;
5216 }
5217
5218 for(FusionSimpleSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr)
5219 {
5220 const Fusion& fusion = itr->first;
5221 const FusionSimpleStat& fusion_stat = itr->second;
5222
5223 // compare the current fusion with the next fusion, pick up the better one.
5224 FusionSimpleSet::iterator next_itr = itr; ++next_itr;
5225 while (next_itr != fusions.end())
5226 {
5227 const Fusion& next_fusion = next_itr->first;
5228 const FusionSimpleStat& next_fusion_stat = next_itr->second;
5229
5230 uint32_t left_diff = abs((int)fusion.left - (int)next_fusion.left);
5231 if (fusion.refid1 == next_fusion.refid1 && fusion.refid2 == next_fusion.refid2 && left_diff < 10)
5232 {
5233 if (fusion.dir == next_fusion.dir && left_diff == abs((int)fusion.right - (int)next_fusion.right))
5234 {
5235 if (next_fusion_stat.count > fusion_stat.count)
5236 itr->second.skip = true;
5237 else if (next_fusion_stat.count == fusion_stat.count)
5238 {
5239 int curr_count = (int)fusion_stat.left_coincide_with_splice_junction + (int)fusion_stat.right_coincide_with_splice_junction;
5240 int next_count = (int)next_fusion_stat.left_coincide_with_splice_junction + (int)next_fusion_stat.right_coincide_with_splice_junction;
5241
5242 if (curr_count < next_count)
5243 itr->second.skip = true;
5244 else
5245 next_itr->second.skip = true;
5246 }
5247 else
5248 next_itr->second.skip = true;
5249 }
5250
5251 ++next_itr;
5252 }
5253 else
5254 break;
5255 }
5256
5257 if (itr->second.skip && !fusion_do_not_resolve_conflicts)
5258 continue;
5259
5260 const char* ref_name1 = rt.get_name(fusion.refid1);
5261 const char* ref_name2 = rt.get_name(fusion.refid2);
5262
5263 const char* dir = "";
5264 if (fusion.dir == FUSION_FR)
5265 dir = "fr";
5266 else if(fusion.dir == FUSION_RF)
5267 dir = "rf";
5268 else if(fusion.dir == FUSION_RR)
5269 dir = "rr";
5270 else
5271 dir = "ff";
5272
5273 fprintf(fusions_out,
5274 "%s\t%d\t%s\t%d\t%s\n",
5275 ref_name1,
5276 fusion.left,
5277 ref_name2,
5278 fusion.right,
5279 dir);
5280 }
5281 fclose(fusions_out);
5282 }
5283 fprintf(stderr, "Reporting potential fusions...\n");
5284 }
5285
5286 int main(int argc, char** argv)
5287 {
5288 fprintf(stderr, "segment_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
5289 fprintf(stderr, "---------------------------\n");
5290
5291 int parse_ret = parse_options(argc, argv, print_usage);
5292 if (parse_ret)
5293 return parse_ret;
5294
5295 if(optind >= argc)
5296 {
5297 print_usage();
5298 return 1;
5299 }
5300
5301 string ref_file_name = argv[optind++];
5302
5303 if(optind >= argc)
5304 {
5305 print_usage();
5306 return 1;
5307 }
5308
5309 string juncs_file_name = argv[optind++];
5310
5311 if(optind >= argc)
5312 {
5313 print_usage();
5314 return 1;
5315 }
5316
5317 string insertions_file_name = argv[optind++];
5318 if(optind >= argc)
5319 {
5320 print_usage();
5321 return 1;
5322 }
5323
5324 string deletions_file_name = argv[optind++];
5325 if(optind >= argc)
5326 {
5327 print_usage();
5328 return 1;
5329 }
5330
5331 string fusions_file_name = argv[optind++];
5332 if(optind >= argc)
5333 {
5334 print_usage();
5335 return 1;
5336 }
5337
5338 string left_reads_file_name = argv[optind++];
5339
5340 if(optind >= argc)
5341 {
5342 print_usage();
5343 return 1;
5344 }
5345
5346 string left_reads_map_file_name = argv[optind++];
5347
5348 if(optind >= argc)
5349 {
5350 print_usage();
5351 return 1;
5352 }
5353
5354 string left_segment_map_file_list = argv[optind++];
5355
5356 string right_segment_map_file_list;
5357 string right_reads_file_name;
5358 string right_reads_map_file_name;
5359 if (optind < argc)
5360 {
5361 right_reads_file_name = argv[optind++];
5362
5363 if(optind >= argc)
5364 {
5365 print_usage();
5366 return 1;
5367 }
5368
5369 right_reads_map_file_name = argv[optind++];
5370
5371 if(optind >= argc)
5372 {
5373 print_usage();
5374 return 1;
5375 }
5376 right_segment_map_file_list = argv[optind++];
5377 }
5378
5379 // Open the approppriate files
5380
5381 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
5382 if (!ref_stream.good())
5383 {
5384 fprintf(stderr, "Error: cannot open %s for reading\n",
5385 ref_file_name.c_str());
5386 exit(1);
5387 }
5388
5389
5390 FILE* juncs_file = fopen(juncs_file_name.c_str(), "w");
5391 if (!juncs_file)
5392 {
5393 fprintf(stderr, "Error: cannot open %s for writing\n",
5394 juncs_file_name.c_str());
5395 exit(1);
5396 }
5397
5398 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
5399 if (!insertions_file)
5400 {
5401 fprintf(stderr, "Error: cannot open %s for writing\n",
5402 insertions_file_name.c_str());
5403 exit(1);
5404 }
5405
5406
5407 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
5408 if (!deletions_file)
5409 {
5410 fprintf(stderr, "Error: cannot open %s for writing\n",
5411 deletions_file_name.c_str());
5412 exit(1);
5413 }
5414
5415 vector<string> left_segment_map_fnames;
5416 string left_segment_file_for_segment_search;
5417 tokenize(left_segment_map_file_list, ",",left_segment_map_fnames);
5418
5419 FILE* fusions_file = fopen(fusions_file_name.c_str(), "w");
5420 if (!fusions_file)
5421 {
5422 fprintf(stderr, "Error: cannot open %s for writing\n",
5423 fusions_file_name.c_str());
5424 exit(1);
5425 }
5426
5427 //FILE* left_reads_file = fopen(left_reads_file_name.c_str(), "r");
5428 //FILE* left_reads_file_for_indel_discovery = fopen(left_reads_file_name.c_str(),"r");
5429 string unzcmd=getUnpackCmd(left_reads_file_name, false);
5430 FZPipe left_reads_file(left_reads_file_name, unzcmd);
5431 FZPipe left_reads_file_for_segment_search(left_reads_file_name, unzcmd);
5432 FZPipe left_reads_file_for_indel_discovery(left_reads_file_name, unzcmd);
5433 FILE* left_reads_file_for_fusion_discovery = fopen(left_reads_file_name.c_str(),"r");
5434 if (left_reads_file.file==NULL || left_reads_file_for_segment_search.file==NULL ||
5435 left_reads_file_for_indel_discovery.file==NULL || left_reads_file_for_fusion_discovery==NULL)
5436 {
5437 fprintf(stderr, "Error: cannot open %s for reading\n",
5438 left_reads_file_name.c_str());
5439 exit(1);
5440 }
5441
5442 vector<string> right_segment_map_fnames;
5443 string right_segment_file_for_segment_search;
5444 if (right_segment_map_file_list != "")
5445 {
5446 tokenize(right_segment_map_file_list, ",", right_segment_map_fnames);
5447 }
5448
5449 // min_cov_length=20;
5450 if (min_cov_length>segment_length-2) min_cov_length=segment_length-2;
5451 driver(ref_stream,
5452 juncs_file,
5453 insertions_file,
5454 deletions_file,
5455 fusions_file,
5456 left_reads_file_name,
5457 left_reads_map_file_name,
5458 left_segment_map_fnames,
5459 right_reads_file_name,
5460 right_reads_map_file_name,
5461 right_segment_map_fnames);
5462
5463 return 0;
5464 }