ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/segment_juncs.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (13 years, 1 month ago) by gpertea
File size: 119886 byte(s)
Log Message:
adding tophat source work

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * segment_juncs.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 //#include <stdexcept>
22 #include <iostream>
23 #include <fstream>
24 #include <sstream>
25 #include <cstring>
26 #include <bitset>
27 #include <seqan/sequence.h>
28 #include <seqan/find.h>
29 #include <seqan/file.h>
30 #include <seqan/modifier.h>
31 #include <getopt.h>
32
33 #include "common.h"
34 #include "bwt_map.h"
35 #include "tokenize.h"
36 #include "segments.h"
37 #include "reads.h"
38 #include "junctions.h"
39 #include "insertions.h"
40 #include "deletions.h"
41
42 using namespace seqan;
43 using namespace std;
44 using namespace __gnu_cxx;
45
46 // daehwan
47 bool bDebug = false;
48
49 void print_usage()
50 {
51 fprintf(stderr, "Usage: segment_juncs <ref.fa> <segment.juncs> <segment.insertions> <segment.deletions> <left_reads.fq> <left_seg1.bwtout,...,segN.bwtout> [right_reads.fq right_seg1.bwtout,...,right_segN.bwtout]\n");
52 }
53
54 // This is the maximum number of bowtie mismatches allower per segment hit
55 static const int num_bowtie_mismatches = 2;
56
57 static const int max_cov_juncs = 5000000;
58 static const int max_seg_juncs = 10000000;
59 int max_microexon_stretch = 2000;
60 int butterfly_overhang = 6;
61
62 void get_seqs(istream& ref_stream,
63 RefSequenceTable& rt,
64 bool keep_seqs = true,
65 bool strip_slash = false)
66 {
67 while(ref_stream.good() &&
68 !ref_stream.eof())
69 {
70 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
71 string name;
72 readMeta(ref_stream, name, Fasta());
73 string::size_type space_pos = name.find_first_of(" \t\r");
74 if (space_pos != string::npos)
75 {
76 name.resize(space_pos);
77 }
78 fprintf(stderr, "\tLoading %s...", name.c_str());
79 seqan::read(ref_stream, *ref_str, Fasta());
80 fprintf(stderr, "done\n");
81 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
82 if (!keep_seqs)
83 delete ref_str;
84 }
85 }
86
87
88 RefSeg seg_from_bowtie_hit(const BowtieHit& T)
89 {
90 RefSeg r_seg(T.ref_id(), POINT_DIR_DONTCARE, T.antisense_align(), READ_DONTCARE, 0, 0);
91
92 if (T.antisense_align())
93 {
94 r_seg.left = max(0, T.right() - 2);
95 r_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches
96 r_seg.points_where = POINT_DIR_RIGHT;
97 }
98 else
99 {
100 r_seg.left = max(0, T.left() - (T.right() - T.left() + 1));
101 r_seg.right = T.left() + 2; // num allowed bowtie mismatches
102 r_seg.points_where = POINT_DIR_LEFT;
103 }
104
105 return r_seg;
106 }
107
108 pair<RefSeg, RefSeg> segs_from_bowtie_hits(const BowtieHit& T,
109 const BowtieHit& H)
110 {
111 pair<RefSeg, RefSeg> seg_pair;
112 if (H.antisense_align() == false &&
113 abs((H.right() + 1) - T.left()) < (int)max_segment_intron_length)
114 {
115 RefSeg left_seg(H.ref_id(), POINT_DIR_RIGHT, H.antisense_align(), READ_DONTCARE, 0, 0);
116 left_seg.left = max(0, H.right() - 2);
117 left_seg.right = H.right() + (H.right() - H.left() + 1); // num allowed bowtie mismatches
118
119 RefSeg right_seg(T.ref_id(), POINT_DIR_LEFT, T.antisense_align(), READ_DONTCARE, 0, 0);
120 right_seg.left = max(0, T.left() - (T.right() - T.left() + 1));
121 right_seg.right = T.left() + 2; // num allowed bowtie mismatches
122
123 seg_pair = make_pair(left_seg, right_seg);
124 }
125 else if (H.antisense_align() == true &&
126 abs((T.right() + 1) - H.left()) < (int)max_segment_intron_length)
127 {
128 RefSeg left_seg(T.ref_id(), POINT_DIR_RIGHT, T.antisense_align(), READ_DONTCARE, 0, 0);
129 left_seg.left = max(0, T.right() - 2);
130 left_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches
131
132 RefSeg right_seg(H.ref_id(), POINT_DIR_LEFT, H.antisense_align(), READ_DONTCARE, 0, 0);
133 right_seg.left = max(0, H.left() - (H.right() - H.left() + 1));
134 right_seg.right = H.left() + 2; // num allowed bowtie mismatches
135
136 seg_pair = make_pair(left_seg, right_seg);
137 }
138 return seg_pair;
139 }
140
141 //static const size_t half_splice_mer_len = 6;
142 //static const size_t splice_mer_len = 2 * half_splice_mer_len;
143
144 struct MerExtension
145 {
146 static const int MAX_EXTENSION_BP = 14;
147 uint32_t left_dna_str : 28; // up to 14bp encoded in 2-bits-per-base
148 uint8_t left_ext_len : 4; // how many bases in this extension
149
150 uint32_t right_dna_str : 28; // up to 14bp encoded in 2-bits-per-base
151 uint8_t right_ext_len : 4; // how many bases in this extension
152
153 MerExtension() : left_dna_str(0), left_ext_len(0), right_dna_str(0), right_ext_len(0) {}
154
155 bool operator<(const MerExtension& rhs) const
156 {
157 if (left_dna_str != rhs.left_dna_str)
158 return left_dna_str < rhs.left_dna_str;
159 if (left_ext_len != rhs.left_ext_len)
160 return left_ext_len < rhs.left_ext_len;
161 if (right_dna_str != rhs.right_dna_str)
162 return right_dna_str < rhs.right_dna_str;
163 if (right_ext_len != rhs.right_ext_len)
164 return right_ext_len < rhs.right_ext_len;
165 return false;
166 }
167
168 bool operator==(const MerExtension& rhs) const
169 {
170 bool eq = left_dna_str == rhs.left_dna_str &&
171 right_dna_str == rhs.right_dna_str &&
172 left_ext_len == rhs.left_ext_len &&
173 right_ext_len == rhs.right_ext_len;
174
175 return eq;
176 }
177 };
178
179
180 /// For converting from ASCII to the Dna5 code where A=0, C=1, G=2,
181 /// T=3, N=4
182 uint8_t charToDna5[] = {
183 /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
184 /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
185 /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
186 /* 48 */ 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187 /* 64 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
188 /* A C G N */
189 /* 80 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 /* T */
191 /* 96 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
192 /* a c g n */
193 /* 112 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
194 /* t */
195 /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
196 /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
198 /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
203 };
204
205
206 typedef vector<vector<MerExtension> > MerExtensionTable;
207 typedef vector<uint32_t> MerExtensionCounts;
208
209 MerExtensionTable extensions;
210 MerExtensionCounts extension_counts;
211
212 uint64_t dna5str_to_idx(const string& str)
213 {
214 uint64_t idx = 0;
215
216 for (size_t i = 0; i < str.length(); ++i)
217 {
218 idx <<=2;
219 char c = toupper(str[i]);
220 idx |= (0x3 & charToDna5[(size_t)c]);
221 }
222 return idx;
223 }
224
225 uint64_t colorstr_to_idx(const string& str)
226 {
227 uint64_t idx = 0;
228
229 for (size_t i = 0; i < str.length(); ++i)
230 {
231 idx <<=2;
232 char c = str[i];
233 idx |= (0x3 & charToDna5[(size_t)c]);
234 }
235 return idx;
236 }
237
238 void store_read_extensions(MerExtensionTable& ext_table,
239 int seq_key_len,
240 int min_ext_len,
241 const string& seq,
242 bool use_precount_table)
243 {
244 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
245 // this read.
246
247 uint64_t seed = 0;
248 bitset<256> left = 0;
249 bitset<256> right = 0;
250 const char* p = seq.c_str();
251
252 unsigned int seq_len = (int)seq.length();
253 const char* seq_end = p + seq_len;
254
255 // Build the first seed
256 while (p < seq.c_str() + (2 * seq_key_len))
257 {
258 seed <<= 2;
259 seed |= (0x3 & charToDna5[(size_t)*p]);
260 ++p;
261 }
262
263 // Build the rest of them with a sliding window, adding successive bases
264 // to the "right-remainder" word.
265 while (p < seq_end)
266 {
267 right <<= 2;
268 right |= (0x3 & charToDna5[(size_t)*p]);
269 ++p;
270 }
271
272 // This loop will construct successive seed, along with 32-bit words
273 // containing the left and right remainders for each seed
274 uint32_t i = 0;
275 size_t new_hits = 0;
276 do
277 {
278 // How many base pairs exist in the right remainder beyond what we have
279 // space for ?
280 int extra_right_bp = ((int)seq.length() -
281 (i + 2 * seq_key_len)) - MerExtension::MAX_EXTENSION_BP;
282
283 uint32_t hit_right = 0;
284 if (extra_right_bp > 0)
285 {
286 //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1));
287 hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong();
288 }
289 else
290 {
291 hit_right = (uint32_t)right.to_ulong();
292 }
293
294 uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong();
295
296 //size_t prev_cap = (*mer_table)[seed].capacity();
297 //(*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement));
298 //cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit);
299
300 MerExtension ext;
301
302 ext.right_dna_str = hit_right;
303 ext.right_ext_len = min(seq_len - (2 * seq_key_len) - i,
304 (unsigned int)MerExtension::MAX_EXTENSION_BP);
305
306 ext.left_dna_str = hit_left;
307 ext.left_ext_len = min(i, (unsigned int)MerExtension::MAX_EXTENSION_BP);
308 if (use_precount_table)
309 {
310 int curr_seed = --extension_counts[seed];
311 if (curr_seed < 0 || curr_seed > (int)ext_table[seed].size())
312 {
313 fprintf(stderr, "Error: curr_seed is %d, max is %lu\n", curr_seed, (long unsigned int)ext_table[seed].size());
314 }
315
316 ext_table[seed][curr_seed] = ext;
317 }
318 else
319 {
320 ext_table[seed].push_back(ext);
321 }
322 new_hits++;
323
324 // Take the leftmost base of the seed and stick it into bp
325 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
326
327 // Move that base down to the least significant bits of bp
328 bp >>= ((seq_key_len << 2) - 2);
329
330 // And tack it onto the left remainder of the read
331 left <<= 2;
332 left |= bp;
333
334 // Now take the leftmost base of the right remainder and stick it into
335 // the rightmost position of the seed
336 uint32_t right_len = seq_len - (i + seq_key_len * 2);
337 //bp = right & (0x3uLL << ((right_len - 1) << 1));
338
339 seed <<= 2;
340 //cout << right << endl;
341 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
342 //cout <<tmp_right << endl;
343 seed |= tmp_right.to_ulong();
344 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
345
346
347 //Now remove that leftmost base of the right remainder
348 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
349 if (right_len)
350 {
351 right.set(((right_len - 1) << 1), 0);
352 right.set(((right_len - 1) << 1) + 1, 0);
353 }
354 ++i;
355
356 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
357 return;
358 }
359
360 void count_read_extensions(MerExtensionCounts& ext_counts,
361 int seq_key_len,
362 int min_ext_len,
363 const string& seq)
364 {
365 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
366 // this read.
367
368 uint64_t seed = 0;
369 bitset<256> left = 0;
370 bitset<256> right = 0;
371 const char* p = seq.c_str();
372
373 unsigned int seq_len = (int)seq.length();
374 const char* seq_end = p + seq_len;
375
376 // Build the first seed
377 while (p < seq.c_str() + (2 * seq_key_len))
378 {
379 seed <<= 2;
380 seed |= (0x3 & charToDna5[(size_t)*p]);
381 ++p;
382 }
383
384 // Build the rest of them with a sliding window, adding successive bases
385 // to the "right-remainder" word.
386 while (p < seq_end)
387 {
388 right <<= 2;
389 right |= (0x3 & charToDna5[(size_t)*p]);
390 ++p;
391 }
392
393 // This loop will construct successive seed, along with 32-bit words
394 // containing the left and right remainders for each seed
395 uint32_t i = 0;
396 size_t new_hits = 0;
397 do
398 {
399 ext_counts[seed]++;
400
401 new_hits++;
402
403 // Take the leftmost base of the seed and stick it into bp
404 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
405
406 // Move that base down to the least significant bits of bp
407 bp >>= ((seq_key_len << 2) - 2);
408
409 // And tack it onto the left remainder of the read
410 left <<= 2;
411 left |= bp;
412
413 // Now take the leftmost base of the right remainder and stick it into
414 // the rightmost position of the seed
415 uint32_t right_len = seq_len - (i + seq_key_len * 2);
416 //bp = right & (0x3uLL << ((right_len - 1) << 1));
417
418 seed <<= 2;
419 //cout << right << endl;
420 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
421 //cout <<tmp_right << endl;
422 seed |= tmp_right.to_ulong();
423 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
424
425
426 //Now remove that leftmost base of the right remainder
427 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
428 if (right_len)
429 {
430 right.set(((right_len - 1) << 1), 0);
431 right.set(((right_len - 1) << 1) + 1, 0);
432 }
433 ++i;
434
435 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
436 return;
437 }
438
439 //void count_read_mers(FILE* reads_file, size_t half_splice_mer_len)
440 void count_read_mers(FZPipe& reads_file, size_t half_splice_mer_len)
441 {
442 Read read;
443 size_t splice_mer_len = 2 * half_splice_mer_len;
444 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
445 extension_counts.resize(mer_table_size);
446 FLineReader fr(reads_file);
447 //while(!feof(reads_file))
448 while (!fr.isEof())
449 {
450 read.clear();
451
452 // Get the next read from the file
453 if (!next_fasta_record(fr, read.name, read.seq, reads_format))
454 break;
455 if (reads_format == FASTQ)
456 {
457 if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
458 break;
459 }
460
461 if (color)
462 // erase the primer and the adjacent color
463 read.seq.erase(0, 2);
464
465 if (read.seq.size() > 32)
466 read.seq.resize(32);
467
468 count_read_extensions(extension_counts,
469 half_splice_mer_len,
470 half_splice_mer_len,
471 read.seq);
472 }
473
474 //rewind(reads_file);
475 reads_file.rewind();
476 }
477
478 void compact_extension_table()
479 {
480 for (size_t i = 0; i < extensions.size(); ++i)
481 {
482 vector<MerExtension>& exts = extensions[i];
483 sort(exts.begin(), exts.end());
484 vector<MerExtension>::iterator new_end = unique(exts.begin(), exts.end());
485 exts.erase(new_end, exts.end());
486 vector<MerExtension>(exts).swap(exts);
487 }
488 }
489
490 void prune_extension_table(uint8_t max_extension_bp)
491 {
492 uint32_t mask = ~(0xFFFFFFFFuLL << (max_extension_bp << 1));
493
494 for (size_t i = 0; i < extensions.size(); ++i)
495 {
496 vector<MerExtension>& exts = extensions[i];
497 for (size_t j = 0; j < exts.size(); ++j)
498 {
499 MerExtension& ex = exts[j];
500 if (ex.left_ext_len > max_extension_bp)
501 {
502 ex.left_ext_len = max_extension_bp;
503 ex.left_dna_str &= mask;
504 }
505
506 if (ex.right_ext_len > max_extension_bp)
507 {
508 ex.right_dna_str >>= ((ex.right_ext_len - max_extension_bp) << 1);
509 ex.right_ext_len = max_extension_bp;
510 }
511 }
512 }
513 }
514
515 //void store_read_mers(FILE* reads_file, size_t half_splice_mer_len)
516 void store_read_mers(FZPipe& reads_file, size_t half_splice_mer_len)
517 {
518 Read read;
519 size_t splice_mer_len = 2 * half_splice_mer_len;
520
521 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
522 extensions.resize(mer_table_size);
523
524 size_t num_indexed_reads = 0;
525 FLineReader fr(reads_file);
526 //while(!feof(reads_file))
527 while(!fr.isEof())
528 {
529 read.clear();
530
531 // Get the next read from the file
532 if (!next_fasta_record(fr, read.name, read.seq, reads_format))
533 break;
534 if (reads_format == FASTQ)
535 {
536 if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
537 break;
538 }
539
540 if (color)
541 // erase the primer and the adjacent color
542 read.seq.erase(0, 2);
543
544 if (read.seq.size() > 32)
545 read.seq.resize(32);
546
547 store_read_extensions(extensions,
548 half_splice_mer_len,
549 half_splice_mer_len,
550 read.seq,
551 true);
552
553 // Do NOT index the reverse of the reads
554
555 num_indexed_reads++;
556 if (num_indexed_reads % 1000000 == 0)
557 {
558 //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads);
559 //compact_extension_table();
560 }
561 }
562
563 //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads)
564
565 uint64_t num_extensions = 0;
566 for (size_t i = 0; i < extensions.size(); ++i)
567 {
568 num_extensions += extensions[i].size();
569 }
570 fprintf (stderr, "Total extensions: %lu\n", (long unsigned int)num_extensions);
571 //rewind(reads_file);
572 reads_file.rewind();
573 }
574
575 //void index_read_mers(vector<FILE*> reads_files,
576 void index_read_mers(vector<FZPipe>& reads_files,
577 size_t half_splice_mer_len)
578 {
579 extensions.clear();
580 for (size_t i = 0; i < reads_files.size(); ++i)
581 {
582 count_read_mers(reads_files[i], half_splice_mer_len);
583 }
584
585 extensions.resize(extension_counts.size());
586
587 for (size_t i = 0; i < extension_counts.size(); ++i)
588 {
589 extensions[i].resize(extension_counts[i]);
590 }
591
592 for (size_t i = 0; i < reads_files.size(); ++i)
593 {
594 store_read_mers(reads_files[i], half_splice_mer_len);
595 }
596
597 compact_extension_table();
598 }
599
600 /** Returns the number of characters in strings w1 and w2 that match,
601 * starting from right to left
602 */
603 int get_matching_chars(uint32_t w1, uint32_t w2)
604 {
605 //find the least significant mismatching bit between w1 and w2
606 int mismatch_bit = ffs(w1 ^ w2);
607
608 // If there is no mismatching bit, the words are equal
609 if (!mismatch_bit)
610 return -1;
611
612 // Given the mismatching bit, determine where the mismatching base is
613 mismatch_bit -= 1;
614 mismatch_bit -= ((mismatch_bit) & 1);
615 mismatch_bit >>= 1;
616
617 // Return the number of matching characters.
618 return mismatch_bit;
619 }
620
621 /**
622 * Computes the Hamming distance between two 16bp words, up to a specified
623 * maximum number of mismatches.
624 */
625 uint32_t mismatching_bases(uint32_t w1_word,
626 uint32_t w2_word,
627 int len,
628 uint32_t max_mis)
629 {
630 uint32_t diffs = 0;
631
632 int shift = 0;
633 int L = len;
634 uint32_t misses = 0;
635
636 // While we haven't yet exceeded the maximum allowable mismatches,
637 // and there are still unaligned bases, keep shift-anding
638 while (shift < len && misses <= max_mis)
639 {
640 int match_chars = 0;
641
642 // Get the number of characters matching on the right sides of
643 // both words
644 match_chars = get_matching_chars(w1_word, w2_word);
645
646 // If they are equal for this shift, we are done,
647 // the loop will stop at the next iteration
648 if (match_chars == -1 || match_chars >= len)
649 {
650 match_chars = len;
651 shift = len;
652 }
653 else
654 {
655 // If there is a mismatch in the remaining words
656 // decide how much to shift by and try again
657 match_chars = min(len, match_chars);
658 int shift_chars = (match_chars + 1);
659
660 L -= shift_chars;
661
662 int shift_bits = shift_chars << 1;
663
664 // Shift right past the matching part and the first mismatching base
665 w1_word >>= (shift_bits);
666 w2_word >>= (shift_bits);
667
668 shift += shift_chars;
669 diffs++;
670 misses++;
671 }
672 }
673
674 return diffs;
675 }
676
677 uint64_t rc_dna_str(uint64_t dna_str)
678 {
679 dna_str = ~dna_str;
680 uint64_t rc = 0;
681 for (int i = 0; i < 32; i++)
682 {
683 rc <<= 2;
684 rc |= (dna_str & 0x3);
685 dna_str >>= 2;
686 }
687 return rc;
688 }
689
690 uint64_t rc_color_str(uint64_t color_str)
691 {
692 uint64_t rc = 0;
693 for (int i = 0; i < 32; ++i)
694 {
695 rc <<= 2;
696 rc |= (color_str & 0x3);
697 color_str >>= 2;
698 }
699 return rc;
700 }
701
702 struct DnaSpliceStrings
703 {
704 DnaSpliceStrings(uint64_t f, uint64_t r) : fwd_string(f), rev_string(r), first_in_string('N'), last_in_string('N') {}
705 uint64_t fwd_string;
706 uint64_t rev_string;
707
708 // for color-space purposes
709 char first_in_string;
710 char last_in_string;
711
712 bool operator<(const DnaSpliceStrings& rhs) const
713 {
714 if (fwd_string != rhs.fwd_string)
715 return fwd_string < rhs.fwd_string;
716 if (rev_string != rhs.rev_string)
717 return rev_string < rhs.rev_string;
718 return false;
719 }
720
721 bool operator==(const DnaSpliceStrings& rhs) const
722 {
723 return fwd_string == rhs.fwd_string && rev_string == rhs.rev_string;
724 }
725 };
726
727 struct IntronMotifs
728 {
729 IntronMotifs(uint32_t rid) : ref_id(rid) {}
730 uint32_t ref_id;
731
732 vector<pair<size_t, DnaSpliceStrings> > fwd_donors;
733 vector<pair<size_t, DnaSpliceStrings> > fwd_acceptors;
734 vector<pair<size_t, DnaSpliceStrings> > rev_donors;
735 vector<pair<size_t, DnaSpliceStrings> > rev_acceptors;
736
737 void unique(vector<pair<size_t, DnaSpliceStrings> >& f)
738 {
739 sort(f.begin(), f.end());
740 vector<pair<size_t, DnaSpliceStrings> >::iterator i = std::unique(f.begin(), f.end());
741 f.erase(i, f.end());
742 }
743
744 void unique()
745 {
746 unique(fwd_donors);
747 unique(fwd_acceptors);
748 unique(rev_donors);
749 unique(rev_acceptors);
750 }
751
752 void attach_mers(RefSequenceTable::Sequence& ref_str)
753 {
754 attach_upstream_mers(ref_str, fwd_donors);
755 attach_upstream_mers(ref_str, rev_acceptors);
756
757 attach_downstream_mers(ref_str, rev_donors);
758 attach_downstream_mers(ref_str, fwd_acceptors);
759 }
760
761 void attach_upstream_mers(RefSequenceTable::Sequence& ref_str,
762 vector<pair<size_t, DnaSpliceStrings> >& dinucs)
763 {
764 for (size_t i = 0; i < dinucs.size(); ++i)
765 {
766 size_t pos = dinucs[i].first;
767 int half_splice_mer_len = 32;
768
769 if (color)
770 {
771 if (pos <= (size_t)half_splice_mer_len+1 || pos >= length(ref_str))
772 continue;
773
774 Dna5String seg_str = seqan::infixWithLength(ref_str,
775 pos - half_splice_mer_len - 1,
776 half_splice_mer_len + 1);
777 stringstream ss(stringstream::in | stringstream::out);
778 string s;
779 ss << seg_str;
780 ss >> s;
781
782 string col_seg_str = convert_bp_to_color(s, true);
783 uint64_t idx = colorstr_to_idx(col_seg_str);
784
785 dinucs[i].second.fwd_string = idx;
786 dinucs[i].second.rev_string = rc_color_str(idx);
787 dinucs[i].second.first_in_string = s[1];
788 dinucs[i].second.last_in_string = s[half_splice_mer_len];
789 }
790 else
791 {
792 if (pos <= (size_t)half_splice_mer_len || pos >= length(ref_str))
793 continue;
794
795 Dna5String seg_str = seqan::infixWithLength(ref_str,
796 pos - half_splice_mer_len,
797 half_splice_mer_len);
798
799 stringstream ss(stringstream::in | stringstream::out);
800 string s;
801 ss << seg_str;
802 ss >> s;
803 uint64_t idx = dna5str_to_idx(s);
804 dinucs[i].second.fwd_string = idx;
805 dinucs[i].second.rev_string = rc_dna_str(idx);
806 }
807 }
808 }
809
810
811 void attach_downstream_mers(RefSequenceTable::Sequence& ref_str,
812 vector<pair<size_t, DnaSpliceStrings> >& dinucs)
813 {
814 for (size_t i = 0; i < dinucs.size(); ++i)
815 {
816 size_t pos = dinucs[i].first;
817
818 int half_splice_mer_len = 32;
819
820 if (pos + 2 + half_splice_mer_len >= length(ref_str))
821 continue;
822
823 if (color)
824 {
825 Dna5String seg_str = seqan::infixWithLength(ref_str,
826 pos + 2 - 1,
827 half_splice_mer_len + 1);
828 stringstream ss(stringstream::in | stringstream::out);
829 string s;
830 ss << seg_str;
831 ss >> s;
832
833 string col_seg_str = convert_bp_to_color(s, true);
834 uint64_t idx = colorstr_to_idx(col_seg_str);
835
836 dinucs[i].second.fwd_string = idx;
837 dinucs[i].second.rev_string = rc_color_str(idx);
838 dinucs[i].second.first_in_string = s[1];
839 dinucs[i].second.last_in_string = s[half_splice_mer_len];
840 }
841 else
842 {
843 Dna5String seg_str = seqan::infixWithLength(ref_str,
844 pos + 2,
845 half_splice_mer_len);
846
847 stringstream ss(stringstream::in | stringstream::out);
848 string s;
849 ss << seg_str;
850 ss >> s;
851 uint64_t idx = dna5str_to_idx(s);
852
853 dinucs[i].second.fwd_string = idx;
854 dinucs[i].second.rev_string = rc_dna_str(idx);
855 }
856 }
857 }
858 };
859
860 struct PackedSplice
861 {
862 PackedSplice() : left(0u), seed(0u), right(0u) {}
863 PackedSplice(uint32_t l, uint64_t s, uint32_t r) : left(l), seed(s), right(r) {}
864 uint32_t left;
865 uint64_t seed;
866 uint32_t right;
867 };
868
869 // The second element of these pairs is the left (or right) side of a splice
870 // seed from a possible junction. The first element is the sequence flanking
871 // that seed
872 typedef pair<uint32_t, uint32_t> PackedSpliceHalf;
873
874 static inline std::string u32ToDna(uint32_t a, int len) {
875 char buf[17];
876 assert(len <= 16);
877 for(int i = 0; i < len; i++) {
878 buf[len-i-1] = "ACGT"[a & 3];
879 a >>= 2;
880 }
881 buf[len] = '\0';
882 return std::string(buf);
883 }
884
885
886 PackedSpliceHalf pack_left_splice_half(const string& seq,
887 uint32_t pos_in_l,
888 unsigned int seq_key_len)
889 {
890 const char* l = seq.c_str();
891 l += pos_in_l;
892
893 const char* left_end = l;
894 l -= 16;
895
896 assert (l + seq_key_len < seq.c_str() + seq.length());
897
898 PackedSpliceHalf packed_half = make_pair(0u,0u);
899
900 // Pack up to 32 bits (16 bases) of sequence into left
901 if (l < seq.c_str())
902 l = seq.c_str();
903 while (l < left_end)
904 {
905 packed_half.first <<= 2;
906 packed_half.first |= (0x3 & charToDna5[(size_t)*l]);
907 ++l;
908 }
909
910 // Pack up the seed bits
911 for (unsigned int i = 0; i < seq_key_len; ++i)
912 {
913 packed_half.second <<= 2;
914 packed_half.second |= (0x3 & charToDna5[(size_t)*(l + i)]);
915 }
916 return packed_half;
917 }
918
919
920 PackedSpliceHalf pack_right_splice_half(const string& seq,
921 uint32_t pos,
922 unsigned int seq_key_len)
923 {
924 const char* r = seq.c_str();
925 r += pos - seq_key_len;
926
927 PackedSpliceHalf packed_half = make_pair(0u,0u);
928
929 // Pack the seed bits
930 for (unsigned int i = 0; i < seq_key_len; ++i)
931 {
932 packed_half.second <<= 2;
933 packed_half.second |= (0x3 & charToDna5[(size_t)*(r + i)]);
934 }
935
936 r += seq_key_len;
937 // Now pack 32 bits (16 bases) of sequence into left
938 const char* right_end = r + 16;
939
940 if ((size_t)(right_end - seq.c_str()) > seq.length())
941 right_end = seq.c_str() + seq.length();
942 while (r < right_end)
943 {
944 packed_half.first <<= 2;
945 packed_half.first |= (0x3 & charToDna5[(size_t)*r]);
946 ++r;
947 }
948
949 return packed_half;
950 }
951
952 PackedSplice combine_splice_halves(const PackedSpliceHalf& left_half,
953 const PackedSpliceHalf& right_half,
954 int seq_key_len)
955 {
956 uint64_t seed = left_half.second << (seq_key_len << 1) | right_half.second;
957 return PackedSplice(left_half.first,seed, right_half.first);
958 }
959
960 PackedSplice pack_splice(const string& seq,
961 int l_pos_in_seq,
962 int r_pos_in_seq,
963 unsigned int seq_key_len)
964 {
965 const char* l = seq.c_str(); // l points to beginning of left exon sequence
966 l += l_pos_in_seq;
967
968 assert (l + seq_key_len < seq.c_str() + seq.length());
969
970 const char* r = seq.c_str(); // r points to beginning of right exon sequence
971 r += r_pos_in_seq - seq_key_len;
972 //r += 2; // r points to right side of junction;
973
974 uint64_t seed = 0;
975 uint64_t left = 0;
976 uint64_t right = 0;
977
978 // Pack up the seed bits
979 for (unsigned int i = 0; i < seq_key_len; ++i)
980 {
981 seed <<= 2;
982 seed |= (0x3 & charToDna5[(size_t)*(l + i)]);
983 }
984
985 for (unsigned int i = 0; i < seq_key_len; ++i)
986 {
987 seed <<= 2;
988 seed |= (0x3 & charToDna5[(size_t)*(r + i)]);
989 }
990
991 // Now pack 32 bits (16 bases) of sequence into left
992 const char* left_end = l;
993 l -= 16;
994 if (l < seq.c_str())
995 l = seq.c_str();
996 while (l < left_end)
997 {
998 left <<= 2;
999 left |= (0x3 & charToDna5[(size_t)*l]);
1000 ++l;
1001 }
1002
1003 r += seq_key_len;
1004 // Now pack 32 bits (16 bases) of sequence into left
1005 const char* right_end = r + 16;
1006
1007 if ((size_t)(right_end - seq.c_str()) > seq.length())
1008 right_end = seq.c_str() + seq.length();
1009 while (r < right_end)
1010 {
1011 right <<= 2;
1012 right |= (0x3 & charToDna5[(size_t)*r]);
1013 ++r;
1014 }
1015
1016 return PackedSplice((uint32_t)left,seed,(uint32_t)right);
1017 }
1018
1019
1020
1021 /* Represents a hit between a splice seed and a read. */
1022 // TODO: consider packing pos and meta into a single 32-bit int.
1023 struct ReadHit
1024 {
1025 ReadHit(uint32_t l, uint32_t r, uint32_t p, uint32_t m, bool rc)
1026 : left(l), right(r), pos(p), meta(m), reverse_complement(rc) {}
1027 uint32_t left; // 2-bits per base rep of the left remainder
1028 uint32_t right; //2-bits per base rep of the right remainder
1029 uint32_t pos; // position of the seed within the read
1030 uint32_t meta : 31;
1031 bool reverse_complement : 1;
1032 } __attribute__((packed));
1033
1034 // A MerTable maps k-mers to hits in indexed reads. See the comment for
1035 // mer_table
1036 typedef vector<ReadHit> ReadHitList;
1037 typedef vector<ReadHitList> MerTable;
1038
1039 size_t index_read(MerTable* mer_table,
1040 int seq_key_len,
1041 const string& seq,
1042 unsigned int read_num,
1043 bool reverse_complement,
1044 vector<uint64_t>& seeds)
1045 {
1046 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
1047 // this read.
1048
1049 uint64_t seed = 0;
1050 bitset<256> left = 0;
1051 bitset<256> right = 0;
1052 const char* p = seq.c_str();
1053
1054 unsigned int seq_len = (int)seq.length();
1055 const char* seq_end = p + seq_len;
1056
1057 // Build the first seed
1058 while (p < seq.c_str() + (2 * seq_key_len))
1059 {
1060 seed <<= 2;
1061 seed |= (0x3 & charToDna5[(size_t)*p]);
1062 ++p;
1063 }
1064
1065 seeds.push_back(seed);
1066
1067 // Build the rest of them with a sliding window, adding successive bases
1068 // to the "right-remainder" word.
1069 while (p < seq_end)
1070 {
1071
1072 right <<= 2;
1073 right |= (0x3 & charToDna5[(size_t)*p]);
1074 ++p;
1075 }
1076
1077 size_t cap_increase = 0;
1078
1079 // At this point, seed contains the 5'-most 2*min_anchor_len bases of the
1080 // read, and right contains everthing else on the 3' end.
1081
1082 // This loop will construct successive seed, along with 32-bit words
1083 // containing the left and right remainders for each seed
1084 uint32_t i = 0;
1085 size_t new_hits = 0;
1086 do
1087 {
1088 // Let's not make an out-of-bounds write, if this fails the global
1089 // mer_table is too small
1090 assert (!mer_table || seed < mer_table->size());
1091
1092 // How many base pairs exist in the right remainder beyond what we have
1093 // space for ?
1094 int extra_right_bp = ((int)seq.length() - (i + 2 * seq_key_len)) - 16;
1095
1096 uint32_t hit_right;
1097 if (extra_right_bp > 0)
1098 {
1099 //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1));
1100 hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong();
1101 }
1102 else
1103 {
1104 hit_right = (uint32_t)right.to_ulong();
1105 }
1106
1107 uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong();
1108
1109 if (mer_table)
1110 {
1111 size_t prev_cap = (*mer_table)[seed].capacity();
1112 (*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement));
1113 cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit);
1114 }
1115 new_hits++;
1116
1117 // Take the leftmost base of the seed and stick it into bp
1118 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
1119
1120 // Move that base down to the least significant bits of bp
1121 bp >>= ((seq_key_len << 2) - 2);
1122
1123 // And tack it onto the left remainder of the read
1124 left <<= 2;
1125 left |= bp;
1126
1127 // Now take the leftmost base of the right remainder and stick it into
1128 // the rightmost position of the seed
1129 uint32_t right_len = seq_len - (i + seq_key_len * 2);
1130 //bp = right & (0x3uLL << ((right_len - 1) << 1));
1131
1132 seed <<= 2;
1133 //cout << right << endl;
1134 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
1135 //cout <<tmp_right << endl;
1136 seed |= tmp_right.to_ulong();
1137 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
1138
1139 seeds.push_back(seed);
1140
1141 //Now remove that leftmost base of the right remainder
1142 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
1143 if (right_len)
1144 {
1145 right.set(((right_len - 1) << 1), 0);
1146 right.set(((right_len - 1) << 1) + 1, 0);
1147 }
1148 ++i;
1149
1150 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
1151 return cap_increase;
1152 }
1153
1154
1155
1156 struct SeedExtension
1157 {
1158 SeedExtension(int lp,
1159 int rp,
1160 int rd_p,
1161 int le,
1162 int re,
1163 int mm) :
1164 l_pos_in_ref(lp),
1165 r_pos_in_ref(rp),
1166 read_pos(rd_p),
1167 left_extent(le),
1168 right_extent(re),
1169 mismatches(mm){}
1170
1171 int l_pos_in_ref;
1172 int r_pos_in_ref;
1173 int read_pos;
1174 int left_extent;
1175 int right_extent;
1176 int mismatches;
1177 };
1178
1179 pair<string::size_type, int> left_extend(const string& ref,
1180 const string& read,
1181 int ref_pos,
1182 int read_pos,
1183 int num_mismatches)
1184 {
1185 string::size_type ext = 0;
1186 int mm_encountered = 0;
1187 while(ref_pos >= 0 &&
1188 read_pos >= 0)
1189 {
1190 //char ref_char = ref[ref_pos];
1191 //char read_char = read[read_pos];
1192 if (ref[ref_pos] != read[read_pos])
1193 {
1194 if (mm_encountered + 1 > num_mismatches)
1195 return make_pair(ext, mm_encountered);
1196 mm_encountered++;
1197 }
1198 ext++;
1199
1200 --ref_pos;
1201 --read_pos;
1202 }
1203
1204 return make_pair(ext, mm_encountered);
1205 }
1206
1207 pair<string::size_type, int> right_extend(const string& ref,
1208 const string& read,
1209 int ref_pos,
1210 int read_pos,
1211 int num_mismatches)
1212 {
1213 string::size_type ext = 0;
1214 int mm_encountered = 0;
1215 while(ref_pos < (int)ref.size() &&
1216 read_pos < (int)read.size())
1217 {
1218 if (ref[ref_pos] != read[read_pos])
1219 {
1220 if (mm_encountered + 1 > num_mismatches)
1221 return make_pair(ext, mm_encountered);
1222 mm_encountered++;
1223 }
1224
1225 ext++;
1226
1227 ++ref_pos;
1228 ++read_pos;
1229 }
1230
1231 return make_pair(ext, mm_encountered);
1232 }
1233
1234
1235 void extend_from_seeds(vector<SeedExtension>& extensions,
1236 const PackedSplice& p,
1237 const MerTable& mer_table,
1238 const string& ref,
1239 const string& read,
1240 size_t l_pos_in_ref,
1241 size_t r_pos_in_ref,
1242 int seq_key_len)
1243 {
1244 assert(p.seed < mer_table.size());
1245 const ReadHitList& hl = mer_table[p.seed];
1246
1247 for (size_t hit = 0; hit < hl.size(); ++hit)
1248 {
1249 const ReadHit& rh = hl[hit];
1250 uint32_t pos = rh.pos;
1251
1252 pair<string::size_type, int> left_extension;
1253 pair<string::size_type, int> right_extension;
1254 left_extension = left_extend(ref, read, l_pos_in_ref - seq_key_len + 1, pos, 2);
1255 right_extension = right_extend(ref, read, r_pos_in_ref + seq_key_len, pos + 2 * seq_key_len, 2);
1256 extensions.push_back(SeedExtension(l_pos_in_ref,
1257 r_pos_in_ref,
1258 pos + seq_key_len,
1259 left_extension.first,
1260 right_extension.first,
1261 left_extension.second + right_extension.second));
1262
1263 }
1264 }
1265
1266 typedef pair<size_t, PackedSpliceHalf> SpliceHalf;
1267
1268 void get_seed_extensions(const string& ref,
1269 const string& read,
1270 int seq_key_len,
1271 MerTable& mer_table,
1272 vector<SpliceHalf>& donors,
1273 vector<SpliceHalf>& acceptors,
1274 vector<SeedExtension>& extensions)
1275 {
1276 for (size_t d = 0; d < donors.size(); ++d)
1277 {
1278 bool broke_out = false;
1279
1280 // start pos is a lower bound on downstream acceptor positions
1281 // to consider
1282 size_t start_pos = donors[d].first + min_report_intron_length;
1283
1284 SpliceHalf dummy = make_pair(start_pos,PackedSpliceHalf());
1285 vector<SpliceHalf>::iterator lb = upper_bound(acceptors.begin(),
1286 acceptors.end(),
1287 dummy);
1288
1289 if (lb == acceptors.end())
1290 break;
1291 for (size_t a = lb - acceptors.begin();
1292 a < acceptors.size();
1293 ++a)
1294 {
1295 if (acceptors[a].first - donors[d].first > (size_t)max_microexon_stretch)
1296 {
1297 broke_out = true;
1298 break;
1299 }
1300
1301 size_t l_pos_in_ref = donors[d].first - 1;
1302 size_t r_pos_in_ref = acceptors[a].first + 2;
1303 PackedSplice p = combine_splice_halves(donors[d].second,
1304 acceptors[a].second,
1305 seq_key_len);
1306 extend_from_seeds(extensions,
1307 p,
1308 mer_table,
1309 ref,
1310 read,
1311 l_pos_in_ref,
1312 r_pos_in_ref,
1313 seq_key_len);
1314 }
1315 if (broke_out)
1316 continue;
1317 }
1318
1319 }
1320
1321 void hits_from_seed_extension(uint32_t ref_id,
1322 int ref_offset,
1323 uint64_t insert_id,
1324 bool antisense,
1325 vector<SeedExtension>& extensions,
1326 vector<BowtieHit>& hits_out,
1327 int left_read_edge,
1328 int right_read_edge,
1329 int seq_key_len)
1330 {
1331 for (size_t i = 0; i < extensions.size(); ++i)
1332 {
1333 SeedExtension& s = extensions[i];
1334 if (s.read_pos >= right_read_edge ||
1335 s.read_pos < left_read_edge)
1336 continue;
1337 if (s.read_pos - seq_key_len - s.left_extent <= left_read_edge &&
1338 s.read_pos + seq_key_len + s.right_extent >= right_read_edge && s.mismatches <= 2 )
1339 {
1340 vector<CigarOp> cigar;
1341 int off_adjust;
1342 if (antisense)
1343 {
1344 CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge);
1345 CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref);
1346 CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos);
1347 cigar.push_back(m1);
1348 cigar.push_back(skip);
1349 cigar.push_back(m2);
1350 off_adjust = m1.length;
1351 }
1352 else
1353 {
1354 CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge + 1);
1355 CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref);
1356 CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos - 1);
1357 cigar.push_back(m1);
1358 cigar.push_back(skip);
1359 cigar.push_back(m2);
1360 off_adjust = m1.length;
1361 }
1362 // daehwan - check this
1363 bool end = false;
1364 BowtieHit bh(ref_id,
1365 insert_id,
1366 ref_offset + s.l_pos_in_ref - off_adjust + 1,
1367 cigar,
1368 antisense,
1369 false,
1370 s.mismatches,
1371 0,
1372 end);
1373 hits_out.push_back(bh);
1374 }
1375 }
1376 }
1377
1378 void align(uint32_t ref_id,
1379 uint64_t insert_id,
1380 bool antisense,
1381 const string& ref,
1382 const string& read,
1383 int ref_offset,
1384 int left_read_edge,
1385 int right_read_edge,
1386 MerTable& mer_table,
1387 int seq_key_len,
1388 vector<BowtieHit>& hits_out)
1389 {
1390 // Reserve an entry for each k-mer we might see
1391 size_t mer_table_size = 1 << ((seq_key_len << 1)<<1);
1392
1393 mer_table.resize(mer_table_size);
1394
1395 vector<uint64_t> seeds;
1396 index_read(&mer_table, seq_key_len, read, 0, antisense, seeds);
1397
1398 vector<SpliceHalf> forward_donors;
1399 vector<SpliceHalf> forward_acceptors;
1400 vector<SpliceHalf> reverse_donors;
1401 vector<SpliceHalf> reverse_acceptors;
1402
1403 const string& seq = ref;
1404
1405 unsigned int pos = 0;
1406
1407
1408 for (size_t z = seq_key_len + 1; z < seq.length() - seq_key_len - 2; ++z)
1409 {
1410 char l = seq[z - 1];
1411 char r = seq[z];
1412 if (l == 'G' && r == 'T')
1413 {
1414 size_t donor_pos = pos + z - 1;
1415 size_t s = donor_pos - seq_key_len;
1416 PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len);
1417 forward_donors.push_back(make_pair(donor_pos,p));
1418 }
1419 if (l == 'A' && r == 'G')
1420 {
1421 size_t acceptor_pos = pos + z - 1;
1422 size_t s = acceptor_pos + 2 + seq_key_len;
1423 PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len);
1424 forward_acceptors.push_back(make_pair(acceptor_pos,p));
1425 }
1426 if (l == 'C' && r == 'T')
1427 {
1428 size_t acceptor_pos = pos + z - 1;
1429 size_t s = acceptor_pos - seq_key_len;
1430 PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len);
1431 reverse_acceptors.push_back(make_pair(pos + z - 1,p));
1432 }
1433 if (l == 'A' && r == 'C')
1434 {
1435 size_t donor_pos = pos + z - 1;
1436 size_t s = donor_pos + 2 + seq_key_len;
1437 PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len);
1438 reverse_donors.push_back(make_pair(donor_pos,p));
1439 }
1440 }
1441
1442 vector<SeedExtension> fwd_extensions;
1443 get_seed_extensions(seq,
1444 read,
1445 seq_key_len,
1446 mer_table,
1447 forward_donors,
1448 forward_acceptors,
1449 fwd_extensions);
1450
1451 hits_from_seed_extension(ref_id,
1452 ref_offset,
1453 insert_id,
1454 antisense,
1455 fwd_extensions,
1456 hits_out,
1457 left_read_edge,
1458 right_read_edge,
1459 seq_key_len);
1460
1461 //fprintf(stderr, "Found %d seed hits\n", fwd_extensions.size());
1462
1463 vector<SeedExtension> rev_extensions;
1464 get_seed_extensions(seq,
1465 read,
1466 seq_key_len,
1467 mer_table,
1468 reverse_donors,
1469 reverse_acceptors,
1470 rev_extensions);
1471
1472 hits_from_seed_extension(ref_id,
1473 ref_offset,
1474 insert_id,
1475 antisense,
1476 rev_extensions,
1477 hits_out,
1478 left_read_edge,
1479 right_read_edge,
1480 seq_key_len);
1481
1482 for (size_t i = 0; i < seeds.size(); ++i)
1483 mer_table[seeds[i]].clear();
1484 }
1485
1486 int extension_mismatches = 0;
1487
1488
1489 bool left_extendable_junction(uint64_t upstream_dna_str,
1490 size_t key,
1491 size_t splice_mer_len,
1492 size_t min_ext_len)
1493 {
1494 vector<MerExtension>& exts = extensions[key];
1495 for (size_t i = 0; i < exts.size(); ++i)
1496 {
1497 const MerExtension& ext = exts[i];
1498 if (ext.left_ext_len < min_ext_len)
1499 continue;
1500 uint64_t upstream = upstream_dna_str & ~(0xFFFFFFFFFFFFFFFFull << (ext.left_ext_len << 1));
1501 int mism = mismatching_bases(ext.left_dna_str, upstream, ext.left_ext_len, extension_mismatches);
1502 if (mism <= extension_mismatches)
1503 return true;
1504 }
1505 return false;
1506 }
1507
1508 bool right_extendable_junction(uint64_t downstream_dna_str,
1509 size_t key,
1510 size_t splice_mer_len,
1511 size_t min_ext_len)
1512 {
1513 vector<MerExtension>& exts = extensions[key];
1514 for (size_t i = 0; i < exts.size(); ++i)
1515 {
1516 const MerExtension& ext = exts[i];
1517 if (ext.right_ext_len < min_ext_len)
1518 continue;
1519 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull >> (ext.right_ext_len << 1));
1520 uint64_t downstream = downstream_dna_str & mask;
1521 downstream >>= ((32 - ext.right_ext_len) << 1);
1522 int mism = mismatching_bases(ext.right_dna_str, downstream, ext.right_ext_len, extension_mismatches);
1523
1524 if (mism <= extension_mismatches)
1525 return true;
1526 }
1527 return false;
1528 }
1529
1530 uint32_t junction_key(uint64_t upstream_dna_str,
1531 uint64_t downstream_dna_str,
1532 size_t splice_mer_len)
1533 {
1534 uint64_t upstream_mask = ~(0xFFFFFFFFFFFFFFFFull << (splice_mer_len));
1535 uint64_t upstream_key_half = upstream_dna_str & upstream_mask;
1536 uint64_t downstream_mask = ~(0xFFFFFFFFFFFFFFFFull >> (splice_mer_len));
1537 uint64_t downstream_key_half = (downstream_dna_str & downstream_mask) >> (64 - splice_mer_len);
1538 uint32_t key = ((uint32_t)upstream_key_half << splice_mer_len) | (uint32_t)downstream_key_half;
1539 return key;
1540 }
1541
1542 bool extendable_junction(uint64_t upstream_dna_str,
1543 uint64_t downstream_dna_str,
1544 size_t splice_mer_len,
1545 size_t min_ext_len,
1546 bool reverse,
1547 char last_in_upstream = 'N',
1548 char first_in_downstream = 'N')
1549 {
1550 if (color)
1551 {
1552 string two_bp; two_bp.push_back(last_in_upstream); two_bp.push_back(first_in_downstream);
1553 string color = convert_bp_to_color(two_bp, true);
1554 char num = (color[0] - '0') & 0x3;
1555
1556 if (reverse)
1557 {
1558 upstream_dna_str = (upstream_dna_str >> 2) << 2;
1559 upstream_dna_str |= (uint64_t)num;
1560 }
1561 else
1562 {
1563 downstream_dna_str = (downstream_dna_str << 2) >> 2;
1564 downstream_dna_str |= ((uint64_t)num << 62);
1565 }
1566 }
1567
1568 uint32_t key = junction_key(upstream_dna_str,
1569 downstream_dna_str,
1570 splice_mer_len);
1571
1572 upstream_dna_str >>= splice_mer_len;
1573 downstream_dna_str <<= splice_mer_len;
1574
1575 bool extendable = (left_extendable_junction(upstream_dna_str,
1576 key, splice_mer_len, min_ext_len) ||
1577 right_extendable_junction(downstream_dna_str,
1578 key, splice_mer_len, min_ext_len));
1579 return extendable;
1580 }
1581
1582 typedef std::set<Junction, skip_count_lt> PotentialJuncs;
1583
1584 struct RecordExtendableJuncs
1585 {
1586 void record(uint32_t ref_id,
1587 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1588 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1589 bool antisense,
1590 PotentialJuncs& juncs,
1591 int min_intron,
1592 int max_intron,
1593 size_t max_juncs,
1594 size_t half_splice_mer_len)
1595 {
1596
1597 size_t splice_mer_len = 2 * half_splice_mer_len;
1598
1599 size_t curr_R = 0;
1600 for (size_t L = 0; L < left_sites.size(); ++L)
1601 {
1602 while (curr_R < right_sites.size() &&
1603 right_sites[curr_R].first < left_sites[L].first + min_intron)
1604 {
1605 curr_R++;
1606 }
1607
1608 size_t left_pos = left_sites[L].first;
1609 size_t max_right_pos = left_pos + max_intron;
1610 for (size_t R = curr_R;
1611 R < right_sites.size() && right_sites[R].first < max_right_pos; ++R)
1612 {
1613 uint64_t upstream_dna_str = left_sites[L].second.fwd_string;
1614 char last_in_upstream = left_sites[L].second.last_in_string;
1615 uint64_t downstream_dna_str = right_sites[R].second.fwd_string;
1616 char first_in_downstream = right_sites[R].second.first_in_string;
1617 uint64_t rc_upstream_dna_str = left_sites[L].second.rev_string;
1618 uint64_t rc_downstream_dna_str = right_sites[R].second.rev_string;
1619
1620 if (extendable_junction(upstream_dna_str,
1621 downstream_dna_str, splice_mer_len, 7, false,
1622 last_in_upstream, first_in_downstream) ||
1623 extendable_junction(rc_downstream_dna_str,
1624 rc_upstream_dna_str, splice_mer_len, 7, true,
1625 last_in_upstream, first_in_downstream))
1626 {
1627 juncs.insert(Junction(ref_id,
1628 left_sites[L].first - 1,
1629 right_sites[R].first + 2,
1630 antisense,
1631 R - curr_R));
1632 }
1633 if (juncs.size() > max_juncs)
1634 juncs.erase(*(juncs.rbegin()));
1635 }
1636 }
1637 }
1638 };
1639
1640 struct RecordAllJuncs
1641 {
1642 void record(uint32_t ref_id,
1643 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1644 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1645 bool antisense,
1646 PotentialJuncs& juncs,
1647 int min_intron,
1648 int max_intron,
1649 size_t max_juncs,
1650 size_t half_splice_mer_len)
1651 {
1652 size_t curr_R = 0;
1653 for (size_t L = 0; L < left_sites.size(); ++L)
1654 {
1655 while (curr_R < right_sites.size() &&
1656 right_sites[curr_R].first < left_sites[L].first + min_intron)
1657 {
1658 curr_R++;
1659 }
1660
1661 size_t left_pos = left_sites[L].first;
1662 size_t max_right_pos = left_pos + max_intron;
1663 for (size_t R = curr_R;
1664 R < right_sites.size() && right_sites[R].first < max_right_pos; ++R)
1665 {
1666 Junction j(ref_id,
1667 left_sites[L].first - 1,
1668 right_sites[R].first + 2,
1669 antisense,
1670 R - curr_R);
1671
1672 juncs.insert(j);
1673
1674 if (juncs.size() > max_juncs)
1675 juncs.erase(*(juncs.rbegin()));
1676 }
1677 }
1678 }
1679 };
1680
1681 struct RecordSegmentJuncs
1682 {
1683 void record(uint32_t ref_id,
1684 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1685 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1686 bool antisense,
1687 PotentialJuncs& juncs,
1688 int min_intron,
1689 int max_intron,
1690 size_t max_juncs,
1691 size_t half_splice_mer_len)
1692 {
1693 if (left_sites.size() != right_sites.size())
1694 return;
1695
1696 for (size_t i = 0; i < left_sites.size(); ++i)
1697 {
1698 Junction j(ref_id,
1699 left_sites[i].first - 1,
1700 right_sites[i].first + 2,
1701 antisense);
1702
1703 juncs.insert(j);
1704
1705 if (juncs.size() > max_juncs)
1706 juncs.erase(*(juncs.rbegin()));
1707 }
1708 }
1709 };
1710
1711 struct ButterflyKey
1712 {
1713 uint32_t pos;
1714 uint32_t key;
1715
1716 ButterflyKey(uint32_t p, uint32_t k) : pos(p), key(k) {}
1717
1718 bool operator<(const ButterflyKey& rhs) const
1719 {
1720 if (key != rhs.key)
1721 return key < rhs.key;
1722 if (pos != rhs.pos)
1723 return pos < rhs.pos;
1724 return false;
1725 }
1726
1727 bool operator==(const ButterflyKey& rhs) const
1728 {
1729 return pos == rhs.pos && key == rhs.key;
1730 }
1731 };
1732
1733 uint32_t get_left_butterfly_key(uint64_t upstream_key,
1734 const MerExtension& ext,
1735 size_t half_splice_mer_len)
1736 {
1737 uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - half_splice_mer_len) << 1);
1738 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1));
1739 uint64_t top_half = upstream_key & mask;
1740 key |= (top_half << (half_splice_mer_len << 1));
1741 return (uint32_t)key;
1742 }
1743
1744 uint32_t get_right_butterfly_key(uint64_t downstream_key,
1745 const MerExtension& ext,
1746 size_t half_splice_mer_len)
1747 {
1748 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1));
1749 uint64_t key = (ext.left_dna_str & mask) << (half_splice_mer_len << 1);
1750 uint64_t bottom_half = (downstream_key >> (half_splice_mer_len << 1));
1751 key |= bottom_half;
1752 return (uint32_t)key;
1753 }
1754
1755 struct RecordButterflyJuncs
1756 {
1757 void record(uint32_t ref_id,
1758 const vector<pair<size_t, DnaSpliceStrings> >& all_left_sites,
1759 const vector<pair<size_t, DnaSpliceStrings> >& all_right_sites,
1760 bool antisense,
1761 PotentialJuncs& juncs,
1762 int min_intron,
1763 int max_intron,
1764 size_t max_juncs,
1765 size_t half_splice_mer_len)
1766 {
1767 size_t key_length = 2 * half_splice_mer_len;
1768 size_t extension_length = butterfly_overhang;
1769 uint64_t bottom_bit_mask = ~(0xFFFFFFFFFFFFFFFFull << (key_length<<1));
1770 uint64_t top_bit_mask = ~(0xFFFFFFFFFFFFFFFFull >> (key_length<<1));
1771
1772 if (all_left_sites.empty() || all_right_sites.empty())
1773 return;
1774
1775 size_t last_site = max(all_left_sites.back().first,
1776 all_right_sites.back().first);
1777
1778 size_t curr_left_site = 0;
1779 size_t curr_right_site = 0;
1780 for (size_t window_left_edge = 0;
1781 window_left_edge < last_site;
1782 window_left_edge += max_intron)
1783 {
1784 //fprintf(stderr, "\twindow %lu - %lu\n", window_left_edge, window_left_edge + 2 * max_intron);
1785 vector<pair<size_t, DnaSpliceStrings> > left_sites;
1786 vector<pair<size_t, DnaSpliceStrings> > right_sites;
1787
1788 while(curr_left_site < all_left_sites.size() &&
1789 all_left_sites[curr_left_site].first < window_left_edge)
1790 {
1791 curr_left_site++;
1792 }
1793
1794 while(curr_right_site < all_right_sites.size() &&
1795 all_right_sites[curr_right_site].first < window_left_edge)
1796 {
1797 curr_right_site++;
1798 }
1799
1800 for (size_t ls = curr_left_site; ls < all_left_sites.size(); ++ls)
1801 {
1802 if (all_left_sites[ls].first < window_left_edge + 2 * max_intron)
1803 {
1804 left_sites.push_back(all_left_sites[ls]);
1805 }
1806 }
1807
1808 for (size_t rs = curr_right_site; rs < all_right_sites.size(); ++rs)
1809 {
1810 if (all_right_sites[rs].first < window_left_edge + 2 * max_intron)
1811 {
1812 right_sites.push_back(all_right_sites[rs]);
1813 }
1814 }
1815
1816 vector<ButterflyKey> left_keys;
1817 for (size_t L = 0; L < left_sites.size(); ++L)
1818 {
1819 uint64_t fwd_upstream_dna_str = left_sites[L].second.fwd_string;
1820 uint64_t fwd_upstream_key = fwd_upstream_dna_str & bottom_bit_mask;
1821
1822 assert (fwd_upstream_key < extensions.size());
1823
1824 vector<MerExtension>& fwd_exts = extensions[fwd_upstream_key];
1825 for (size_t i = 0; i < fwd_exts.size(); ++i)
1826 {
1827 const MerExtension& ext = fwd_exts[i];
1828 if (ext.right_ext_len < extension_length)
1829 continue;
1830
1831 /*
1832 < f_u_key ><ext.right>
1833 NNNNNNNNNN GT
1834 */
1835
1836 // take the top bits of the right extension
1837 uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - extension_length) << 1);
1838
1839 // and the bottom bits of the site key
1840 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1841 uint64_t top_half = fwd_upstream_key & mask;
1842
1843 // and cat them together
1844 key |= (top_half << (extension_length << 1));
1845 left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key));
1846 }
1847
1848 uint64_t rev_upstream_dna_str = left_sites[L].second.rev_string;
1849 uint64_t rev_upstream_key = (rev_upstream_dna_str & top_bit_mask) >> (64 - (key_length<<1));
1850
1851 assert (rev_upstream_key < extensions.size());
1852
1853 vector<MerExtension>& rev_exts = extensions[rev_upstream_key];
1854 for (size_t i = 0; i < rev_exts.size(); ++i)
1855 {
1856 const MerExtension& ext = rev_exts[i];
1857 if (ext.left_ext_len < extension_length)
1858 continue;
1859
1860 /*
1861 < r_u_key ><ext.left>
1862 NNNNNNNNNN GT
1863 */
1864
1865 // reverse complement the left extension, and we will need
1866 // what were the bottom bits. these become the top bits in the
1867 // rc.
1868 uint64_t ext_str = color ? rc_color_str(ext.left_dna_str) : rc_dna_str(ext.left_dna_str);
1869 ext_str >>= 64 - (ext.left_ext_len << 1);
1870
1871 // now take the top bits of the rc, make them the bottom of
1872 // the key
1873 uint64_t key = ext_str >> ((ext.left_ext_len - extension_length) << 1);
1874
1875 // now add in the seed key bottom bits, making them the top of
1876 // the key
1877 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1878 uint64_t top_half = fwd_upstream_key & mask;
1879 key |= (top_half << (extension_length << 1));
1880 left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key));
1881 }
1882 }
1883 sort (left_keys.begin(), left_keys.end());
1884 vector<ButterflyKey>::iterator new_end = unique(left_keys.begin(), left_keys.end());
1885 left_keys.erase(new_end, left_keys.end());
1886
1887 vector<ButterflyKey> right_keys;
1888 for (size_t R = 0; R < right_sites.size(); ++R)
1889 {
1890 uint64_t fwd_downstream_dna_str = right_sites[R].second.fwd_string;
1891 uint64_t fwd_downstream_key = (fwd_downstream_dna_str & top_bit_mask) >> (64 - (key_length<<1));
1892
1893 assert (fwd_downstream_key < extensions.size());
1894
1895 vector<uint64_t> fwd_downstream_keys;
1896 if (color)
1897 {
1898 for(size_t color_value = 0; color_value < 4; ++color_value)
1899 {
1900 uint64_t tmp_key = (fwd_downstream_key << 2) >> 2 | (color_value << ((key_length - 1) << 1));
1901 fwd_downstream_keys.push_back(tmp_key);
1902 }
1903 }
1904 else
1905 {
1906 fwd_downstream_keys.push_back(fwd_downstream_key);
1907 }
1908
1909 for(size_t key = 0; key < fwd_downstream_keys.size(); ++key)
1910 {
1911 uint64_t tmp_fwd_downstream_key = fwd_downstream_keys[key];
1912 vector<MerExtension>& fwd_exts = extensions[tmp_fwd_downstream_key];
1913 for (size_t i = 0; i < fwd_exts.size(); ++i)
1914 {
1915 const MerExtension& ext = fwd_exts[i];
1916 if (ext.left_ext_len < extension_length)
1917 continue;
1918
1919 /*
1920 <ext.left>< f_d_key >
1921 AG NNNNNNNNNN
1922 */
1923
1924 // take the bottom bits of the left extension, making them the
1925 // top of the key.
1926 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1927 uint64_t key = (ext.left_dna_str & mask) << (extension_length << 1);
1928
1929 // add in the top bits of the seed key, making them the bottom bits
1930 // of the key.
1931 uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1));
1932 key |= bottom_half;
1933 right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key));
1934 }
1935 }
1936
1937 uint64_t rev_downstream_dna_str = right_sites[R].second.rev_string;
1938 uint64_t rev_downstream_key = rev_downstream_dna_str & bottom_bit_mask;
1939
1940 assert (rev_downstream_key < extensions.size());
1941
1942 vector<uint64_t> rev_downstream_keys;
1943 if (color)
1944 {
1945 for(size_t color_value = 0; color_value < 4; ++color_value)
1946 {
1947 uint64_t tmp_key = (rev_downstream_key >> 2) << 2 | color_value;
1948 rev_downstream_keys.push_back(tmp_key);
1949 }
1950 }
1951 else
1952 {
1953 rev_downstream_keys.push_back(rev_downstream_key);
1954 }
1955
1956 for(size_t key = 0; key < rev_downstream_keys.size(); ++key)
1957 {
1958 uint64_t tmp_rev_downstream_key = rev_downstream_keys[key];
1959 uint64_t tmp_fwd_downstream_key = fwd_downstream_key;
1960 if (color)
1961 {
1962 tmp_fwd_downstream_key = rc_color_str(tmp_rev_downstream_key) >> (64 - (key_length << 1));
1963 }
1964
1965 vector<MerExtension>& rev_exts = extensions[tmp_rev_downstream_key];
1966 for (size_t i = 0; i < rev_exts.size(); ++i)
1967 {
1968 const MerExtension& ext = rev_exts[i];
1969 if (ext.right_ext_len < extension_length)
1970 continue;
1971
1972 /*
1973 <ext.right>< r_d_key >
1974 AG NNNNNNNNNN
1975 */
1976
1977 // reverse complement the right_extension. we want the
1978 // top bits of the extension, but these become the bottom bits
1979 // under the rc.
1980 uint64_t ext_str = color ? rc_color_str(ext.right_dna_str) : rc_dna_str(ext.right_dna_str);
1981 ext_str >>= 64 - (ext.right_ext_len << 1);
1982
1983 // take the bottom bits of the rc and make it the top of the key
1984 uint64_t key = ext_str << (extension_length << 1);
1985
1986 // take the top bits of the seed key and make them the bottom
1987 // of the key.
1988 uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1));
1989 key |= bottom_half;
1990
1991 right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key));
1992 }
1993 }
1994 }
1995
1996 sort (right_keys.begin(), right_keys.end());
1997 new_end = unique(right_keys.begin(), right_keys.end());
1998 right_keys.erase(new_end, right_keys.end());
1999
2000 size_t lk = 0;
2001 size_t rk = 0;
2002
2003 while (lk < left_keys.size() && rk < right_keys.size())
2004 {
2005 while (lk < left_keys.size() &&
2006 left_keys[lk].key < right_keys[rk].key) { ++lk; }
2007
2008 if (lk == left_keys.size())
2009 break;
2010
2011 while (rk < right_keys.size() &&
2012 right_keys[rk].key < left_keys[lk].key) { ++rk; }
2013
2014 if (rk == right_keys.size())
2015 break;
2016
2017 if (lk < left_keys.size() && rk < right_keys.size() &&
2018 right_keys[rk].key == left_keys[lk].key)
2019 {
2020
2021 size_t k = right_keys[rk].key;
2022 size_t lk_end = lk;
2023 size_t rk_end = rk;
2024 while (rk_end < right_keys.size() && right_keys[rk_end].key == k) {++rk_end;}
2025 while (lk_end < left_keys.size() && left_keys[lk_end].key == k) {++lk_end;}
2026
2027 size_t tmp_lk = lk;
2028
2029 while (tmp_lk < lk_end)
2030 {
2031 size_t tmp_rk = rk;
2032 while (tmp_rk < rk_end)
2033 {
2034 int donor = (int)left_keys[tmp_lk].pos - 1;
2035 int acceptor = (int)right_keys[tmp_rk].pos + 2;
2036
2037 if (acceptor - donor > min_intron && acceptor - donor < max_intron)
2038 {
2039 Junction j(ref_id,
2040 donor,
2041 acceptor,
2042 antisense,
2043 acceptor - donor); // just prefer shorter introns
2044 juncs.insert(j);
2045 if (juncs.size() > max_juncs)
2046 {
2047 juncs.erase(*(juncs.rbegin()));
2048 }
2049 }
2050 ++tmp_rk;
2051 }
2052
2053 ++tmp_lk;
2054 }
2055
2056 lk = lk_end;
2057 rk = rk_end;
2058 }
2059 }
2060 }
2061 }
2062 };
2063
2064 template <class JunctionRecorder>
2065 void juncs_from_ref_segs(RefSequenceTable& rt,
2066 vector<RefSeg>& expected_don_acc_windows,
2067 PotentialJuncs& juncs,
2068 const DnaString& donor_dinuc,
2069 const DnaString& acceptor_dinuc,
2070 int max_intron,
2071 int min_intron,
2072 size_t max_juncs,
2073 bool talkative,
2074 size_t half_splice_mer_len)
2075 {
2076
2077 typedef map<uint32_t, IntronMotifs> MotifMap;
2078
2079 MotifMap ims;
2080
2081 seqan::DnaStringReverseComplement rev_donor_dinuc(donor_dinuc);
2082 seqan::DnaStringReverseComplement rev_acceptor_dinuc(acceptor_dinuc);
2083
2084 if (talkative)
2085 fprintf(stderr, "Collecting potential splice sites in islands\n");
2086
2087 //
2088 bool all_both = true;
2089 for (size_t r = 0; r < expected_don_acc_windows.size(); ++r)
2090 {
2091 const RefSeg& seg = expected_don_acc_windows[r];
2092
2093 if (seg.points_where != POINT_DIR_BOTH)
2094 all_both = false;
2095
2096 RefSequenceTable::Sequence* ref_str = rt.get_seq(seg.ref_id);
2097
2098 if (!ref_str)
2099 continue;
2100
2101 bool skip_fwd = false;
2102 bool skip_rev = false;
2103
2104 if (library_type == FR_FIRSTSTRAND)
2105 {
2106 if (seg.read == READ_LEFT)
2107 {
2108 if (seg.antisense) skip_rev = true;
2109 else skip_fwd = true;
2110 }
2111 else if(seg.read == READ_RIGHT)
2112 {
2113 if (seg.antisense) skip_fwd = true;
2114 else skip_rev = true;
2115 }
2116 }
2117
2118 if (library_type == FR_SECONDSTRAND)
2119 {
2120 if (seg.read == READ_LEFT)
2121 {
2122 if (seg.antisense) skip_fwd = true;
2123 else skip_rev = true;
2124 }
2125 else if(seg.read == READ_RIGHT)
2126 {
2127 if (seg.antisense) skip_rev = true;
2128 else skip_fwd = true;
2129 }
2130 }
2131
2132 pair<map<uint32_t, IntronMotifs>::iterator, bool> ret =
2133 ims.insert(make_pair(seg.ref_id, IntronMotifs(seg.ref_id)));
2134
2135 IntronMotifs& motifs = ret.first->second;
2136
2137 int left_color_offset = 0, right_color_offset = 0;
2138 if (color)
2139 {
2140 if (seg.antisense)
2141 right_color_offset = 1;
2142 else
2143 left_color_offset = -1;
2144 }
2145
2146 if (seg.left + left_color_offset < 0 || seg.right + right_color_offset >= (int)length(*ref_str) - 1)
2147 continue;
2148
2149 DnaString org_seg_str = seqan::infix(*ref_str, seg.left + left_color_offset, seg.right + right_color_offset);
2150
2151 String<char> seg_str;
2152 assign(seg_str, org_seg_str);
2153
2154 // daehwan
2155 if (bDebug)
2156 {
2157 cout << "coord: " << seg.left << " " << seg.right << endl
2158 << "seg_str: " << seg_str << endl;
2159 }
2160
2161 if (color)
2162 {
2163 bool remove_primer = true;
2164 seg_str = convert_bp_to_color(org_seg_str, remove_primer);
2165 }
2166
2167 size_t to = 0;
2168 size_t seg_len = length(seg_str);
2169 size_t read_len = seg.support_read.size();
2170 if (read_len <= 0)
2171 to = seg_len - 2;
2172 else
2173 to = read_len - 2;
2174
2175 const size_t max_segment_len = 128;
2176 uint8_t left_mismatches[max_segment_len] = {0,};
2177 uint8_t right_mismatches[max_segment_len] = {0,};
2178
2179 if (max_segment_len < read_len)
2180 {
2181 fprintf(stderr, "Error: read len(%d) is greater than %d\n", (int)read_len, (int)max_segment_len);
2182 exit(-1);
2183 }
2184
2185 if (read_len == seg_len || seg.points_where == POINT_DIR_BOTH)
2186 {
2187 if (seg.points_where == POINT_DIR_RIGHT || seg.points_where == POINT_DIR_BOTH)
2188 {
2189 size_t num_mismatches = 0;
2190 for (size_t i = 0; i < read_len - 1; ++i)
2191 {
2192 if (seg_str[i] != seg.support_read[i])
2193 ++num_mismatches;
2194
2195 left_mismatches[i] = num_mismatches;
2196 if (num_mismatches > 2)
2197 {
2198 to = i;
2199 break;
2200 }
2201 }
2202 }
2203
2204 if (seg.points_where == POINT_DIR_LEFT || seg.points_where == POINT_DIR_BOTH)
2205 {
2206 size_t num_mismatches = 0;
2207 for (int i = read_len - 1; i >= 0; --i)
2208 {
2209 if (seg_str[i + (seg_len - read_len)] != seg.support_read[i])
2210 ++num_mismatches;
2211
2212 right_mismatches[i] = num_mismatches;
2213 if (num_mismatches > 2)
2214 break;
2215 }
2216 }
2217
2218 // daehwan
2219 if (bDebug)
2220 {
2221 cout << "antisense: " << (seg.antisense ? "-" : "+") << endl
2222 << seqan::infix(seg_str, 0, segment_length) << " "
2223 << seqan::infix(seg_str, length(seg_str) - segment_length, length(seg_str)) << endl
2224 << seg.support_read << endl
2225 << 0 << " - " << to << endl;
2226
2227 for (unsigned int i = 0; i < read_len; ++i)
2228 cout << (int)left_mismatches[i];
2229 cout << "\t";
2230
2231 for (unsigned int i = 0; i < read_len; ++i)
2232 cout << (int)right_mismatches[i];
2233 cout << endl;
2234 }
2235 }
2236
2237 if (seg.points_where == POINT_DIR_BOTH)
2238 {
2239 for (size_t i = 0; i <= to; ++i)
2240 {
2241 // Look at a slice of the reference without creating a copy.
2242 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2243
2244 if ((!skip_fwd && curr == donor_dinuc) || (!skip_rev && curr == rev_acceptor_dinuc))
2245 {
2246 DnaString partner;
2247 if (curr == donor_dinuc)
2248 partner = acceptor_dinuc;
2249 else
2250 partner = rev_donor_dinuc;
2251
2252 uint8_t left_mismatch = 0;
2253 if (i > 0)
2254 left_mismatch = left_mismatches[i-1];
2255
2256 // daehwan
2257 if (bDebug)
2258 {
2259 cout << "i: " << i << endl
2260 << "mismatches: " << (int)left_mismatch
2261 << " - " << (int)right_mismatches[i] << endl;
2262 }
2263
2264 if (left_mismatch + right_mismatches[i] <= 2)
2265 {
2266 size_t pos = length(seg_str) - (read_len - i) - 2;
2267 if (partner == seqan::infix(org_seg_str, pos - left_color_offset, pos + 2 - left_color_offset))
2268 {
2269 if (curr == donor_dinuc)
2270 {
2271 motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2272 motifs.fwd_acceptors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0)));
2273 }
2274 else
2275 {
2276 motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2277 motifs.rev_donors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0)));
2278 }
2279
2280 // daehwan
2281 if (bDebug)
2282 {
2283 cout << curr << ":" << partner << " added" << endl;
2284 }
2285 }
2286 }
2287 }
2288 }
2289 }
2290
2291 else if (seg.points_where == POINT_DIR_LEFT)
2292 {
2293 // A ref segment that "points left" is one that was flanked
2294 // on the right by a partial bowtie hit, indicating that we
2295 // should be looking for an intron to the left of the hit
2296 // In this seg, that means either an "AG" or an "AC"
2297 for (size_t i = 0; i <= to; ++i)
2298 {
2299 // Look at a slice of the reference without creating a copy.
2300 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2301
2302 if (curr == acceptor_dinuc && !skip_fwd)
2303 motifs.fwd_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2304 else if (curr == rev_donor_dinuc && !skip_rev)
2305 motifs.rev_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2306 }
2307 }
2308 else
2309 {
2310 // A right pointing ref seg wants either a "GT" or a "CT"
2311 for (size_t i = 0; i <= to; ++i)
2312 {
2313 // Look at a slice of the reference without creating a copy.
2314 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2315
2316 if (curr == donor_dinuc && !skip_fwd)
2317 motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2318 else if (curr == rev_acceptor_dinuc && !skip_rev)
2319 motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2320 }
2321 }
2322 }
2323
2324 if (talkative)
2325 {
2326 fprintf(stderr, "reporting synthetic splice junctions...\n");
2327 }
2328 for (MotifMap::iterator motif_itr = ims.begin(); motif_itr != ims.end(); ++motif_itr)
2329 {
2330 uint32_t ref_id = motif_itr->first;
2331
2332 RefSequenceTable::Sequence* ref_str = rt.get_seq(ref_id);
2333
2334 if (!ref_str)
2335 err_die("Error: couldn't get ref string for %u\n", ref_id);
2336
2337 if (talkative)
2338 fprintf(stderr, "Examining donor-acceptor pairings in %s\n", rt.get_name(ref_id));
2339 IntronMotifs& motifs = motif_itr->second;
2340
2341 if (!all_both)
2342 motifs.unique();
2343
2344 //motifs.attach_mer_counts(*ref_str);
2345 motifs.attach_mers(*ref_str);
2346
2347 vector<pair<size_t, DnaSpliceStrings> >& fwd_donors = motifs.fwd_donors;
2348 vector<pair<size_t, DnaSpliceStrings> >& fwd_acceptors = motifs.fwd_acceptors;
2349 vector<pair<size_t, DnaSpliceStrings> >& rev_acceptors = motifs.rev_acceptors;
2350 vector<pair<size_t, DnaSpliceStrings> >& rev_donors = motifs.rev_donors;
2351
2352 //const char* ref_name = rt.get_name(motif_itr->second.ref_id);
2353
2354 JunctionRecorder recorder;
2355 recorder.record(ref_id,
2356 fwd_donors,
2357 fwd_acceptors,
2358 false,
2359 juncs,
2360 min_intron,
2361 max_intron,
2362 max_juncs,
2363 half_splice_mer_len);
2364
2365 recorder.record(ref_id,
2366 rev_acceptors,
2367 rev_donors,
2368 true,
2369 juncs,
2370 min_intron,
2371 max_intron,
2372 max_juncs,
2373 half_splice_mer_len);
2374 }
2375 //fprintf(stderr, "Found %d total splices\n", num_juncs);
2376 }
2377
2378
2379 /**
2380 * Performs a simple global alignment.
2381 * This function will perform a restricted global alignment. The restriction is that only one insertion/deletion
2382 * is allowed in the final alignment.
2383 * @param shortSequence The short sequence to be aligned.
2384 * @param leftReference The left end of the reference to be aligned, must be exactly as long as the short sequence
2385 * @param leftReference The right end of the reference to be aligned, must be exactly as long as the short sequenc
2386 * @param insertPosition This will contain the 0-based index of the first position in the shorter sequence after the insertion/deletion. A value of -1 indicates that the alignment could not be performed.
2387 * @param mismatchCount This will contain the number of mismatches in the optimal restricted global alignment. The number and length of insertions/deletions is fixed. A value of -1 indicates that the alignment could not be performed.
2388 */
2389 void simpleSplitAlignment(seqan::String<char>& shorterSequence,
2390 seqan::String<char>& leftReference,
2391 seqan::String<char>& rightReference,
2392 int& insertPosition,
2393 int& mismatchCount)
2394 {
2395 /*
2396 * In this restricted alignment, we already know the length and number (1) of insertions/deletions.
2397 * We simply need to know where to put it. Do a linear scan through sequence counting the number of induced
2398 * errors before and after putting the insertion at each sequence.
2399 */
2400
2401 /*
2402 * Note that we could have a case, where both the alignment and the read have the unknonw
2403 * nucleotide ('N') and we don't want to reward cases where these characters match
2404 */
2405 vector<unsigned short> beforeErrors(seqan::length(shorterSequence));
2406 for(int idx = seqan::length(shorterSequence) - 1; idx >= 0; idx -= 1){
2407 unsigned short prevCount = 0;
2408 /*
2409 * We guarentee idx >= 0, so cast to hide the compiler
2410 * warning here
2411 */
2412 if(((size_t)idx) < seqan::length(shorterSequence) - 1){
2413 prevCount = beforeErrors.at(idx + 1);
2414 }
2415 unsigned short currentMismatch = 0;
2416 if(rightReference[idx] == 'N' || shorterSequence[idx] == 'N' || rightReference[idx] != shorterSequence[idx]){
2417 currentMismatch = 1;
2418 }
2419 beforeErrors.at(idx) = prevCount + currentMismatch;
2420 }
2421
2422
2423 vector<unsigned short> afterErrors(seqan::length(shorterSequence));
2424 for(size_t idx = 0; idx < seqan::length(shorterSequence) ; idx += 1){
2425 unsigned short prevCount = 0;
2426 if(idx > 0){
2427 prevCount = afterErrors.at(idx - 1);
2428 }
2429 unsigned short currentMismatch = 0;
2430 if(leftReference[idx] == 'N' || shorterSequence[idx] == 'N' || leftReference[idx] != shorterSequence[idx]){
2431 currentMismatch = 1;
2432 }
2433 afterErrors.at(idx) = prevCount + currentMismatch;
2434 }
2435
2436
2437 mismatchCount = seqan::length(shorterSequence) + 1;
2438 insertPosition = -1;
2439
2440 /*
2441 * Technically, we could allow the insert position to be at the end or beginning of the sequence,
2442 * but we are disallowing it here
2443 */
2444 for(size_t currentInsertPosition = 1; currentInsertPosition < seqan::length(shorterSequence); currentInsertPosition += 1){
2445 size_t errorCount = beforeErrors.at(currentInsertPosition) + afterErrors.at(currentInsertPosition - 1);
2446 if(((int)errorCount) < mismatchCount){
2447 mismatchCount = (int)errorCount;
2448 insertPosition = currentInsertPosition;
2449 }
2450 }
2451 return;
2452 }
2453
2454
2455 /**
2456 * Try to detect a small insertion.
2457 * This code will try to identify a small insertion based on the ungapped alignment of two neighboring
2458 * segments. The general idea is to try to realign the local region, and see if we can reduce the
2459 * number of errors. Note that the function makes use of the global parameter "max_insertion_length" to limit the maximum
2460 * size of a detected insertion.
2461 * @param rt Sequence table used to lookup sequence information
2462 * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit.
2463 * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit.
2464 * @param insertions If an insertion is sucessfully detected, it will be added to this set
2465 */
2466 void detect_small_insertion(RefSequenceTable& rt,
2467 seqan::String<char>& read_sequence,
2468 BowtieHit& leftHit,
2469 BowtieHit& rightHit,
2470 std::set<Insertion>& insertions)
2471 {
2472
2473 RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id());
2474 if(!ref_str){
2475 fprintf(stderr, "Error in accessing sequence record\n");
2476 }else{
2477 size_t read_length = seqan::length(read_sequence);
2478 int begin_offset = 0;
2479 int end_offset = 0;
2480
2481 if(color){
2482 if(leftHit.antisense_align())
2483 end_offset = 1;
2484 else
2485 begin_offset = -1;
2486 }
2487
2488 if(leftHit.left() + begin_offset < 0)
2489 return;
2490
2491 /*
2492 * If there is in fact a deletion, we are expecting the genomic sequence to be shorter than
2493 * the actual read sequence
2494 */
2495 int discrepancy = read_length - (rightHit.right() - leftHit.left());
2496 DnaString genomic_sequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, rightHit.right() + end_offset);
2497 String<char> genomic_sequence;
2498 assign(genomic_sequence, genomic_sequence_temp);
2499
2500 if(color)
2501 genomic_sequence = convert_bp_to_color(genomic_sequence, true);
2502
2503 String<char> left_read_sequence = seqan::infix(read_sequence, 0, 0 + seqan::length(genomic_sequence));
2504 String<char> right_read_sequence = seqan::infix(read_sequence, read_length - seqan::length(genomic_sequence), read_length);
2505
2506 int bestInsertPosition = -1;
2507 int minErrors = -1;
2508 simpleSplitAlignment(genomic_sequence, left_read_sequence, right_read_sequence, bestInsertPosition, minErrors);
2509
2510 /*
2511 * Need to decide if the insertion is suitably improves the alignment
2512 */
2513 /*
2514 * If these two segments anchors constitue the entire read, then we require
2515 * that this alignment actually improve the number of errors observed in the alignment
2516 * Otherwise, it is OK as long as the number of errors doesn't increase.
2517 */
2518 int adjustment = 0;
2519 if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){
2520 adjustment = -1;
2521 }
2522 if(minErrors <= (leftHit.edit_dist()+rightHit.edit_dist()+adjustment)){
2523 String<char> insertedSequence = seqan::infix(left_read_sequence, bestInsertPosition, bestInsertPosition + discrepancy);
2524 if(color)
2525 insertedSequence = convert_color_to_bp(genomic_sequence_temp[bestInsertPosition - begin_offset + end_offset - 1], insertedSequence);
2526
2527 insertions.insert(Insertion(leftHit.ref_id(),
2528 leftHit.left() + bestInsertPosition - 1 + end_offset,
2529 seqan::toCString(insertedSequence)));
2530 }
2531 }
2532 return;
2533 }
2534
2535 /**
2536 * Try to detect a small deletion.
2537 * This code will try to identify a small deletion based on the ungapped alignment of two neighboring
2538 * segments. The general idea is to try to realign the local region, and see if we can reduce the
2539 * number of errors. Note that the function makes use of the global parameter "max_deletion_length" to limit the maximum
2540 * size of a detected deletion.
2541 * @param rt Sequence table used to lookup sequence information
2542 * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit.
2543 * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit.
2544 * @param deletion_juncs If a deletion is sucessfully detected, it will be added to this set
2545 */
2546 void detect_small_deletion(RefSequenceTable& rt,
2547 seqan::String<char>& read_sequence,
2548 BowtieHit& leftHit,
2549 BowtieHit& rightHit,
2550 std::set<Deletion>& deletions)
2551 {
2552
2553 RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id());
2554 if(!ref_str){
2555 fprintf(stderr, "Error in accessing sequence record\n");
2556 }else{
2557 int begin_offset = 0;
2558 int end_offset = 0;
2559
2560 if(color){
2561 if(leftHit.antisense_align())
2562 end_offset = 1;
2563 else
2564 begin_offset = -1;
2565 }
2566
2567 if(leftHit.left() + begin_offset < 0)
2568 return;
2569
2570 size_t read_length = seqan::length(read_sequence);
2571 if(rightHit.right() + read_length + begin_offset < 0 )
2572 return;
2573
2574 int discrepancy = (rightHit.right() - leftHit.left()) - read_length;
2575 Dna5String leftGenomicSequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, leftHit.left() + read_length + end_offset);
2576 Dna5String rightGenomicSequence_temp = seqan::infix(*ref_str, rightHit.right() - read_length + begin_offset, rightHit.right() + end_offset);
2577
2578 if (length(leftGenomicSequence_temp) < read_length || length(rightGenomicSequence_temp) < read_length)
2579 return;
2580
2581 String<char> leftGenomicSequence;
2582 assign(leftGenomicSequence, leftGenomicSequence_temp);
2583
2584 String<char> rightGenomicSequence;
2585 assign(rightGenomicSequence, rightGenomicSequence_temp);
2586
2587 if(color){
2588 leftGenomicSequence = convert_bp_to_color(leftGenomicSequence, true);
2589 rightGenomicSequence = convert_bp_to_color(rightGenomicSequence, true);
2590 }
2591
2592 int bestInsertPosition = -1;
2593 int minErrors = -1;
2594
2595 simpleSplitAlignment(read_sequence, leftGenomicSequence, rightGenomicSequence, bestInsertPosition, minErrors);
2596
2597 /*
2598 * Need to decide if the deletion is suitably improves the alignment
2599 */
2600 int adjustment = 0;
2601
2602 /*
2603 * If these two segments anchors constitue the entire read, then we require
2604 * that this alignment actually improve the number of errors observed in the alignment
2605 * Otherwise, it is OK as long as the number of errors doesn't increase.
2606 */
2607 if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){
2608 adjustment = -1;
2609 }
2610 if(minErrors <= (leftHit.edit_dist()+rightHit.edit_dist()+adjustment)){
2611 deletions.insert(Deletion(leftHit.ref_id(),
2612 leftHit.left() + bestInsertPosition - 1 + end_offset,
2613 leftHit.left() + bestInsertPosition + discrepancy + end_offset,
2614 false));
2615 }
2616 }
2617 return;
2618 }
2619
2620
2621 void find_insertions_and_deletions(RefSequenceTable& rt,
2622 FILE* reads_file,
2623 vector<HitsForRead>& hits_for_read,
2624 std::set<Deletion>& deletions,
2625 std::set<Insertion>& insertions){
2626
2627 if(hits_for_read.empty()){
2628 return;
2629 }
2630
2631 size_t last_segment = hits_for_read.size()-1;
2632 size_t first_segment = 0;
2633 if(last_segment == first_segment){
2634 return;
2635 }
2636
2637 /*
2638 * We can check up front whether the first or last element is empty
2639 * and avoid doing any more work. Note that the following code requires
2640 * that there be at least one elment in each
2641 */
2642 if(hits_for_read[first_segment].hits.empty() || hits_for_read[last_segment].hits.empty()){
2643 return;
2644 }
2645
2646 char read_name[1024];
2647 char read_seq[1024];
2648 char read_alt_name[1024];
2649 char read_quals[1024];
2650
2651 /*
2652 * Need to identify the appropriate insert id for this group of reads
2653 */
2654
2655 bool got_read = get_read_from_stream(hits_for_read.back().insert_id,
2656 reads_file,
2657 FASTQ,
2658 false,
2659 read_name,
2660 read_seq,
2661 read_alt_name,
2662 read_quals);
2663 if(!got_read){
2664 return;
2665 }
2666
2667 /*
2668 * Work through all combinations of mappings for the first and last segment to see if any are indicative
2669 * of a small insertions or deletion
2670 */
2671 HitsForRead& left_segment_hits = hits_for_read[first_segment];
2672 HitsForRead& right_segment_hits = hits_for_read[last_segment];
2673
2674 /*
2675 * If either of the segment match lists is empty, we could try
2676 * to be smarter and work our way in until we find good a segment
2677 * match; however, won't do that for noe.
2678 */
2679 if(left_segment_hits.hits.empty() || right_segment_hits.hits.empty()){
2680 return;
2681 }
2682
2683 seqan::String<char> fullRead, rcRead;
2684 if(color){
2685 fullRead = read_seq + 1;
2686 rcRead = fullRead;
2687 seqan::reverseInPlace(rcRead);
2688 }else{
2689 fullRead = read_seq;
2690 rcRead = read_seq;
2691 seqan::convertInPlace(rcRead, seqan::FunctorComplement<Dna>());
2692 seqan::reverseInPlace(rcRead);
2693 }
2694
2695 size_t read_length = seqan::length(fullRead);
2696 for(size_t left_segment_index = 0; left_segment_index < left_segment_hits.hits.size(); left_segment_index++){
2697 for(size_t right_segment_index = 0; right_segment_index < right_segment_hits.hits.size(); right_segment_index++){
2698 BowtieHit* leftHit = &left_segment_hits.hits[left_segment_index];
2699 BowtieHit* rightHit = &right_segment_hits.hits[right_segment_index];
2700 /*
2701 * Now we have found a pair of segment hits to investigate. Need to ensure
2702 * that
2703 * 1. the alignment orientation is consistent
2704 * 2. the distance separation is in the appropriate range
2705 * 3. Both hits are aligned to the same contig
2706 */
2707 if(leftHit->ref_id() != rightHit->ref_id()){
2708 continue;
2709 }
2710
2711 if(leftHit->antisense_align() != rightHit->antisense_align()){
2712 continue;
2713 }
2714
2715 seqan::String<char>* modifiedRead = &fullRead;
2716 /*
2717 * If we are dealing with an antisense alignment, then the left
2718 * read will actually be on the right, fix this now, to simplify
2719 * the rest of the logic, in addition, we will need to use the reverse
2720 * complement of the read sequence
2721 */
2722 if(leftHit->antisense_align()){
2723 BowtieHit * tmp = leftHit;
2724 leftHit = rightHit;
2725 rightHit = tmp;
2726 modifiedRead = &rcRead;
2727 }
2728
2729 size_t apparent_length = rightHit->right() - leftHit->left();
2730 int length_discrepancy = apparent_length - read_length;
2731 if(length_discrepancy > 0 && length_discrepancy <= (int)max_deletion_length){
2732 /*
2733 * Search for a deletion
2734 */
2735 detect_small_deletion(rt, *modifiedRead, *leftHit, *rightHit, deletions);
2736 }
2737 if(length_discrepancy < 0 && length_discrepancy >= -(int)max_insertion_length){
2738 /*
2739 * Search for an insertion
2740 */
2741 detect_small_insertion(rt, *modifiedRead, *leftHit, *rightHit, insertions);
2742 }
2743 }
2744 }
2745 }
2746
2747
2748 void find_gaps(RefSequenceTable& rt,
2749 FILE* reads_file,
2750 vector<HitsForRead>& hits_for_read,
2751 std::set<Junction, skip_count_lt>& seg_juncs,
2752 eREAD read)
2753 {
2754 if (hits_for_read.empty())
2755 return;
2756
2757 size_t last_non_empty = hits_for_read.size() - 1;
2758 while(last_non_empty >= 0 && hits_for_read[last_non_empty].hits.empty())
2759 --last_non_empty;
2760
2761 hits_for_read.resize(last_non_empty + 1);
2762
2763 char read_name[1024];
2764 char read_seq[1024];
2765 char read_alt_name[1024];
2766 char read_quals[1024];
2767 bool got_read = get_read_from_stream(hits_for_read[last_non_empty].insert_id,
2768 reads_file,
2769 FASTQ,
2770 false,
2771 read_name,
2772 read_seq,
2773 read_alt_name,
2774 read_quals);
2775
2776 if (!got_read)
2777 return;
2778
2779 vector<RefSeg> expected_don_acc_windows;
2780 string seq(read_seq);
2781
2782 for (size_t s = 0; s < hits_for_read.size(); ++s)
2783 {
2784 HitsForRead& curr = hits_for_read[s];
2785 for (size_t h = 0; h < curr.hits.size(); ++h)
2786 {
2787 bool found_right_partner = s == hits_for_read.size() - 1;
2788 bool found_distant_right_partner = false;
2789 bool found_right_right_partner = false;
2790 BowtieHit& bh = curr.hits[h];
2791
2792 // "dr" is distant right partner
2793 // "rr" is right of right partner
2794 BowtieHit dr_bh, rr_bh;
2795 if (s < hits_for_read.size() - 1)
2796 {
2797 // Look for a right partner for the current hit
2798 HitsForRead& right = hits_for_read[s + 1];
2799
2800 for (size_t r = 0; r < right.hits.size(); ++r)
2801 {
2802 BowtieHit& rh = right.hits[r];
2803 if (bh.antisense_align() != rh.antisense_align() || bh.ref_id() != rh.ref_id())
2804 continue;
2805
2806 if ((bh.antisense_align() && rh.right() == bh.left()) ||
2807 (!bh.antisense_align() && bh.right() == rh.left() ))
2808 {
2809 found_right_partner = true;
2810 break;
2811 }
2812
2813 int dist = 0;
2814 if (bh.antisense_align())
2815 dist = bh.left() - rh.right();
2816 else
2817 dist = rh.left() - bh.right();
2818
2819 if (dist >= min_segment_intron_length && dist < (int)max_segment_intron_length)
2820 {
2821 found_distant_right_partner = true;
2822 dr_bh = rh;
2823 }
2824 }
2825 }
2826
2827 if (!found_right_partner && s < hits_for_read.size() - 2)
2828 {
2829 // Look for a right of right partner for the current hit
2830 HitsForRead& right_right = hits_for_read[s + 2];
2831
2832 for (size_t r = 0; r < right_right.hits.size(); ++r)
2833 {
2834 BowtieHit& rrh = right_right.hits[r];
2835 if (bh.antisense_align() != rrh.antisense_align() || bh.ref_id() != rrh.ref_id())
2836 continue;
2837
2838 int dist = 0;
2839 if (bh.antisense_align())
2840 dist = bh.left() - rrh.right();
2841 else
2842 dist = rrh.left() - bh.right();
2843
2844 if (dist >= min_segment_intron_length + segment_length && dist < (int)max_segment_intron_length + segment_length)
2845 {
2846 found_right_right_partner = true;
2847 rr_bh = rrh;
2848 break;
2849 }
2850 }
2851 }
2852
2853 if (!found_right_partner && (found_distant_right_partner || found_right_right_partner))
2854 {
2855 size_t color_offset = color ? 1 : 0;
2856
2857 string support_read;
2858 if (found_distant_right_partner)
2859 support_read = seq.substr(color_offset + (s+1) * segment_length - 5, 10);
2860 else
2861 support_read = seq.substr(color_offset + (s+1) * segment_length - 5, segment_length + 10);
2862
2863 BowtieHit& d_bh = found_distant_right_partner ? dr_bh : rr_bh;
2864 if (!bh.antisense_align())
2865 {
2866 RefSeg right_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read, 0, 0, support_read);
2867 right_seg.left = max(0, bh.right() - 5);
2868 right_seg.right = d_bh.left() + 5;
2869 expected_don_acc_windows.push_back(right_seg);
2870 }
2871 else
2872 {
2873 if (color)
2874 reverse(support_read.begin(), support_read.end());
2875 else
2876 reverse_complement(support_read);
2877
2878 RefSeg left_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read, 0, 0, support_read);
2879 left_seg.left = d_bh.right() - 5;
2880 left_seg.right = bh.left() + 5; // num allowed bowtie mismatches
2881 expected_don_acc_windows.push_back(left_seg);
2882 }
2883
2884 // daehwan
2885 if (bDebug)
2886 {
2887 cout << (bh.antisense_align() ? "-" : "+") << endl
2888 << seq << endl
2889 << "(" << s << ") - " << support_read << endl;
2890 }
2891 }
2892 }
2893 }
2894
2895 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
2896 expected_don_acc_windows,
2897 seg_juncs,
2898 "GT",
2899 "AG",
2900 max_segment_intron_length,
2901 min_segment_intron_length,
2902 max_seg_juncs,
2903 false,
2904 0);
2905
2906 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
2907 expected_don_acc_windows,
2908 seg_juncs,
2909 "GC",
2910 "AG",
2911 max_segment_intron_length,
2912 min_segment_intron_length,
2913 max_seg_juncs,
2914 false,
2915 0);
2916
2917 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
2918 expected_don_acc_windows,
2919 seg_juncs,
2920 "AT",
2921 "AC",
2922 max_segment_intron_length,
2923 min_segment_intron_length,
2924 max_seg_juncs,
2925 false,
2926 0);
2927 }
2928
2929
2930 MerTable mer_table;
2931
2932 int seed_alignments = 0;
2933 int microaligned_segs = 0;
2934
2935 map<RefSeg, vector<string>* > microexon_windows;
2936
2937 bool overlap_in_genome(int ll, int lr, int rl, int rr)
2938 {
2939 if (ll >= rl && ll < rr)
2940 return true;
2941 if (lr > rl && lr < rr)
2942 return true;
2943 if (rl >= ll && rl < lr)
2944 return true;
2945 if (rr > ll && rr < lr)
2946 return true;
2947 return false;
2948 }
2949
2950 void add_to_microexon_windows(uint32_t ref_id,
2951 int left_boundary,
2952 int right_boundary,
2953 const string& dna_str,
2954 eREAD read)
2955 {
2956 RefSeg left_dummy(ref_id, POINT_DIR_DONTCARE, false, read, left_boundary, right_boundary);
2957 RefSeg right_dummy(ref_id, POINT_DIR_DONTCARE, false, read, right_boundary, right_boundary + 1);
2958
2959 map<RefSeg, vector<string>* >::iterator lb = microexon_windows.lower_bound(left_dummy);
2960 map<RefSeg, vector<string>* >::iterator ub = microexon_windows.lower_bound(right_dummy);
2961 vector<string>* new_vec = NULL;
2962 if (lb == microexon_windows.end())
2963 {
2964 microexon_windows.insert(make_pair(left_dummy, new vector<string>(1, dna_str)));
2965 return;
2966 }
2967
2968 map<RefSeg, vector<string>* >::iterator first_to_be_erased = microexon_windows.end();
2969 map<RefSeg, vector<string>* >::iterator last_to_be_erased = ub;
2970
2971 while (lb != ub)
2972 {
2973 // everyone in this range that overlaps with the new interval needs to
2974 // be merged together.
2975 if (overlap_in_genome(lb->first.left, lb->first.right, left_boundary, right_boundary))
2976 {
2977 if (!new_vec)
2978 new_vec = new vector<string>();
2979 if (first_to_be_erased == microexon_windows.end())
2980 first_to_be_erased = lb;
2981 left_dummy.left = min(lb->first.left, left_boundary);
2982 left_dummy.right = max(lb->first.right, right_boundary);
2983
2984 new_vec->insert(new_vec->end(), lb->second->begin(), lb->second->end());
2985 delete lb->second;
2986 }
2987 else if (first_to_be_erased != microexon_windows.end())
2988 {
2989 last_to_be_erased = lb;
2990 }
2991
2992 ++lb;
2993 }
2994
2995 if (first_to_be_erased != microexon_windows.end())
2996 {
2997 microexon_windows.erase(first_to_be_erased, last_to_be_erased);
2998 }
2999
3000 if (!new_vec)
3001 {
3002 // never found an overlapping window, so just add this one and bail
3003 microexon_windows.insert(make_pair(left_dummy, new vector<string>(1, dna_str)));
3004 return;
3005 }
3006 else
3007 {
3008 new_vec->push_back(dna_str);
3009 microexon_windows.insert(make_pair(left_dummy, new_vec));
3010 return;
3011 }
3012
3013 }
3014
3015 void align_microexon_segs(RefSequenceTable& rt,
3016 std::set<Junction, skip_count_lt>& juncs,
3017 int max_juncs,
3018 int half_splice_mer_len)
3019 {
3020 int num_segments = 0;
3021 for (map<RefSeg, vector<string>* >::iterator itr = microexon_windows.begin();
3022 itr != microexon_windows.end(); ++itr)
3023 {
3024 vector<string>& unaligned_segments = *itr->second;
3025 num_segments += unaligned_segments.size();
3026 }
3027
3028 fprintf(stderr, "Aligning %d microexon segments in %lu windows\n",
3029 num_segments, (long unsigned int)microexon_windows.size());
3030
3031 extensions.clear();
3032
3033 size_t splice_mer_len = 2 * half_splice_mer_len;
3034 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
3035
3036 extensions.resize(mer_table_size);
3037
3038 int window_num = 0;
3039 for (map<RefSeg, vector<string>* >::iterator itr = microexon_windows.begin();
3040 itr != microexon_windows.end(); ++itr)
3041 {
3042 window_num++;
3043 if ((window_num % 100) == 0)
3044 fprintf(stderr, "\twindow %d\n",window_num);
3045
3046
3047 stringstream ss(stringstream::in | stringstream::out);
3048
3049 for (size_t j = 0; j < extensions.size(); ++j)
3050 {
3051 extensions[j].clear();
3052 }
3053
3054 vector<string>& unaligned_segments = *itr->second;
3055
3056 for (size_t j = 0; j < unaligned_segments.size(); ++j)
3057 {
3058 stringstream ss(stringstream::in | stringstream::out);
3059 string s;
3060 //cerr << w.unaligned_segments[j];
3061 ss << unaligned_segments[j];
3062 ss >> s;
3063
3064 store_read_extensions(extensions,
3065 half_splice_mer_len,
3066 half_splice_mer_len,
3067 s,
3068 false);
3069 }
3070
3071 vector<RefSeg> segs;
3072 segs.push_back(itr->first);
3073 RefSeg r = itr->first;
3074 r.points_where = POINT_DIR_LEFT;
3075 segs.push_back(r);
3076
3077 juncs_from_ref_segs<RecordExtendableJuncs>(rt,
3078 segs,
3079 juncs,
3080 "GT",
3081 "AG",
3082 max_microexon_stretch,
3083 min_coverage_intron_length,
3084 max_juncs,
3085 false,
3086 half_splice_mer_len);
3087 num_segments += unaligned_segments.size();
3088 delete itr->second;
3089 }
3090 fprintf(stderr, "Checked %d segments against %lu windows for microexon junctions\n",
3091 num_segments, (long unsigned int)microexon_windows.size());
3092 fprintf(stderr, "Found %ld potential microexon junctions\n", (long int)juncs.size());
3093 }
3094
3095 /*
3096 * Easy guys ... this function puts its pants on just like the rest of you -- one leg
3097 * at a time. Except, once its pants are on, it makes gold records. Alright, here we
3098 * go.
3099 */
3100
3101 void look_for_hit_group(RefSequenceTable& rt,
3102 //FILE* reads_file,
3103 FZPipe& reads_file,
3104 FZPipe& reads_file_for_segment_search,
3105 FZPipe& reads_file_for_indel_discovery,
3106 ReadTable& unmapped_reads,
3107 vector<HitStream>& seg_files,
3108 int curr_file,
3109 uint64_t insert_id,
3110 vector<HitsForRead>& hits_for_read,
3111 std::set<Junction, skip_count_lt>& juncs,
3112 std::set<Deletion>& deletions,
3113 std::set<Insertion>& insertions,
3114 eREAD read)
3115
3116 {
3117 HitStream& hit_stream = seg_files[curr_file];
3118 HitsForRead hit_group;
3119 uint32_t order = unmapped_reads.observation_order(insert_id);
3120 int seq_key_len = min((int)min_anchor_len, 6);
3121 while(true) {
3122 uint64_t next_group_id = hit_stream.next_group_id();
3123 uint32_t next_order = unmapped_reads.observation_order(next_group_id);
3124 // If we would have seen the hits by now, stop looking in this stream,
3125 // but forward the search if possible.
3126 if (order < next_order || next_group_id == 0) {
3127 if (curr_file > 0) {
3128 look_for_hit_group(rt,
3129 reads_file,
3130 reads_file_for_segment_search,
3131 reads_file_for_indel_discovery,
3132 unmapped_reads,
3133 seg_files,
3134 curr_file - 1,
3135 insert_id,
3136 hits_for_read,
3137 juncs,
3138 deletions,
3139 insertions,
3140 read);
3141 }
3142 else
3143 if (!no_microexon_search) {
3144 //microexon search
3145 char read_name[1024];
3146 char read_seq[1024];
3147 char read_alt_name[1024];
3148 char read_quals[1024];
3149 // The hits are missing for the leftmost segment, which means
3150 // we should try looking for junctions via seed and extend
3151 // using it (helps find junctions to microexons).
3152 bool got_read = get_read_from_stream(insert_id,
3153 reads_file.file,
3154 FASTQ,
3155 false,
3156 read_name,
3157 read_seq,
3158 read_alt_name,
3159 read_quals);
3160 if (!got_read) {
3161 fprintf(stderr, "Warning: could not get read with insert_id=%d\n", (int)insert_id);
3162 break; //exit loop
3163 }
3164 string fwd_read = read_seq;
3165 if (color) // remove the primer and the adjacent color
3166 fwd_read.erase(0, 2);
3167 // make sure there are hits for all the other segs, all the
3168 // way to the root (right-most) one.
3169 int empty_seg = 0;
3170 for (size_t h = 1; h < hits_for_read.size(); ++h) {
3171 if (hits_for_read[h].hits.empty())
3172 empty_seg = h;
3173 }
3174 // if not, no microexon alignment for this segment
3175 if (empty_seg != 0)
3176 break;
3177 fwd_read = fwd_read.substr(0, segment_length);
3178 string rev_read = fwd_read;
3179 //check the reverse
3180 if (color) reverse(rev_read.begin(), rev_read.end());
3181 else reverse_complement(rev_read);
3182 for (size_t h = 0; h < hits_for_read[empty_seg + 1].hits.size(); ++h) {
3183 const BowtieHit& bh = hits_for_read[empty_seg + 1].hits[h];
3184 RefSequenceTable::Sequence* ref_str = rt.get_seq(bh.ref_id());
3185 if (ref_str == NULL)
3186 continue;
3187 int ref_len = length(*ref_str);
3188 int left_boundary;
3189 int right_boundary;
3190 bool antisense = bh.antisense_align();
3191 vector<BowtieHit> empty_seg_hits;
3192 seed_alignments++;
3193 if (antisense) {
3194 left_boundary = max(0, bh.right() - (int)min_anchor_len);
3195 right_boundary = min(ref_len - 2,
3196 left_boundary + max_microexon_stretch);
3197 if (right_boundary - left_boundary < 2 * seq_key_len)
3198 continue;
3199 microaligned_segs++;
3200 add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, rev_read, read);
3201 }
3202 else {
3203 right_boundary = min(ref_len - 2,
3204 bh.left() + (int)min_anchor_len);
3205 left_boundary = max(0, right_boundary - max_microexon_stretch);
3206 if (right_boundary - left_boundary < 2 * seq_key_len)
3207 continue;
3208 microaligned_segs++;
3209 add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, fwd_read, read);
3210 }
3211 } //for h
3212 } // !no_microexon_search
3213 break;
3214 }
3215 else if (hit_stream.next_read_hits(hit_group)) {
3216 // if we found hits for the target group in the left stream,
3217 // add them to the accumulating vector and continue the search
3218 if (hit_group.insert_id == insert_id) {
3219 hits_for_read[curr_file] = hit_group;
3220 if (curr_file > 0)
3221 // we need to look left (recursively) for the group we
3222 // just read for this stream.
3223 look_for_hit_group(rt,
3224 reads_file,
3225 reads_file_for_segment_search,
3226 reads_file_for_indel_discovery,
3227 unmapped_reads,
3228 seg_files,
3229 curr_file - 1,
3230 insert_id,
3231 hits_for_read,
3232 juncs,
3233 deletions,
3234 insertions,
3235 read);
3236 break;
3237 } //same insert_id (group)
3238 else if (curr_file > 0) {
3239 // different group, we need to start a whole new
3240 // search for it, with a whole new vector of hits.
3241 vector<HitsForRead> hits_for_new_read(seg_files.size());
3242 hits_for_new_read[curr_file] = hit_group;
3243 look_for_hit_group(rt,
3244 reads_file,
3245 reads_file_for_segment_search,
3246 reads_file_for_indel_discovery,
3247 unmapped_reads,
3248 seg_files,
3249 curr_file - 1,
3250 hit_group.insert_id,
3251 hits_for_new_read,
3252 juncs,
3253 deletions,
3254 insertions,
3255 read);
3256 find_insertions_and_deletions(rt, reads_file_for_indel_discovery.file, hits_for_new_read, deletions, insertions);
3257 find_gaps(rt, reads_file_for_segment_search.file, hits_for_new_read, juncs, read);
3258 } //different group
3259 }//got next group
3260 } //while loop
3261 }
3262
3263
3264
3265 bool process_next_hit_group(RefSequenceTable& rt,
3266 //FILE* reads_file,
3267 FZPipe& reads_file,
3268 FZPipe& reads_file_for_segment_search,
3269 FZPipe& reads_file_for_indel_discovery,
3270 ReadTable& unmapped_reads,
3271 vector<HitStream>& seg_files,
3272 size_t curr_file,
3273 std::set<Junction, skip_count_lt>& juncs,
3274 std::set<Deletion>& deletions,
3275 std::set<Insertion>& insertions,
3276 eREAD read)
3277 {
3278 HitStream& curr = seg_files[curr_file];
3279 HitsForRead hit_group;
3280 bool result = curr.next_read_hits(hit_group);
3281 vector<HitsForRead> hits_for_read(seg_files.size());
3282 hits_for_read.back() = hit_group;
3283
3284 look_for_hit_group(rt,
3285 reads_file,
3286 reads_file_for_segment_search,
3287 reads_file_for_indel_discovery,
3288 unmapped_reads,
3289 seg_files,
3290 (int)curr_file - 1,
3291 hit_group.insert_id,
3292 hits_for_read,
3293 juncs,
3294 deletions,
3295 insertions,
3296 read);
3297
3298 if (result) {
3299 find_insertions_and_deletions(rt, reads_file_for_indel_discovery.file, hits_for_read, deletions, insertions);
3300 find_gaps(rt, reads_file_for_segment_search.file, hits_for_read, juncs, read);
3301 }
3302 return result;
3303 }
3304
3305 static const int UNCOVERED = 0;
3306 static const int LOOK_LEFT = 1;
3307 static const int LOOK_RIGHT = 2;
3308
3309 uint8_t get_cov(const vector<uint8_t>& cov, uint32_t c)
3310 {
3311 uint32_t b = c >> 2;
3312 uint32_t r = c & 0x3;
3313 uint8_t s = (r << 1);
3314 uint8_t v = cov[b];
3315 v &= (0x3 << s);
3316 v >>= s;
3317 return v;
3318 }
3319
3320 void pair_covered_sites(ReadTable& it,
3321 RefSequenceTable& rt,
3322 vector<FZPipe*>& seg_files,
3323 std::set<Junction, skip_count_lt>& cov_juncs,
3324 size_t half_splice_mer_len)
3325 {
3326 BowtieHitFactory hit_factory(it,rt);
3327 map<uint32_t, vector<bool> > coverage_map;
3328 vector<RefSeg> expected_look_left_windows;
3329 vector<RefSeg> expected_look_right_windows;
3330
3331 for (size_t f = 0; f < seg_files.size(); ++f)
3332 {
3333 fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f);
3334 seg_files[f]->rewind();
3335 FILE* fp = seg_files[f]->file;
3336 //rewind(fp);
3337 HitStream hs(fp, &hit_factory, false, false, false);
3338
3339 HitsForRead hit_group;
3340 while (hs.next_read_hits(hit_group))
3341 {
3342 for (size_t h = 0; h < hit_group.hits.size(); ++h)
3343 {
3344 BowtieHit& bh = hit_group.hits[h];
3345
3346 pair<map<uint32_t, vector<bool> >::iterator, bool> ret =
3347 coverage_map.insert(make_pair(bh.ref_id(), vector<bool>()));
3348 vector<bool>& ref_cov = ret.first->second;
3349
3350 size_t right_extent = bh.right();
3351
3352 if (right_extent >= ref_cov.size())
3353 {
3354 ref_cov.resize(right_extent + 1, 0);
3355 }
3356
3357 for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c)
3358 {
3359 ref_cov[c] = true;
3360 }
3361 }
3362 }
3363 }
3364
3365 static const int extend = 45;
3366 int num_islands = 0;
3367
3368 vector<RefSeg> expected_don_acc_windows;
3369
3370 fprintf(stderr, "Recording coverage islands\n");
3371 size_t cov_bases = 0;
3372 for (map<uint32_t, vector<bool> >::iterator itr = coverage_map.begin();
3373 itr != coverage_map.end();
3374 ++itr)
3375 {
3376 vector<bool>& cov = itr->second;
3377
3378 size_t island_left_edge = 0;
3379 for (size_t c = 1; c < cov.size(); ++c)
3380 {
3381 if (cov[c])
3382 {
3383 cov_bases++;
3384 if (!cov[c - 1])
3385 {
3386 num_islands += 1;
3387
3388 int edge = (int)c - extend;
3389 edge = max(edge, 0);
3390 island_left_edge = edge;
3391 }
3392 }
3393 else
3394 {
3395 if (cov[c - 1])
3396 {
3397 expected_don_acc_windows.push_back(RefSeg(itr->first,
3398 POINT_DIR_LEFT,
3399 false, /* not important */
3400 READ_DONTCARE,
3401 island_left_edge,
3402 c + extend));
3403 expected_don_acc_windows.push_back(RefSeg(itr->first,
3404 POINT_DIR_RIGHT,
3405 false, /* not important */
3406 READ_DONTCARE,
3407 island_left_edge,
3408 c + extend));
3409 }
3410 }
3411 }
3412 }
3413 fprintf(stderr, "Found %d islands covering %ld bases\n", num_islands, (long int)cov_bases);
3414
3415 juncs_from_ref_segs<RecordButterflyJuncs>(rt,
3416 expected_don_acc_windows,
3417 cov_juncs,
3418 "GT",
3419 "AG",
3420 max_coverage_intron_length,
3421 min_coverage_intron_length,
3422 max_cov_juncs,
3423 true,
3424 half_splice_mer_len);
3425 fprintf(stderr, "Found %ld potential intra-island junctions\n", (long int)cov_juncs.size());
3426 }
3427
3428 void capture_island_ends(ReadTable& it,
3429 RefSequenceTable& rt,
3430 //vector<FILE*>& seg_files,
3431 vector<FZPipe*>& seg_files,
3432 std::set<Junction, skip_count_lt>& cov_juncs,
3433 size_t half_splice_mer_len)
3434 {
3435 //static int island_repeat_tolerance = 10;
3436 BowtieHitFactory hit_factory(it,rt);
3437 map<uint32_t, vector<bool> > coverage_map;
3438 vector<RefSeg> expected_look_left_windows;
3439 vector<RefSeg> expected_look_right_windows;
3440
3441 for (size_t f = 0; f < seg_files.size(); ++f)
3442 {
3443 fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f);
3444 seg_files[f]->rewind();
3445 FILE* fp = seg_files[f]->file;
3446 //rewind(fp);
3447 HitStream hs(fp, &hit_factory, false, false, false);
3448
3449 HitsForRead hit_group;
3450 while (hs.next_read_hits(hit_group))
3451 {
3452 for (size_t h = 0; h < hit_group.hits.size(); ++h)
3453 {
3454 BowtieHit& bh = hit_group.hits[h];
3455
3456 pair<map<uint32_t, vector<bool> >::iterator, bool> ret =
3457 coverage_map.insert(make_pair(bh.ref_id(), vector<bool>()));
3458 vector<bool>& ref_cov = ret.first->second;
3459
3460 size_t right_extent = bh.right();
3461
3462 if (right_extent >= ref_cov.size())
3463 {
3464 ref_cov.resize(right_extent + 1, 0);
3465 }
3466
3467 for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c)
3468 {
3469 ref_cov[c] = true;
3470 }
3471 }
3472 }
3473 }
3474
3475 static const int min_cov_length = segment_length + 2;
3476
3477 long covered_bases = 0;
3478 int long_enough_bases = 0;
3479 int left_looking = 0;
3480 int right_looking = 0;
3481 static const int extend = 45;
3482 static const int repeat_tol = 5;
3483
3484 int num_islands = 0;
3485
3486 for (map<uint32_t, vector<bool> >::iterator itr = coverage_map.begin();
3487 itr != coverage_map.end();
3488 ++itr)
3489 {
3490 //fprintf (stderr, "Finding pairings in ref seg %u\n", (int)itr->first);
3491
3492 vector<bool>& cov = itr->second;
3493 vector<bool> long_enough(cov.size());
3494
3495 size_t last_uncovered = 0;
3496
3497 static const uint8_t min_cov = 1;
3498
3499 for (size_t c = 1; c < cov.size(); ++c)
3500 {
3501 int c_cov = cov[c]; //get_cov(cov, c);
3502 if (c_cov < min_cov || c == (cov.size()) - 1)
3503 {
3504 int putative_exon_length = (int)c - (int)last_uncovered;
3505 int last_pos_cov = cov[c - 1]; //get_cov(cov,c - 1);
3506 if (last_pos_cov >= min_cov && putative_exon_length >= min_cov_length)
3507 {
3508 //fprintf (stdout, "%s\t%d\t%d\n", rt.get_name(itr->first), last_uncovered + 1, c);
3509 covered_bases += (c + 1 - last_uncovered);
3510 //fprintf(stderr, "making new segment from %d to %d\n", last_uncovered + 1, c);
3511 for (int l = (int)c; l > (int)last_uncovered; --l)
3512 {
3513 long_enough[l] = true;
3514 }
3515 }
3516 last_uncovered = c;
3517 }
3518 }
3519
3520 vector<bool>& ref_cov = long_enough;
3521 vector<uint8_t> cov_state(ref_cov.size(), UNCOVERED);
3522
3523 for (size_t c = 1; c < ref_cov.size(); ++c)
3524 {
3525 if (ref_cov[c])
3526 {
3527 long_enough_bases++;
3528 if (!ref_cov[c - 1])
3529 {
3530 num_islands += 1;
3531
3532 for (int r = (int)c - extend;
3533 r >= 0 && r < (int)c + repeat_tol && r < (int)cov_state.size();
3534 ++r)
3535 {
3536 cov_state[r] |= LOOK_LEFT;
3537 }
3538 }
3539 }
3540 else
3541 {
3542 if (ref_cov[c - 1])
3543 {
3544 for (int l = (int)c - repeat_tol;
3545 l >= 0 && l < (int)c + extend && l < (int)cov_state.size();
3546 ++l)
3547 {
3548 cov_state[l] |= LOOK_RIGHT;
3549 }
3550 }
3551 }
3552 }
3553
3554 RefSeg* curr_look_left = NULL;
3555 RefSeg* curr_look_right = NULL;
3556
3557 for (size_t c = 1; c < cov_state.size(); ++c)
3558 {
3559 if (cov_state[c] & LOOK_LEFT)
3560 {
3561 left_looking++;
3562 if (!(cov_state[c-1] & LOOK_LEFT))
3563 {
3564 expected_look_left_windows.push_back(RefSeg(itr->first,
3565 POINT_DIR_LEFT,
3566 false, /* not important */
3567 READ_DONTCARE,
3568 c,
3569 c + 1));
3570 curr_look_left = &(expected_look_left_windows.back());
3571 }
3572 else if (curr_look_left)
3573 {
3574 curr_look_left->right++;
3575 }
3576 }
3577 else
3578 {
3579 if ((cov_state[c-1] & LOOK_LEFT))
3580 {
3581 curr_look_left = NULL;
3582 }
3583 }
3584
3585 if (cov_state[c] & LOOK_RIGHT)
3586 {
3587 right_looking++;
3588 if (!(cov_state[c-1] & LOOK_RIGHT))
3589 {
3590 expected_look_right_windows.push_back(RefSeg(itr->first,
3591 POINT_DIR_RIGHT,
3592 false, /* not important */
3593 READ_DONTCARE,
3594 c,
3595 c + 1));
3596
3597 curr_look_right = &(expected_look_right_windows.back());
3598 }
3599 else if (curr_look_right)
3600 {
3601 curr_look_right->right++;
3602 }
3603 }
3604 else
3605 {
3606 if ((cov_state[c-1] & LOOK_RIGHT))
3607 {
3608 curr_look_right = NULL;
3609 }
3610 }
3611 }
3612 }
3613
3614 fprintf(stderr, "Map covers %ld bases\n", covered_bases);
3615 fprintf(stderr, "Map covers %d bases in sufficiently long segments\n", long_enough_bases);
3616 fprintf(stderr, "Map contains %d good islands\n", num_islands + 1);
3617 fprintf(stderr, "%d are left looking bases\n", left_looking);
3618 fprintf(stderr, "%d are right looking bases\n", right_looking);
3619
3620 vector<RefSeg> expected_don_acc_windows;
3621 expected_don_acc_windows.insert(expected_don_acc_windows.end(),
3622 expected_look_right_windows.begin(),
3623 expected_look_right_windows.end());
3624
3625 expected_don_acc_windows.insert(expected_don_acc_windows.end(),
3626 expected_look_left_windows.begin(),
3627 expected_look_left_windows.end());
3628
3629 coverage_map.clear();
3630 juncs_from_ref_segs<RecordExtendableJuncs>(rt,
3631 expected_don_acc_windows,
3632 cov_juncs,
3633 "GT",
3634 "AG",
3635 max_coverage_intron_length,
3636 min_coverage_intron_length,
3637 max_cov_juncs,
3638 true,
3639 half_splice_mer_len);
3640 fprintf(stderr, "Found %ld potential island-end pairing junctions\n", (long int)cov_juncs.size());
3641 }
3642
3643
3644 void process_segment_hits(RefSequenceTable& rt,
3645 //FILE* reads_file,
3646 FZPipe& reads_file,
3647 FZPipe& reads_file_for_segment_search,
3648 FZPipe& reads_file_for_indel_discovery,
3649 //vector<FILE*>& seg_files,
3650 vector<FZPipe>& seg_files,
3651 BowtieHitFactory& hit_factory,
3652 ReadTable& it,
3653 std::set<Junction, skip_count_lt>& juncs,
3654 std::set<Deletion>& deletions,
3655 std::set<Insertion>& insertions,
3656 eREAD read = READ_DONTCARE)
3657 {
3658 //get_seqs(reads_stream, it, false, false);
3659 //get_seqs(reads_file, it, reads_format, false, false);
3660
3661 vector<HitStream> hit_streams;
3662 for (size_t i = 0; i < seg_files.size(); ++i)
3663 {
3664 HitStream hs(seg_files[i].file, &hit_factory, false, false, false);
3665
3666 // if the next group id in this stream is zero, it's an empty stream,
3667 // and we can simply skip it.
3668 //if (hs.next_group_id() != 0)
3669 hit_streams.push_back(hs);
3670 }
3671
3672 int num_group = 0;
3673 while (process_next_hit_group(rt,
3674 reads_file,
3675 reads_file_for_segment_search,
3676 reads_file_for_indel_discovery,
3677 it,
3678 hit_streams,
3679 hit_streams.size() - 1,
3680 juncs, deletions, insertions, read) == true)
3681 {
3682 num_group++;
3683 if (num_group % 500000 == 0)
3684 fprintf(stderr, "\tProcessed %d root segment groups\n", num_group);
3685 }
3686 fprintf(stderr, "Microaligned %d segments\n", microaligned_segs);
3687 }
3688
3689 void print_juncs(RefSequenceTable& rt, std::set<Junction, skip_count_lt>& juncs, const char* str)
3690 {
3691 fprintf (stderr, "-- %s --\n", str);
3692 for(std::set<Junction>::iterator itr = juncs.begin();
3693 itr != juncs.end();
3694 ++itr)
3695 {
3696 const char* ref_name = rt.get_name(itr->refid);
3697
3698 fprintf(stderr, "%s\t%d\t%d\t%c\n",
3699 ref_name,
3700 itr->left,
3701 itr->right,
3702 itr->antisense ? '-' : '+');
3703 }
3704 fprintf (stderr, "-- done --\n");
3705 }
3706
3707 void driver(istream& ref_stream,
3708 FILE* juncs_out,
3709 FILE* insertions_out,
3710 FILE* deletions_out,
3711 //FILE* left_reads_file,
3712 FZPipe& left_reads_file,
3713 FZPipe& left_reads_file_for_segment_search,
3714 FZPipe& left_reads_file_for_indel_discovery,
3715 vector<FZPipe>& left_seg_files,
3716 FZPipe& right_reads_file,
3717 FZPipe& right_reads_file_for_segment_search,
3718 FZPipe& right_reads_file_for_indel_discovery,
3719 vector<FZPipe>& right_seg_files)
3720 {
3721 if (left_seg_files.size() == 0)
3722 {
3723 fprintf(stderr, "No hits to process, exiting\n");
3724 exit(0);
3725 }
3726
3727 std::set<Junction, skip_count_lt> seg_juncs;
3728 std::set<Junction, skip_count_lt> cov_juncs;
3729 std::set<Junction, skip_count_lt> butterfly_juncs;
3730
3731 std::set<Junction> juncs;
3732
3733 std::set<Deletion> deletions;
3734 std::set<Insertion> insertions;
3735
3736
3737 RefSequenceTable rt(true, true);
3738
3739 fprintf (stderr, "Loading reference sequences...\n");
3740 get_seqs(ref_stream, rt, true, false);
3741
3742 ReadTable it;
3743 BowtieHitFactory hit_factory(it,rt);
3744
3745 if (left_seg_files.size() > 1)
3746 {
3747 fprintf( stderr, "Loading left segment hits...\n");
3748 process_segment_hits(rt,
3749 left_reads_file,
3750 left_reads_file_for_segment_search,
3751 left_reads_file_for_indel_discovery,
3752 left_seg_files,
3753 hit_factory,
3754 it,
3755 seg_juncs,
3756 deletions,
3757 insertions,
3758 READ_LEFT);
3759 fprintf( stderr, "done.\n");
3760 }
3761
3762 if (right_seg_files.size() > 1)
3763 {
3764 fprintf( stderr, "Loading right segment hits...\n");
3765 process_segment_hits(rt,
3766 right_reads_file,
3767 right_reads_file_for_segment_search,
3768 right_reads_file_for_indel_discovery,
3769 right_seg_files,
3770 hit_factory,
3771 it,
3772 seg_juncs,
3773 deletions,
3774 insertions,
3775 READ_RIGHT);
3776 fprintf( stderr, "done.\n");
3777 }
3778 fprintf(stderr, "Found %ld potential split-segment junctions\n", (long int)seg_juncs.size());
3779 fprintf(stderr, "Found %ld potential small deletions\n", (long int)deletions.size());
3780 fprintf(stderr, "Found %ld potential small insertions\n", (long int)insertions.size());
3781
3782 //vector<FILE*> all_seg_files;
3783 vector<FZPipe*> all_seg_files;
3784 for (vector<FZPipe>::size_type i = 0; i != left_seg_files.size();i++) {
3785 all_seg_files.push_back( &(left_seg_files[i]) );
3786 }
3787 for (vector<FZPipe>::size_type i = 0; i != right_seg_files.size();i++) {
3788 all_seg_files.push_back( &(right_seg_files[i]) );
3789 }
3790 //copy(left_seg_files.begin(), left_seg_files.end(), back_inserter(all_seg_files));
3791 //copy(right_seg_files.begin(), right_seg_files.end(), back_inserter(all_seg_files));
3792
3793 #if 0
3794 // daehwan - check this out as Cole insists on using segments gives better results.
3795 vector<FILE*> all_map_files;
3796 all_map_files.push_back(left_reads_map_file);
3797 all_map_files.push_back(right_reads_map_file);
3798 copy(all_seg_files.begin(), all_seg_files.end(), back_inserter(all_map_files));
3799 #endif
3800
3801 if (!no_coverage_search || butterfly_search)
3802 {
3803 if (ium_reads != "")
3804 {
3805 vector<string> ium_read_files;
3806 tokenize(ium_reads,",", ium_read_files);
3807 //vector<FILE*> iums;
3808 vector<FZPipe> iums;
3809 string unzcmd=getUnpackCmd(ium_read_files[0],false);
3810 for (size_t ium = 0; ium < ium_read_files.size(); ++ium)
3811 {
3812 fprintf (stderr, "Indexing extensions in %s\n", ium_read_files[ium].c_str());
3813 //FILE* ium_file = fopen(ium_read_files[ium].c_str(), "r");
3814 FZPipe ium_file(ium_read_files[ium],unzcmd);
3815 if (ium_file.file==NULL)
3816 {
3817 fprintf (stderr, "Can't open file %s for reading, skipping...\n",ium_read_files[ium].c_str());
3818 continue;
3819 }
3820 iums.push_back(ium_file);
3821 }
3822
3823 index_read_mers(iums, 5);
3824 }
3825 else
3826 {
3827 no_coverage_search = true;
3828 butterfly_search = false;
3829 }
3830
3831 if (!no_coverage_search)
3832 {
3833 fprintf(stderr, "Looking for junctions by island end pairings\n");
3834 capture_island_ends(it,
3835 rt,
3836 all_seg_files,
3837 cov_juncs,
3838 5);
3839 fprintf(stderr, "done\n");
3840 }
3841 }
3842
3843 if (butterfly_search)
3844 {
3845 fprintf(stderr, "Looking for junctions between and within islands\n");
3846 prune_extension_table(butterfly_overhang);
3847 compact_extension_table();
3848 pair_covered_sites(it,
3849 rt,
3850 all_seg_files,
3851 butterfly_juncs,
3852 5);
3853
3854 fprintf(stderr, "done\n");
3855 }
3856
3857 std::set<Junction, skip_count_lt> microexon_juncs;
3858 if (!no_microexon_search)
3859 {
3860 std::set<Junction, skip_count_lt> microexon_juncs;
3861 align_microexon_segs(rt,
3862 microexon_juncs,
3863 max_cov_juncs,
3864 5);
3865 juncs.insert(microexon_juncs.begin(), microexon_juncs.end());
3866 }
3867
3868 juncs.insert(cov_juncs.begin(), cov_juncs.end());
3869 juncs.insert(seg_juncs.begin(), seg_juncs.end());
3870 juncs.insert(butterfly_juncs.begin(), butterfly_juncs.end());
3871
3872 fprintf(stderr, "Reporting potential splice junctions...");
3873 for(std::set<Junction>::iterator itr = juncs.begin();
3874 itr != juncs.end();
3875 ++itr)
3876 {
3877 const char* ref_name = rt.get_name(itr->refid);
3878
3879 fprintf(juncs_out,
3880 "%s\t%d\t%d\t%c\n",
3881 ref_name,
3882 itr->left,
3883 itr->right,
3884 itr->antisense ? '-' : '+');
3885 }
3886 //close all reading pipes, just to exit cleanly
3887
3888 for (vector<FZPipe*>::size_type i = 0; i != all_seg_files.size();i++) {
3889 all_seg_files[i]->close();
3890 }
3891 left_reads_file.close();
3892 left_reads_file_for_indel_discovery.close();
3893 right_reads_file.close();
3894 right_reads_file_for_indel_discovery.close();
3895 fprintf (stderr, "done\n");
3896 fprintf (stderr, "Reported %d total possible splices\n", (int)juncs.size());
3897
3898
3899 fprintf (stderr, "Reporting potential deletions...\n");
3900 if(deletions_out){
3901 for(std::set<Deletion>::iterator itr = deletions.begin(); itr != deletions.end(); ++itr){
3902 const char* ref_name = rt.get_name(itr->refid);
3903 /*
3904 * We fix up the left co-ordinate to reference the first deleted base
3905 */
3906 fprintf(deletions_out,
3907 "%s\t%d\t%d\n",
3908 ref_name,
3909 itr->left + 1,
3910 itr->right);
3911 }
3912 fclose(deletions_out);
3913 }else{
3914 fprintf(stderr, "Failed to open deletions file for writing, no deletions reported\n");
3915 }
3916
3917 fprintf (stderr, "Reporting potential insertions...\n");
3918 if(insertions_out){
3919 for(std::set<Insertion>::iterator itr = insertions.begin(); itr != insertions.end(); ++itr){
3920 const char* ref_name = rt.get_name(itr->refid);
3921 fprintf(insertions_out,
3922 "%s\t%d\t%d\t%s\n",
3923 ref_name,
3924 itr->left,
3925 itr->left,
3926 itr->sequence.c_str());
3927 }
3928 fclose(insertions_out);
3929 }else{
3930 fprintf(stderr, "Failed to open insertions file for writing, no insertions reported\n");
3931 }
3932 }
3933
3934 int main(int argc, char** argv)
3935 {
3936 fprintf(stderr, "segment_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
3937 fprintf(stderr, "---------------------------\n");
3938
3939 int parse_ret = parse_options(argc, argv, print_usage);
3940 if (parse_ret)
3941 return parse_ret;
3942
3943 if(optind >= argc)
3944 {
3945 print_usage();
3946 return 1;
3947 }
3948
3949 string ref_file_name = argv[optind++];
3950
3951 if(optind >= argc)
3952 {
3953 print_usage();
3954 return 1;
3955 }
3956
3957 string juncs_file_name = argv[optind++];
3958
3959 if(optind >= argc)
3960 {
3961 print_usage();
3962 return 1;
3963 }
3964
3965 string insertions_file_name = argv[optind++];
3966 if(optind >= argc)
3967 {
3968 print_usage();
3969 return 1;
3970 }
3971
3972 string deletions_file_name = argv[optind++];
3973 if(optind >= argc)
3974 {
3975 print_usage();
3976 return 1;
3977 }
3978
3979 string left_reads_file_name = argv[optind++];
3980
3981 if(optind >= argc)
3982 {
3983 print_usage();
3984 return 1;
3985 }
3986
3987 string left_reads_map_file_name = argv[optind++];
3988
3989 if(optind >= argc)
3990 {
3991 print_usage();
3992 return 1;
3993 }
3994
3995 string left_segment_file_list = argv[optind++];
3996
3997 string right_segment_file_list;
3998 string right_reads_file_name;
3999 string right_reads_map_file_name;
4000 if (optind < argc)
4001 {
4002 right_reads_file_name = argv[optind++];
4003
4004 if(optind >= argc)
4005 {
4006 print_usage();
4007 return 1;
4008 }
4009
4010 right_reads_map_file_name = argv[optind++];
4011
4012 if(optind >= argc)
4013 {
4014 print_usage();
4015 return 1;
4016 }
4017 right_segment_file_list = argv[optind++];
4018 }
4019
4020 // Open the approppriate files
4021
4022 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
4023 if (!ref_stream.good())
4024 {
4025 fprintf(stderr, "Error: cannot open %s for reading\n",
4026 ref_file_name.c_str());
4027 exit(1);
4028 }
4029
4030
4031 FILE* juncs_file = fopen(juncs_file_name.c_str(), "w");
4032 if (!juncs_file)
4033 {
4034 fprintf(stderr, "Error: cannot open %s for writing\n",
4035 juncs_file_name.c_str());
4036 exit(1);
4037 }
4038
4039 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
4040 if (!insertions_file)
4041 {
4042 fprintf(stderr, "Error: cannot open %s for writing\n",
4043 insertions_file_name.c_str());
4044 exit(1);
4045 }
4046
4047
4048 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
4049 if (!deletions_file)
4050 {
4051 fprintf(stderr, "Error: cannot open %s for writing\n",
4052 deletions_file_name.c_str());
4053 exit(1);
4054 }
4055
4056 //FILE* left_reads_file = fopen(left_reads_file_name.c_str(), "r");
4057 //FILE* left_reads_file_for_indel_discovery = fopen(left_reads_file_name.c_str(),"r");
4058 string unzcmd=getUnpackCmd(left_reads_file_name, false);
4059 FZPipe left_reads_file(left_reads_file_name, unzcmd);
4060 FZPipe left_reads_file_for_segment_search(left_reads_file_name, unzcmd);
4061 FZPipe left_reads_file_for_indel_discovery(left_reads_file_name, unzcmd);
4062 if (left_reads_file.file==NULL || left_reads_file_for_segment_search.file==NULL ||
4063 left_reads_file_for_indel_discovery.file==NULL)
4064 {
4065 fprintf(stderr, "Error: cannot open %s for reading\n",
4066 left_reads_file_name.c_str());
4067 exit(1);
4068 }
4069
4070 //FILE* left_reads_map_file = fopen(left_reads_map_file_name.c_str(), "r");
4071 FZPipe left_reads_map_file(left_reads_map_file_name, unzcmd);
4072 if (left_reads_map_file.file==NULL)
4073 {
4074 fprintf(stderr, "Error: cannot open %s for reading\n",
4075 left_reads_map_file_name.c_str());
4076 exit(1);
4077 }
4078
4079 vector<string> left_segment_file_names;
4080 //vector<FILE*> left_segment_files;
4081 vector<FZPipe> left_segment_files;
4082 tokenize(left_segment_file_list, ",",left_segment_file_names);
4083 for (size_t i = 0; i < left_segment_file_names.size(); ++i)
4084 {
4085 //FILE* seg_file = fopen(left_segment_file_names[i].c_str(), "r");
4086 FZPipe seg_file(left_segment_file_names[i], unzcmd);
4087 if (seg_file.file == NULL)
4088 {
4089 fprintf(stderr, "Error: cannot open segment map file %s for reading\n",
4090 left_segment_file_names[i].c_str());
4091 exit(1);
4092 }
4093 left_segment_files.push_back(seg_file);
4094 }
4095
4096 //vector<FILE*> right_segment_files;
4097 vector<FZPipe> right_segment_files;
4098 //FILE* right_reads_file = NULL;
4099 FZPipe right_reads_file;
4100 //FILE* right_reads_file_for_indel_discovery = NULL;
4101 FZPipe right_reads_file_for_segment_search;
4102 FZPipe right_reads_file_for_indel_discovery;
4103
4104 if (right_segment_file_list != "")
4105 {
4106 //right_reads_file = fopen(right_reads_file_name.c_str(), "r");
4107 right_reads_file.openRead(right_reads_file_name, unzcmd);
4108 //right_reads_file_for_indel_discovery = fopen(right_reads_file_name.c_str(), "r");
4109 right_reads_file_for_segment_search.openRead(right_reads_file_name, unzcmd);
4110 right_reads_file_for_indel_discovery.openRead(right_reads_file_name, unzcmd);
4111 if (right_reads_file.file==NULL || right_reads_file_for_indel_discovery.file==NULL)
4112 {
4113 fprintf(stderr, "Error: cannot open %s for reading\n",
4114 right_reads_file_name.c_str());
4115 exit(1);
4116 }
4117
4118 vector<string> right_segment_file_names;
4119
4120 tokenize(right_segment_file_list, ",",right_segment_file_names);
4121 for (size_t i = 0; i < right_segment_file_names.size(); ++i)
4122 {
4123 //FILE* seg_file = fopen(right_segment_file_names[i].c_str(), "r");
4124 FZPipe seg_file(right_segment_file_names[i],unzcmd);
4125 if (seg_file.file == NULL)
4126 {
4127 fprintf(stderr, "Error: cannot open segment map %s for reading\n",
4128 right_segment_file_names[i].c_str());
4129 exit(1);
4130 }
4131 right_segment_files.push_back(seg_file);
4132 }
4133 }
4134
4135 driver(ref_stream,
4136 juncs_file,
4137 insertions_file,
4138 deletions_file,
4139 left_reads_file,
4140 left_reads_file_for_segment_search,
4141 left_reads_file_for_indel_discovery,
4142 left_segment_files,
4143 right_reads_file,
4144 right_reads_file_for_segment_search,
4145 right_reads_file_for_indel_discovery,
4146 right_segment_files);
4147
4148 return 0;
4149 }