ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (13 years, 1 month ago) by gpertea
File size: 47754 byte(s)
Log Message:
adding tophat source work

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * long_spanning_reads.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <cstdlib>
22 #include <cstring>
23 #include <bitset>
24 //#include <stdexcept>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <seqan/modifier.h>
32 #include <getopt.h>
33
34 #include "common.h"
35 #include "bwt_map.h"
36 #include "tokenize.h"
37 #include "segments.h"
38 #include "reads.h"
39
40 #include "junctions.h"
41 #include "insertions.h"
42 #include "deletions.h"
43
44 using namespace seqan;
45 using namespace std;
46
47 void print_usage()
48 {
49 fprintf(stderr, "Usage: long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
50 }
51
52 bool key_lt(const pair<uint32_t, HitsForRead>& lhs,
53 const pair<uint32_t, HitsForRead>& rhs)
54 {
55 return lhs.first < rhs.first;
56 }
57
58 void get_seqs(istream& ref_stream,
59 RefSequenceTable& rt,
60 bool keep_seqs = true,
61 bool strip_slash = false)
62 {
63 while(ref_stream.good() &&
64 !ref_stream.eof())
65 {
66 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
67 string name;
68 readMeta(ref_stream, name, Fasta());
69 string::size_type space_pos = name.find_first_of(" \t\r");
70 if (space_pos != string::npos)
71 {
72 name.resize(space_pos);
73 }
74 seqan::read(ref_stream, *ref_str, Fasta());
75
76 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
77 }
78 }
79
80
81 void look_right_for_hit_group(ReadTable& unmapped_reads,
82 vector<HitStream>& contig_hits,
83 size_t curr_file,
84 vector<HitStream>& spliced_hits,
85 const HitsForRead& targets,
86 vector<HitsForRead>& seg_hits_for_read)
87 {
88 int right_file = curr_file + 1;
89
90 HitStream& right = contig_hits[right_file];
91 uint64_t curr_next_group_id = targets.insert_id;
92 int curr_order = unmapped_reads.observation_order(curr_next_group_id);
93
94 assert (curr_order != -1);
95 while(true)
96 {
97 HitsForRead hit_group;
98 uint64_t right_next_group_id = right.next_group_id();
99 int right_order = unmapped_reads.observation_order(right_next_group_id);
100
101 // If we would have seen the hits by now, bail out.
102 if (curr_order < right_order || right_order == -1)
103 {
104 break;
105 }
106 if (right.next_read_hits(hit_group))
107 {
108 if (hit_group.insert_id == targets.insert_id)
109 {
110 // Some of the targets may be missing, we need to
111 // process them individually
112 seg_hits_for_read[right_file] = hit_group;
113 break;
114 }
115 }
116 }
117
118 HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
119
120 if (right_file < (int)spliced_hits.size() && right_file >= 0)
121 {
122 // Scan forward in the spliced hits file for this hit group
123 HitsForRead spliced_group;
124 HitsForRead curr_spliced_group;
125 while (spliced_hits[right_file].next_group_id() > 0 &&
126 spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order)
127 {
128 spliced_hits[right_file].next_read_hits(curr_spliced_group);
129
130 if (curr_spliced_group.insert_id == (uint32_t)curr_order)
131 {
132 spliced_group = curr_spliced_group;
133 break;
134 }
135 }
136
137 if (!spliced_group.hits.empty())
138 {
139 curr_seg_hits.insert_id = spliced_group.insert_id;
140 curr_seg_hits.hits.insert(curr_seg_hits.hits.end(),
141 spliced_group.hits.begin(),
142 spliced_group.hits.end());
143 }
144 }
145
146 if (curr_seg_hits.hits.empty())
147 return;
148 else if (right_file + 1 < (int)contig_hits.size())
149 {
150 look_right_for_hit_group(unmapped_reads,
151 contig_hits,
152 curr_file + 1,
153 spliced_hits,
154 curr_seg_hits,
155 seg_hits_for_read);
156 }
157 }
158
159 BowtieHit merge_chain(RefSequenceTable& rt,
160 const string& read_seq,
161 const string& read_quals,
162 std::set<Junction>& possible_juncs,
163 std::set<Insertion>& possible_insertions,
164 list<BowtieHit>& hit_chain)
165 {
166 bool antisense = hit_chain.front().antisense_align();
167 uint32_t reference_id = hit_chain.front().ref_id();
168 uint64_t insert_id = hit_chain.front().insert_id();
169
170 int left = hit_chain.front().left();
171
172 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
173 list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
174
175 string seq;
176 string qual;
177 int old_read_length = 0;
178 int first_seg_length = hit_chain.front().seq().length();
179 for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
180 {
181 seq += i->seq();
182 qual += i->qual();
183 old_read_length += i->read_len();
184 }
185
186 string rev_read_seq, rev_read_quals;
187 if (color && antisense)
188 {
189 rev_read_seq = read_seq;
190 reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
191
192 rev_read_quals = read_quals;
193 reverse(rev_read_quals.begin(), rev_read_quals.end());
194 }
195
196 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
197 {
198 /*
199 * Note that the gap size may be negative, since left() and right() return
200 * signed integers, this will be OK.
201 */
202 int gap = curr_hit->left() - prev_hit->right();
203 if (gap < -(int)max_insertion_length ||
204 (gap > (int)max_deletion_length &&
205 (gap < min_report_intron_length || gap > max_report_intron_length)))
206 {
207 return BowtieHit();
208 }
209
210 ++prev_hit;
211 ++curr_hit;
212 }
213
214 prev_hit = hit_chain.begin();
215 curr_hit = ++(hit_chain.begin());
216
217 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id());
218 if (!ref_str)
219 return BowtieHit();
220
221 int curr_seg_index = 1;
222 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
223 {
224 /*
225 * This code is assuming that the cigar strings end and start with a match
226 * While segment alignments can actually end with a junction, insertion or deletion, the hope
227 * is that in those cases, the right and left ends of the alignments will correctly
228 * line up, so we won't get to this bit of code
229 */
230 if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH)
231 {
232 return BowtieHit();
233 }
234
235 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
236 {
237 /*
238 * There is no way that we can splice these together into a valid
239 * alignment
240 */
241 return BowtieHit();
242 }
243
244 bool found_closure = false;
245 /*
246 * Note that antisense_splice is the same for prev and curr
247 */
248 bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
249 vector<CigarOp> new_cigar;
250 int new_left = -1;
251 int mismatch = 0;
252
253 /*
254 * Take the length of matched bases in each segment into consideration for closures,
255 * this can be a problem for reads of variable lengths.
256 */
257 int prev_right_end_match_length = prev_hit->cigar().back().length;
258 int curr_left_end_match_length = curr_hit->cigar().front().length;
259
260 if (prev_hit->right() > curr_hit->left())
261 {
262 std::set<Insertion>::iterator lb, ub;
263 /*
264 * Note, this offset is determined by the min-anchor length supplied to
265 * juncs_db, which is currently hard-coded at 3 in tophat.py
266 * as this value determines what sort of read segments
267 * should be able to align directly to the splice sequences
268 */
269 int left_boundary = prev_hit->right() - 4;
270 int right_boundary = curr_hit->left() + 4;
271
272 /*
273 * Create a dummy sequence to represent the maximum possible insertion
274 */
275 std::string maxInsertedSequence = "";
276 maxInsertedSequence.resize(max_insertion_length,'A');
277
278 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
279 ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
280
281 int reference_mismatch = 0;
282
283 while (lb != ub && lb != possible_insertions.end())
284 {
285 /*
286 * In the following code, we will check to make sure that the segments have the proper
287 * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
288 * In general, reads with insertions must match the inserted sequence exactly.
289 */
290 if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left()))
291 {
292 /*
293 * Check we have enough matched bases on prev or curr segment.
294 */
295 int insert_to_prev_right = prev_hit->right() - lb->left - 1;
296 int curr_left_to_insert = lb->left - curr_hit->left() + 1;
297 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
298 {
299 ++lb;
300 continue;
301 }
302
303 /*
304 * Keep track of how many mismatches were made to the genome in the region
305 * where we should actually be matching against the insertion
306 */
307 int this_reference_mismatch = 0;
308 int insertion_mismatch = 0;
309 int insertion_len = lb->sequence.length();
310 const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
311
312 /*
313 * First check to see if we need to adjust number of observed errors for the left (prev)
314 * hit. This is only the case if this segment runs into the insertion. To be consistent
315 * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
316 */
317 string colorSegmentSequence_prev;
318 if (insert_to_prev_right > 0)
319 {
320 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
321 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
322
323 if (color)
324 {
325 string color;
326
327 if (antisense)
328 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
329 else
330 color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
331
332 color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
333 colorSegmentSequence_prev = convert_color_to_bp(color);
334 }
335
336 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
337
338 /*
339 * Scan right in the read until we run out of read
340 */
341 for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
342 {
343 /*
344 * Any mismatch to the insertion is a failure
345 */
346 if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
347 {
348 ++this_reference_mismatch;
349 }
350
351 if (read_index < insertion_len)
352 {
353 if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
354 {
355 ++insertion_mismatch;
356 break;
357 }
358 }
359 else
360 {
361 if (referenceSequence[read_index - insertion_len] == 'N' ||
362 referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
363 {
364 --this_reference_mismatch;
365 }
366 }
367 }
368 }
369
370 string colorSegmentSequence_curr;
371 if (curr_left_to_insert > 0)
372 {
373 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
374 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
375
376 if (color)
377 {
378 string color;
379 if (antisense)
380 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
381 else
382 color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
383
384 color.push_back(curr_hit->seq()[curr_left_to_insert]);
385 reverse(color.begin(), color.end());
386 string bp = convert_color_to_bp(color);
387 reverse(bp.begin(), bp.end());
388 colorSegmentSequence_curr = bp;
389 }
390
391 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
392
393 /*
394 * Scan left in the read until
395 * We ran out of read sequence (insertion extends past segment)
396 */
397 for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
398 {
399 int segmentPosition = curr_left_to_insert - read_index - 1;
400 int insertionPosition = insertion_len - read_index - 1;
401
402 if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
403 {
404 ++this_reference_mismatch;
405 }
406
407 if (read_index < insertion_len)
408 {
409 if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
410 {
411 ++insertion_mismatch;
412 break;
413 }
414 }
415 else
416 {
417 if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
418 (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
419 {
420 --this_reference_mismatch;
421 }
422 }
423 }
424 }
425
426 if (found_closure)
427 {
428 fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
429 return BowtieHit();
430 }
431
432 if (insertion_mismatch == 0)
433 {
434 reference_mismatch = this_reference_mismatch;
435 mismatch = -reference_mismatch;
436 found_closure = true;
437 new_left = prev_hit->left();
438 new_cigar = prev_hit->cigar();
439
440 /*
441 * Need to make a new insert operation between the two match character that begin
442 * and end the intersection of these two junction. Note that we necessarily assume
443 * that this insertion can't span beyond the boundaries of these reads. That should
444 * probably be better enforced somewhere
445 */
446
447 new_cigar.back().length -= insert_to_prev_right;
448 if (new_cigar.back().length <= 0)
449 new_cigar.pop_back();
450
451 new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
452 vector<CigarOp> new_right_cigar = curr_hit->cigar();
453 new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
454
455 /*
456 * Finish stitching together the new cigar string
457 */
458 size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
459 for (; c < new_right_cigar.size(); ++c)
460 {
461 new_cigar.push_back(new_right_cigar[c]);
462 }
463
464 if (color)
465 {
466 if (insert_to_prev_right > 0)
467 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
468
469 if (curr_left_to_insert > 0)
470 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
471 }
472 }
473 }
474 ++lb;
475 }
476
477 if (!found_closure)
478 {
479 return BowtieHit();
480 }
481 }
482
483 /*
484 * Stitch segments together using juctions or deletions if necessary.
485 */
486 else if (prev_hit->right() < curr_hit->left())
487 {
488 std::set<Junction>::iterator lb, ub;
489
490 int left_boundary = prev_hit->right() - 4;
491 int right_boundary = curr_hit->left() + 4;
492
493 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
494 ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
495
496 int new_diff_mismatches = 0xff;
497 while (lb != ub && lb != possible_juncs.end())
498 {
499 int dist_to_left = lb->left - prev_hit->right() + 1;
500 int dist_to_right = lb->right - curr_hit->left();
501
502 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
503 {
504 /*
505 * Check we have enough matched bases on prev or curr segment.
506 */
507 if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length )
508 {
509 ++lb;
510 continue;
511 }
512
513 Dna5String new_cmp_str, old_cmp_str;
514 int new_mismatch = 0, old_mismatch = 0;
515 string new_patch_str; // this is for colorspace reads
516 if (dist_to_left > 0)
517 {
518 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
519 old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
520
521 string new_seq;
522 if (color)
523 {
524 string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
525
526 string color, qual;
527 if (antisense)
528 {
529 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
530 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
531 }
532 else
533 {
534 color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
535 qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
536 }
537
538 BWA_decode(color, qual, ref, new_seq);
539 new_seq = new_seq.substr(1);
540 }
541
542 const string& curr_old_seq = curr_hit->seq();
543 const string& curr_seq = color ? new_seq : curr_hit->seq();
544 for (int i = 0; i < dist_to_left; ++i)
545 {
546 if (curr_seq[i] != new_cmp_str[i])
547 ++new_mismatch;
548
549 if (curr_old_seq[i] != old_cmp_str[i])
550 ++old_mismatch;
551 }
552
553 if (color)
554 new_patch_str = curr_seq.substr(0, dist_to_left);
555 }
556 else if (dist_to_left < 0)
557 {
558 new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
559 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
560
561 size_t abs_dist = -dist_to_left;
562 string new_seq;
563 if (color)
564 {
565 string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
566 ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
567
568 string color, qual;
569 if (antisense)
570 {
571 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
572 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
573 }
574 else
575 {
576 color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
577 qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
578 }
579
580 BWA_decode(color, qual, ref, new_seq);
581 new_seq = new_seq.substr(1);
582 }
583
584 const string& prev_old_seq = prev_hit->seq();
585 size_t prev_old_seq_len = prev_old_seq.length();
586 const string& prev_seq = color ? new_seq : prev_hit->seq();
587 size_t prev_seq_len = prev_seq.length();
588 for (size_t i = 0; i < abs_dist; ++i)
589 {
590 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
591 ++new_mismatch;
592 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
593 ++old_mismatch;
594 }
595
596 if (color)
597 new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
598 }
599
600 int temp_diff_mismatches = new_mismatch - old_mismatch;
601 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
602 {
603 ++lb;
604 continue;
605 }
606
607 if (color)
608 {
609 /*
610 * We need to recover the origianl sequence.
611 */
612 if (found_closure)
613 {
614 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
615 prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
616 }
617
618 if (dist_to_left > 0)
619 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
620 else if (dist_to_left < 0)
621 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
622 }
623
624 new_diff_mismatches = temp_diff_mismatches;
625
626 new_left = prev_hit->left();
627 new_cigar = prev_hit->cigar();
628
629 int new_left_back_len = new_cigar.back().length;
630 new_left_back_len += dist_to_left;
631
632 vector<CigarOp> new_right_cig = curr_hit->cigar();
633 int new_right_front_len = new_right_cig.front().length;
634 new_right_front_len -= dist_to_right;
635 if (new_left_back_len > 0)
636 new_cigar.back().length = new_left_back_len;
637 else
638 new_cigar.pop_back();
639
640 /*
641 * FIXME, currently just differentiating between a deletion and a
642 * reference skip based on length. However, would probably be better
643 * to denote the difference explicitly, this would allow the user
644 * to supply their own (very large) deletions
645 */
646 if ((lb->right - lb->left - 1) <= max_deletion_length)
647 {
648 new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
649 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
650 }
651 else
652 {
653 new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
654 antisense_closure = lb->antisense;
655 }
656
657 new_right_cig.front().length = new_right_front_len;
658 size_t c = new_right_front_len > 0 ? 0 : 1;
659 for (; c < new_right_cig.size(); ++c)
660 new_cigar.push_back(new_right_cig[c]);
661
662 mismatch = new_diff_mismatches;
663 found_closure = true;
664 }
665 ++lb;
666 }
667
668 if (!found_closure)
669 {
670 return BowtieHit();
671 }
672 }
673
674 if (found_closure)
675 {
676 bool end = false;
677 BowtieHit merged_hit(reference_id,
678 insert_id,
679 new_left,
680 new_cigar,
681 antisense,
682 antisense_closure,
683 prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
684 prev_hit->splice_mms() + curr_hit->splice_mms(),
685 end);
686
687 if (curr_seg_index > 1)
688 merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length));
689 else
690 merged_hit.seq(seq.substr(0, first_seg_length + segment_length));
691
692 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
693 /*
694 * prev_hit now points PAST the last element removed
695 */
696 prev_hit = hit_chain.insert(prev_hit, merged_hit);
697 /*
698 * merged_hit has been inserted before the old position of
699 * prev_hit. New location of prev_hit is merged_hit
700 */
701 curr_hit = prev_hit;
702 ++curr_hit;
703 ++curr_seg_index;
704 continue;
705 }
706
707 ++prev_hit;
708 ++curr_hit;
709 ++curr_seg_index;
710 }
711
712 bool saw_antisense_splice = false;
713 bool saw_sense_splice = false;
714 vector<CigarOp> long_cigar;
715 int num_mismatches = 0;
716 int num_splice_mms = 0;
717 for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
718 {
719 num_mismatches += s->edit_dist();
720 num_splice_mms += s->splice_mms();
721
722 /*
723 * Check whether the sequence contains any reference skips. Previously,
724 * this was just a check to see whether the sequence was contiguous; however
725 * we don't want to count an indel event as a splice
726 */
727 bool containsSplice = s->is_spliced();
728 if (containsSplice)
729 {
730 if (s->antisense_splice())
731 {
732 if (saw_sense_splice)
733 return BowtieHit();
734 saw_antisense_splice = true;
735 }
736 else
737 {
738 if (saw_antisense_splice)
739 return BowtieHit();
740 saw_sense_splice = true;
741 }
742 }
743 const vector<CigarOp>& cigar = s->cigar();
744 if (long_cigar.empty())
745 {
746 long_cigar = cigar;
747 }
748 else
749 {
750 CigarOp& last = long_cigar.back();
751 /*
752 * If necessary, merge the back and front
753 * cigar operations
754 */
755 if(last.opcode == cigar[0].opcode){
756 last.length += cigar[0].length;
757 for (size_t b = 1; b < cigar.size(); ++b)
758 {
759 long_cigar.push_back(cigar[b]);
760 }
761 }else{
762 for(size_t b = 0; b < cigar.size(); ++b)
763 {
764 long_cigar.push_back(cigar[b]);
765 }
766 }
767 }
768 }
769
770 bool end = false;
771 BowtieHit new_hit(reference_id,
772 insert_id,
773 left,
774 long_cigar,
775 antisense,
776 saw_antisense_splice,
777 num_mismatches,
778 num_splice_mms,
779 end);
780
781 new_hit.seq(seq);
782 new_hit.qual(qual);
783
784 int new_read_len = new_hit.read_len();
785 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
786 {
787 fprintf(stderr, "Warning: malformed closure\n");
788 return BowtieHit();
789 }
790
791 return new_hit;
792 }
793
794 int multi_closure = 0;
795 int anchor_too_short = 0;
796 int gap_too_short = 0;
797
798 bool valid_hit(const BowtieHit& bh)
799 {
800 if (bh.insert_id())
801 {
802 /*
803 * validate the cigar chain - no gaps shorter than an intron, etc.
804 * also,
805 * -Don't start or end with an indel or refskip
806 * -Only a match operation is allowed is allowed
807 * adjacent to an indel or refskip
808 * -Indels should confrom to length restrictions
809 */
810 const CigarOp* prevCig = &(bh.cigar()[0]);
811 const CigarOp* currCig = &(bh.cigar()[1]);
812 for (size_t i = 1; i < bh.cigar().size(); ++i){
813 currCig = &(bh.cigar()[i]);
814 if(currCig->opcode != MATCH && prevCig->opcode != MATCH){
815 return false;
816 }
817 if(currCig->opcode == INS){
818 if(currCig->length > max_insertion_length){
819 return false;
820 }
821 }
822 if(currCig->opcode == DEL){
823 if(currCig->length > max_deletion_length){
824 return false;
825 }
826 }
827 if(currCig->opcode == REF_SKIP){
828 if(currCig->length < (uint64_t)min_report_intron_length){
829 gap_too_short++;
830 return false;
831 }
832 }
833 prevCig = currCig;
834 }
835 if (bh.cigar().front().opcode != MATCH ||
836 bh.cigar().back().opcode != MATCH /* ||
837 (int)bh.cigar().front().length < min_anchor_len||
838 (int)bh.cigar().back().length < min_anchor_len*/ )
839 {
840 anchor_too_short++;
841 return false;
842 }
843 }
844 else
845 {
846 multi_closure++;
847 return false;
848 }
849
850 return true;
851 }
852
853 void merge_segment_chain(RefSequenceTable& rt,
854 const string& read_seq,
855 const string& read_quals,
856 std::set<Junction>& possible_juncs,
857 std::set<Insertion>& possible_insertions,
858 vector<BowtieHit>& hits,
859 vector<BowtieHit>& merged_hits)
860 {
861 if (hits.size() == 0)
862 return;
863
864 BowtieHit bh;
865 if (hits.size() > 1)
866 {
867 list<BowtieHit> hit_chain;
868
869 if (hits.front().antisense_align())
870 copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
871 else
872 copy(hits.begin(), hits.end(), back_inserter(hit_chain));
873
874 bh = merge_chain(rt, read_seq, read_quals, possible_juncs, possible_insertions, hit_chain);
875 }
876 else
877 {
878 bh = hits[0];
879 }
880
881 if (valid_hit(bh))
882 merged_hits.push_back(bh);
883 }
884
885 bool dfs_seg_hits(RefSequenceTable& rt,
886 const string& read_seq,
887 const string& read_quals,
888 std::set<Junction>& possible_juncs,
889 std::set<Insertion>& possible_insertions,
890 vector<HitsForRead>& seg_hits_for_read,
891 size_t curr,
892 vector<BowtieHit>& seg_hit_stack,
893 vector<BowtieHit>& joined_hits)
894 {
895 assert (!seg_hit_stack.empty());
896 bool join_success = false;
897
898 if (curr < seg_hits_for_read.size())
899 {
900 for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
901 {
902 BowtieHit& bh = seg_hits_for_read[curr].hits[i];
903 BowtieHit& back = seg_hit_stack.back();
904
905 bool consistent_sense = bh.antisense_align() == back.antisense_align();
906 bool same_contig = bh.ref_id() == back.ref_id();
907
908 // FIXME: when we have stranded reads, we need to fix this condition
909 //bool consistent_strand = (bh.contiguous() || back.contiguous() ||
910 // (bh.antisense_splice() == back.antisense_splice()));
911 if (consistent_sense && same_contig /*&& consistent_strand*/)
912 {
913 if (bh.antisense_align())
914 {
915 unsigned int bh_r = bh.right();
916 unsigned int back_left = seg_hit_stack.back().left();
917 if ((bh_r + max_report_intron_length >= back_left &&
918 back_left >= bh_r) || (back_left + max_insertion_length >= bh_r && back_left < bh_r))
919 {
920 // these hits are compatible, so push bh onto the
921 // stack, recurse, and pop it when done.
922 seg_hit_stack.push_back(bh);
923 bool success = dfs_seg_hits(rt,
924 read_seq,
925 read_quals,
926 possible_juncs,
927 possible_insertions,
928 seg_hits_for_read,
929 curr + 1,
930 seg_hit_stack,
931 joined_hits);
932 if (success)
933 join_success = true;
934
935 seg_hit_stack.pop_back();
936 }
937 }
938 else
939 {
940 unsigned int bh_l = bh.left();
941
942 unsigned int back_right = seg_hit_stack.back().right();
943 if ((back_right + max_report_intron_length >= bh_l &&
944 bh_l >= back_right) || (bh_l + max_insertion_length >= back_right && bh_l < back_right))
945 {
946 // these hits are compatible, so push bh onto the
947 // stack, recurse, and pop it when done.
948 seg_hit_stack.push_back(bh);
949 bool success = dfs_seg_hits(rt,
950 read_seq,
951 read_quals,
952 possible_juncs,
953 possible_insertions,
954 seg_hits_for_read,
955 curr + 1,
956 seg_hit_stack,
957 joined_hits);
958
959 if (success)
960 join_success = true;
961
962 seg_hit_stack.pop_back();
963 }
964 }
965 }
966 }
967 }
968 else
969 {
970 merge_segment_chain(rt,
971 read_seq,
972 read_quals,
973 possible_juncs,
974 possible_insertions,
975 seg_hit_stack,
976 joined_hits);
977 return join_success = true;
978 }
979 return join_success;
980 }
981
982 bool join_segments_for_read(RefSequenceTable& rt,
983 const string& read_seq,
984 const string& read_quals,
985 std::set<Junction>& possible_juncs,
986 std::set<Insertion>& possible_insertions,
987 vector<HitsForRead>& seg_hits_for_read,
988 vector<BowtieHit>& joined_hits)
989 {
990 vector<BowtieHit> seg_hit_stack;
991 bool join_success = false;
992
993 for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
994 {
995 BowtieHit& bh = seg_hits_for_read[0].hits[i];
996 seg_hit_stack.push_back(bh);
997 bool success = dfs_seg_hits(rt,
998 read_seq,
999 read_quals,
1000 possible_juncs,
1001 possible_insertions,
1002 seg_hits_for_read,
1003 1,
1004 seg_hit_stack,
1005 joined_hits);
1006 if (success)
1007 join_success = true;
1008 seg_hit_stack.pop_back();
1009 }
1010
1011 return join_success;
1012 }
1013
1014 void join_segment_hits(GBamWriter& bam_writer, std::set<Junction>& possible_juncs,
1015 std::set<Insertion>& possible_insertions,
1016 ReadTable& unmapped_reads,
1017 RefSequenceTable& rt,
1018 FILE* reads_file,
1019 vector<HitStream>& contig_hits,
1020 vector<HitStream>& spliced_hits)
1021 {
1022 uint32_t curr_contig_obs_order = 0xFFFFFFFF;
1023 HitStream* first_seg_contig_stream = NULL;
1024 uint64_t next_contig_id = 0;
1025
1026 if (contig_hits.size())
1027 {
1028 first_seg_contig_stream = &(contig_hits.front());
1029 next_contig_id = first_seg_contig_stream->next_group_id();
1030 curr_contig_obs_order = unmapped_reads.observation_order(next_contig_id);
1031 }
1032
1033 HitsForRead curr_hit_group;
1034
1035 uint32_t curr_spliced_obs_order = 0xFFFFFFFF;
1036 HitStream* first_seg_spliced_stream = NULL;
1037 uint64_t next_spliced_id = 0;
1038
1039 if (spliced_hits.size())
1040 {
1041 first_seg_spliced_stream = &(spliced_hits.front());
1042 next_spliced_id = first_seg_spliced_stream->next_group_id();
1043 curr_spliced_obs_order = unmapped_reads.observation_order(next_spliced_id);
1044 }
1045
1046 while(curr_contig_obs_order != 0xFFFFFFFF ||
1047 curr_spliced_obs_order != 0xFFFFFFFF)
1048 {
1049 uint32_t read_in_process;
1050 vector<HitsForRead> seg_hits_for_read;
1051 seg_hits_for_read.resize(contig_hits.size());
1052
1053 if (curr_contig_obs_order < curr_spliced_obs_order)
1054 {
1055 first_seg_contig_stream->next_read_hits(curr_hit_group);
1056 seg_hits_for_read.front() = curr_hit_group;
1057
1058 next_contig_id = first_seg_contig_stream->next_group_id();
1059 uint32_t next_order = unmapped_reads.observation_order(next_contig_id);
1060
1061 read_in_process = curr_contig_obs_order;
1062 curr_contig_obs_order = next_order;
1063 }
1064 else if (curr_spliced_obs_order < curr_contig_obs_order)
1065 {
1066 first_seg_spliced_stream->next_read_hits(curr_hit_group);
1067 seg_hits_for_read.front() = curr_hit_group;
1068
1069 next_spliced_id = first_seg_spliced_stream->next_group_id();
1070 uint32_t next_order = unmapped_reads.observation_order(next_spliced_id);
1071
1072 read_in_process = curr_spliced_obs_order;
1073 curr_spliced_obs_order = next_order;
1074 }
1075 else if (curr_contig_obs_order == curr_spliced_obs_order &&
1076 curr_contig_obs_order != 0xFFFFFFFF &&
1077 curr_spliced_obs_order != 0xFFFFFFFF)
1078 {
1079 first_seg_contig_stream->next_read_hits(curr_hit_group);
1080
1081 HitsForRead curr_spliced_group;
1082 first_seg_spliced_stream->next_read_hits(curr_spliced_group);
1083
1084 curr_hit_group.hits.insert(curr_hit_group.hits.end(),
1085 curr_spliced_group.hits.begin(),
1086 curr_spliced_group.hits.end());
1087 seg_hits_for_read.front() = curr_hit_group;
1088 read_in_process = curr_spliced_obs_order;
1089
1090 next_contig_id = first_seg_contig_stream->next_group_id();
1091 uint32_t next_order = unmapped_reads.observation_order(next_contig_id);
1092
1093 next_spliced_id = first_seg_spliced_stream->next_group_id();
1094 uint32_t next_spliced_order = unmapped_reads.observation_order(next_spliced_id);
1095
1096 curr_spliced_obs_order = next_spliced_order;
1097 curr_contig_obs_order = next_order;
1098 }
1099 else
1100 {
1101 break;
1102 }
1103
1104 if (contig_hits.size() > 1)
1105 {
1106 look_right_for_hit_group(unmapped_reads,
1107 contig_hits,
1108 0,
1109 spliced_hits,
1110 curr_hit_group,
1111 seg_hits_for_read);
1112 }
1113
1114 size_t last_non_empty = seg_hits_for_read.size() - 1;
1115 while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
1116 {
1117 --last_non_empty;
1118 }
1119
1120 seg_hits_for_read.resize(last_non_empty + 1);
1121 if (!seg_hits_for_read[last_non_empty].hits[0].end())
1122 continue;
1123
1124 if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
1125 {
1126 uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
1127 char read_name[256];
1128 char read_seq[256];
1129 char read_alt_name[256];
1130 char read_quals[256];
1131
1132 if (get_read_from_stream(insert_id,
1133 reads_file,
1134 reads_format,
1135 false,
1136 read_name,
1137 read_seq,
1138 read_alt_name,
1139 read_quals))
1140 {
1141 vector<BowtieHit> joined_hits;
1142 join_segments_for_read(rt,
1143 read_seq,
1144 read_quals,
1145 possible_juncs,
1146 possible_insertions,
1147 seg_hits_for_read,
1148 joined_hits);
1149
1150 sort(joined_hits.begin(), joined_hits.end());
1151 vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
1152 joined_hits.erase(new_end, joined_hits.end());
1153
1154 for (size_t i = 0; i < joined_hits.size(); i++)
1155 {
1156 const char* ref_name = rt.get_name(joined_hits[i].ref_id());
1157 if (color && !color_out)
1158 //print_hit(stdout, read_name, joined_hits[i], ref_name, joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
1159 print_bamhit(bam_writer, read_name, joined_hits[i], ref_name, joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
1160 else
1161 print_bamhit(bam_writer, read_name, joined_hits[i], ref_name, read_seq, read_quals, false);
1162 //print_hit(stdout, read_name, joined_hits[i], ref_name, read_seq, read_quals, false);
1163 }
1164 }
1165 else
1166 {
1167 err_die("Error: could not get read # %d from stream\n",
1168 read_in_process);
1169 }
1170 }
1171 else
1172 {
1173 //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
1174 }
1175 }
1176 }
1177
1178 void driver(GBamWriter& bam_writer, istream& ref_stream,
1179 vector<FILE*>& possible_juncs_files,
1180 vector<FILE*>& possible_insertions_files,
1181 vector<FILE*>& possible_deletions_files,
1182 vector<FZPipe>& spliced_seg_files,
1183 vector<FZPipe>& seg_files,
1184 FZPipe& reads_file)
1185 {
1186 if (seg_files.size() == 0)
1187 {
1188 fprintf(stderr, "No hits to process, exiting\n");
1189 exit(0);
1190 }
1191
1192 RefSequenceTable rt(true, true);
1193 fprintf (stderr, "Loading reference sequences...\n");
1194 get_seqs(ref_stream, rt, true, false);
1195 fprintf (stderr, " reference sequences loaded.\n");
1196 ReadTable it;
1197
1198 bool need_seq = true;
1199 bool need_qual = color;
1200 //rewind(reads_file);
1201 reads_file.rewind();
1202
1203 //vector<HitTable> seg_hits;
1204 vector<HitStream> contig_hits;
1205 vector<HitStream> spliced_hits;
1206
1207 vector<HitFactory*> factories;
1208 for (size_t i = 0; i < seg_files.size(); ++i)
1209 {
1210 HitFactory* fac = new BowtieHitFactory(it, rt);
1211 factories.push_back(fac);
1212 HitStream hs(seg_files[i].file, fac, false, false, false, need_seq, need_qual);
1213 contig_hits.push_back(hs);
1214 }
1215
1216 fprintf(stderr, "Loading spliced hits...");
1217
1218 //vector<vector<pair<uint32_t, HitsForRead> > > spliced_hits;
1219 for (size_t i = 0; i < spliced_seg_files.size(); ++i)
1220 {
1221 int anchor_length = 0;
1222
1223 // we allow read alignments even 1bp overlapping with juctions.
1224 // if (i == 0 || i == spliced_seg_files.size() - 1)
1225 // anchor_length = min_anchor_len;
1226
1227 HitFactory* fac = new SplicedBowtieHitFactory(it,
1228 rt,
1229 anchor_length);
1230 factories.push_back(fac);
1231
1232 HitStream hs(spliced_seg_files[i].file, fac, true, false, false, need_seq, need_qual);
1233 spliced_hits.push_back(hs);
1234
1235 }
1236 fprintf(stderr, "done\n");
1237
1238 fprintf(stderr, "Loading junctions...");
1239 std::set<Junction> possible_juncs;
1240
1241 for (size_t i = 0; i < possible_juncs_files.size(); ++i)
1242 {
1243 char buf[2048];
1244 while(!feof(possible_juncs_files[i]) &&
1245 fgets(buf, sizeof(buf), possible_juncs_files[i]))
1246 {
1247 char junc_ref_name[256];
1248 int left;
1249 int right;
1250 char orientation;
1251 int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation);
1252 if (ret != 4)
1253 continue;
1254 uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0);
1255 possible_juncs.insert(Junction(ref_id, left, right, orientation == '-'));
1256 }
1257 }
1258 fprintf(stderr, "done\n");
1259
1260
1261 fprintf(stderr, "Loading deletions...");
1262
1263 for (size_t i = 0; i < possible_deletions_files.size(); ++i)
1264 {
1265 char splice_buf[2048];
1266 FILE* deletions_file = possible_deletions_files[i];
1267 if(!deletions_file){
1268 continue;
1269 }
1270 while(fgets(splice_buf, 2048, deletions_file)){
1271 char* nl = strrchr(splice_buf, '\n');
1272 char* buf = splice_buf;
1273 if (nl) *nl = 0;
1274
1275 char* ref_name = get_token((char**)&buf, "\t");
1276 char* scan_left_coord = get_token((char**)&buf, "\t");
1277 char* scan_right_coord = get_token((char**)&buf, "\t");
1278
1279 if (!scan_left_coord || !scan_right_coord)
1280 {
1281 err_die("Error: malformed deletion coordinate record\n");
1282 }
1283
1284 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
1285 uint32_t left_coord = atoi(scan_left_coord);
1286 uint32_t right_coord = atoi(scan_right_coord);
1287 possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false));
1288 }
1289 }
1290 fprintf(stderr, "done\n");
1291
1292
1293 /*
1294 * Read the insertions from the list of insertion
1295 * files into a set
1296 */
1297 std::set<Insertion> possible_insertions;
1298 for (size_t i=0; i < possible_insertions_files.size(); ++i)
1299 {
1300 char splice_buf[2048];
1301 FILE* insertions_file = possible_insertions_files[i];
1302 if(!insertions_file){
1303 continue;
1304 }
1305 while(fgets(splice_buf, 2048, insertions_file)){
1306 char* nl = strrchr(splice_buf, '\n');
1307 char* buf = splice_buf;
1308 if (nl) *nl = 0;
1309
1310 char* ref_name = get_token((char**)&buf, "\t");
1311 char* scan_left_coord = get_token((char**)&buf, "\t");
1312 char* scan_right_coord = get_token((char**)&buf, "\t");
1313 char* scan_sequence = get_token((char**)&buf, "\t");
1314
1315 if (!scan_left_coord || !scan_sequence || !scan_right_coord)
1316 {
1317 err_die("Error: malformed insertion coordinate record\n");
1318 }
1319
1320 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
1321 uint32_t left_coord = atoi(scan_left_coord);
1322 std::string sequence(scan_sequence);
1323 possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
1324 }
1325 }
1326
1327 join_segment_hits(bam_writer, possible_juncs, possible_insertions, it, rt, reads_file.file, contig_hits, spliced_hits);
1328 //join_segment_hits(possible_juncs, possible_insertions, it, rt, reads_file.file, contig_hits, spliced_hits);
1329
1330 for (size_t i = 0; i < seg_files.size(); ++i)
1331 seg_files[i].close();
1332 for (size_t i = 0; i < spliced_seg_files.size(); ++i)
1333 spliced_seg_files[i].close();
1334 reads_file.close();
1335
1336 for (size_t fac = 0; fac < factories.size(); ++fac)
1337 {
1338 delete factories[fac];
1339 }
1340 } //driver
1341
1342 int main(int argc, char** argv)
1343 {
1344 fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1345 fprintf(stderr, "--------------------------------------------\n");
1346
1347 int parse_ret = parse_options(argc, argv, print_usage);
1348 if (parse_ret)
1349 return parse_ret;
1350
1351 if(optind >= argc)
1352 {
1353 print_usage();
1354 return 1;
1355 }
1356
1357 string ref_file_name = argv[optind++];
1358
1359 if(optind >= argc)
1360 {
1361 print_usage();
1362 return 1;
1363 }
1364
1365
1366 string reads_file_name = argv[optind++];
1367
1368 if(optind >= argc)
1369 {
1370 print_usage();
1371 return 1;
1372 }
1373
1374 string juncs_file_list = argv[optind++];
1375
1376 if(optind >= argc)
1377 {
1378 print_usage();
1379 return 1;
1380 }
1381
1382 string insertions_file_list = argv[optind++];
1383 if(optind >= argc)
1384 {
1385 print_usage();
1386 return 1;
1387 }
1388
1389 string deletions_file_list = argv[optind++];
1390
1391 if(optind >= argc)
1392 {
1393 print_usage();
1394 return 1;
1395 }
1396
1397
1398 string segment_file_list = argv[optind++];
1399 string spliced_segment_file_list;
1400 if(optind < argc)
1401 {
1402 spliced_segment_file_list = argv[optind++];
1403 }
1404
1405 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1406 if (!ref_stream.good())
1407 err_die("Error: cannot open %s for reading\n",ref_file_name.c_str());
1408
1409 checkSamHeader();
1410
1411 //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
1412 vector<string> segment_file_names;
1413 tokenize(segment_file_list, ",",segment_file_names);
1414
1415 string unzcmd=getUnpackCmd(reads_file_name, spliced_segment_file_list.empty() &&
1416 segment_file_names.size()<4);
1417 FZPipe reads_file(reads_file_name, unzcmd);
1418 if (reads_file.file==NULL)
1419 err_die("Error: cannot open %s for reading\n",
1420 reads_file_name.c_str());
1421
1422 vector<string> juncs_file_names;
1423 vector<FILE*> juncs_files;
1424 tokenize(juncs_file_list, ",",juncs_file_names);
1425 for (size_t i = 0; i < juncs_file_names.size(); ++i) {
1426 //fprintf(stderr, "Opening %s for reading\n",
1427 // juncs_file_names[i].c_str());
1428 FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r");
1429 if (juncs_file == NULL) {
1430 fprintf(stderr, "Warning: cannot open %s for reading\n",
1431 juncs_file_names[i].c_str());
1432 continue;
1433 }
1434 juncs_files.push_back(juncs_file);
1435 }
1436
1437 /*
1438 * Read in the deletion file names
1439 */
1440 vector<string> deletions_file_names;
1441 vector<FILE*> deletions_files;
1442 tokenize(deletions_file_list, ",",deletions_file_names);
1443 for (size_t i = 0; i < deletions_file_names.size(); ++i)
1444 {
1445 //fprintf(stderr, "Opening %s for reading\n",
1446 // deletions_file_names[i].c_str());
1447 FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r");
1448 if (deletions_file == NULL) {
1449 fprintf(stderr, "Warning: cannot open %s for reading\n",
1450 deletions_file_names[i].c_str());
1451 continue;
1452 }
1453 deletions_files.push_back(deletions_file);
1454 }
1455
1456 /*
1457 * Read in the list of filenames that contain
1458 * insertion coordinates
1459 */
1460
1461 vector<string> insertions_file_names;
1462 vector<FILE*> insertions_files;
1463 tokenize(insertions_file_list, ",",insertions_file_names);
1464 for (size_t i = 0; i < insertions_file_names.size(); ++i)
1465 {
1466 //fprintf(stderr, "Opening %s for reading\n",
1467 // insertions_file_names[i].c_str());
1468 FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r");
1469 if (insertions_file == NULL)
1470 {
1471 fprintf(stderr, "Warning: cannot open %s for reading\n",
1472 insertions_file_names[i].c_str());
1473 continue;
1474 }
1475 insertions_files.push_back(insertions_file);
1476 }
1477
1478 //vector<FILE*> segment_files;
1479 vector<FZPipe> segment_files;
1480 for (size_t i = 0; i < segment_file_names.size(); ++i)
1481 {
1482 fprintf(stderr, "Opening %s for reading\n",
1483 segment_file_names[i].c_str());
1484 //FILE* seg_file = fopen(segment_file_names[i].c_str(), "r");
1485 FZPipe seg_file(segment_file_names[i], unzcmd);
1486 if (seg_file.file == NULL)
1487 {
1488 err_die("Error: cannot open %s for reading\n",
1489 segment_file_names[i].c_str());
1490 }
1491 segment_files.push_back(seg_file);
1492 }
1493
1494 vector<string> spliced_segment_file_names;
1495 //vector<FILE*> spliced_segment_files;
1496 vector<FZPipe> spliced_segment_files;
1497 tokenize(spliced_segment_file_list, ",",spliced_segment_file_names);
1498 for (size_t i = 0; i < spliced_segment_file_names.size(); ++i)
1499 {
1500 fprintf(stderr, "Opening %s for reading\n",
1501 spliced_segment_file_names[i].c_str());
1502 //FILE* spliced_seg_file = fopen(spliced_segment_file_names[i].c_str(), "r");
1503 FZPipe spliced_seg_file(spliced_segment_file_names[i], unzcmd);
1504 if (spliced_seg_file.file == NULL)
1505 {
1506 err_die("Error: cannot open %s for reading\n",
1507 spliced_segment_file_names[i].c_str());
1508 }
1509 spliced_segment_files.push_back(spliced_seg_file);
1510 }
1511
1512 if (spliced_segment_files.size())
1513 {
1514 if (spliced_segment_files.size() != segment_files.size())
1515 {
1516 err_die("Error: each segment file must have a corresponding spliced segment file\n");
1517 }
1518 }
1519 GBamWriter bam_writer("-", sam_header.c_str());
1520 driver(bam_writer, ref_stream, juncs_files, insertions_files, deletions_files, spliced_segment_files, segment_files, reads_file);
1521 //driver(ref_stream, juncs_files, insertions_files, deletions_files, spliced_segment_files, segment_files, reads_file);
1522 return 0;
1523 }