ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
(Generate patch)
# Line 44 | Line 44
44   #include "wiggles.h"
45   #include "tokenize.h"
46   #include "reads.h"
47 + #include "coverage.h"
48  
49  
50   #include "inserts.h"
# Line 52 | Line 53
53   using namespace seqan;
54   using std::set;
55  
56 + static const JunctionSet empty_junctions;
57 + static const InsertionSet empty_insertions;
58 + static const DeletionSet empty_deletions;
59 + static const FusionSet empty_fusions;
60 + static const Coverage empty_coverage;
61 +
62   // daehwan - this is redundancy, which should be removed.
63   void get_seqs(istream& ref_stream,
64                RefSequenceTable& rt,
# Line 73 | Line 80
80      }
81   }
82  
83 + struct cmp_read_alignment
84 + {
85 +  bool operator() (const BowtieHit& l, const BowtieHit& r) const
86 +  {
87 +    return l.alignment_score() > r.alignment_score();
88 +  }
89 + };
90 +
91   void read_best_alignments(const HitsForRead& hits_for_read,
92 <                              FragmentAlignmentGrade& best_grade,
93 <                              HitsForRead& best_hits,
94 <                              const JunctionSet& gtf_junctions)
92 >                          HitsForRead& best_hits,
93 >                          const JunctionSet& gtf_junctions,
94 >                          const JunctionSet& junctions = empty_junctions,
95 >                          const InsertionSet& insertions = empty_insertions,
96 >                          const DeletionSet& deletions = empty_deletions,
97 >                          const FusionSet& fusions = empty_fusions,
98 >                          const Coverage& coverage = empty_coverage,
99 >                          bool final_report = false)
100   {
101    const vector<BowtieHit>& hits = hits_for_read.hits;
102 +
103 +  if (hits.size() >= max_multihits * 5)
104 +    return;
105 +
106    for (size_t i = 0; i < hits.size(); ++i)
107      {
108 <      if (hits[i].edit_dist()>max_read_mismatches) continue;
109 <      
110 <      FragmentAlignmentGrade g(hits[i], gtf_junctions);
111 <      
112 <      // Is the new status better than the current best one?
113 <      if (best_grade < g)
114 <      {
115 <        best_hits.hits.clear();
116 <        best_grade = g;
117 <        best_hits.hits.push_back(hits[i]);
118 <      }
119 <      else if (!(g < best_grade)) // is it just as good?
120 <      {
121 <        best_grade.num_alignments++;
122 <        best_hits.hits.push_back(hits[i]);
123 <      }
108 >      if (hits[i].edit_dist() > max_read_mismatches)
109 >        continue;
110 >
111 >      BowtieHit hit = hits[i];
112 >      AlignStatus align_status(hit, gtf_junctions,
113 >                               junctions, insertions, deletions, fusions, coverage);
114 >      hit.alignment_score(align_status._alignment_score);
115 >
116 >      if (report_secondary_alignments || !final_report)
117 >        {
118 >          best_hits.hits.push_back(hit);
119 >        }
120 >      else
121 >        {
122 >          // Is the new status better than the current best one?
123 >          if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0]))
124 >            {
125 >              best_hits.hits.clear();
126 >              best_hits.hits.push_back(hit);
127 >            }
128 >          else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good?
129 >            {
130 >              best_hits.hits.push_back(hit);
131 >            }
132 >        }
133 >    }
134 >
135 >  if ((report_secondary_alignments || !final_report) && best_hits.hits.size() > 0)
136 >    {
137 >      sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment());
138 >    }
139 >
140 >  if (final_report)
141 >    {
142 >      if (best_hits.hits.size() > max_multihits)
143 >        best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
144      }
145   }
146  
147 < void pair_best_alignments(const HitsForRead& left_hits,
104 <                          const HitsForRead& right_hits,
105 <                          InsertAlignmentGrade& best_grade,
106 <                          HitsForRead& left_best_hits,
107 <                          HitsForRead& right_best_hits)
147 > bool set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade)
148   {
149    // max mate inner distance (genomic)
150    int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
151    if (max_mate_inner_dist == -1)
152      max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
153    
154 +  bool fusion = false;
155 +  bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
156 +  bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
157 +  if (left_fusion && right_fusion)
158 +    return false;
159 +  
160 +  fusion = left_fusion || right_fusion;
161 +  if (!fusion && lh.ref_id() != rh.ref_id())
162 +    fusion = true;
163 +  
164 +  if (!fusion && lh.ref_id() == rh.ref_id())
165 +    {
166 +      if (lh.antisense_align() == rh.antisense_align())
167 +        fusion = true;
168 +      else
169 +        {
170 +          int inner_dist = 0;
171 +          if (lh.antisense_align())
172 +            // used rh.left() instead of rh.right() for the cases,
173 +            // where reads overlap with each other or reads span introns
174 +            inner_dist = lh.left() - rh.left();
175 +          else
176 +            inner_dist = rh.left() - lh.left();
177 +          
178 +          if (inner_dist < 0 || inner_dist > (int)fusion_min_dist)
179 +            fusion = true;
180 +        }
181 +    }
182 +
183 +  grade = InsertAlignmentGrade(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion);
184 +  
185 +  return true;
186 + }
187 +
188 + struct cmp_pair_alignment
189 + {
190 +  cmp_pair_alignment(const JunctionSet& gtf_juncs) :
191 +    _gtf_juncs(&gtf_juncs) {}
192 +    
193 +  bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
194 +  {
195 +    InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, gl);
196 +    InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, gr);
197 +
198 +    bool better = gr < gl;
199 +    bool worse = gl < gr;
200 +
201 +    if (better && !worse)
202 +      return true;
203 +    else        
204 +      return false;
205 +  }
206 +
207 +  const JunctionSet* _gtf_juncs;
208 + };
209 +
210 + void pair_best_alignments(const HitsForRead& left_hits,
211 +                          const HitsForRead& right_hits,
212 +                          InsertAlignmentGrade& best_grade,
213 +                          vector<pair<BowtieHit, BowtieHit> >& best_hits,
214 +                          const JunctionSet& gtf_junctions,
215 +                          const JunctionSet& junctions = empty_junctions,
216 +                          const InsertionSet& insertions = empty_insertions,
217 +                          const DeletionSet& deletions = empty_deletions,
218 +                          const FusionSet& fusions = empty_fusions,
219 +                          const Coverage& coverage = empty_coverage,
220 +                          bool final_report = false)
221 + {
222    const vector<BowtieHit>& left = left_hits.hits;
223    const vector<BowtieHit>& right = right_hits.hits;
224 <  
224 >
225 >  if (left.size() >= max_multihits * 5 || right.size() >= max_multihits * 5)
226 >    return;
227 >
228    for (size_t i = 0; i < left.size(); ++i)
229      {
230 <      const BowtieHit& lh = left[i];
231 <      if (lh.edit_dist() > max_read_mismatches) continue;
230 >      if (left[i].edit_dist() > max_read_mismatches) continue;
231 >
232 >      BowtieHit lh = left[i];
233 >      AlignStatus align_status(lh, gtf_junctions,
234 >                               junctions, insertions, deletions, fusions, coverage);
235 >      lh.alignment_score(align_status._alignment_score);
236 >
237        for (size_t j = 0; j < right.size(); ++j)
238          {
239 <          const BowtieHit& rh = right[j];
240 <          if (rh.edit_dist() > max_read_mismatches) continue;
241 <
242 <          bool fusion = false;
243 <          if (fusion_search)
244 <            {
245 <              bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
246 <              bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
131 <              if (left_fusion && right_fusion)
132 <                continue;
133 <              
134 <              fusion = left_fusion || right_fusion;
135 <              if (!fusion && lh.ref_id() != rh.ref_id())
136 <                fusion = true;
137 <              
138 <              if (!fusion && lh.ref_id() == rh.ref_id())
139 <                {
140 <                  if (lh.antisense_align() == rh.antisense_align())
141 <                    fusion = true;
142 <                  else
143 <                    {
144 <                      int inter_dist = 0;
145 <                      if (lh.antisense_align())
146 <                        inter_dist = lh.left() - rh.right();
147 <                      else
148 <                        inter_dist = rh.left() - lh.right();
149 <                      
150 <                      if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist)
151 <                        fusion = true;
152 <                    }
153 <                }
154 <            }
239 >          if (right[j].edit_dist() > max_read_mismatches) continue;
240 >          
241 >          BowtieHit rh = right[j];
242 >          AlignStatus align_status(rh, gtf_junctions,
243 >                                   junctions, insertions, deletions, fusions, coverage);
244 >          rh.alignment_score(align_status._alignment_score);
245 >          InsertAlignmentGrade g;
246 >          bool allowed = set_insert_alignment_grade(lh, rh, g);
247  
248 <          InsertAlignmentGrade g(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion);
248 >          if (!allowed) continue;
249 >          if (!fusion_search && !report_discordant_pair_alignments && g.is_fusion()) continue;
250  
251 <          // Is the new status better than the current best one?
159 <          if (best_grade < g)
251 >          if (report_secondary_alignments || !final_report)
252              {
253 <              left_best_hits.hits.clear();
162 <              right_best_hits.hits.clear();
163 <              
164 <              best_grade = g;
165 <              left_best_hits.hits.push_back(lh);
166 <              right_best_hits.hits.push_back(rh);
253 >              best_hits.push_back(make_pair(lh, rh));
254              }
255 <          else if (!(g < best_grade))
255 >          else
256              {
257 <              left_best_hits.hits.push_back(lh);
258 <              right_best_hits.hits.push_back(rh);
259 <              best_grade.num_alignments++;
257 >              // Is the new status better than the current best one?
258 >              if (best_grade < g)
259 >                {
260 >                  best_hits.clear();
261 >                  best_grade = g;
262 >                  best_hits.push_back(make_pair(lh, rh));
263 >                }
264 >              else if (!(g < best_grade))
265 >                {
266 >                  best_hits.push_back(make_pair(lh, rh));
267 >                }
268              }
269          }
270      }
271 +
272 +  if ((report_secondary_alignments || !final_report) && best_hits.size() > 0)
273 +    {
274 +      cmp_pair_alignment cmp(gtf_junctions);
275 +      sort(best_hits.begin(), best_hits.end(), cmp);
276 +      set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, best_grade);
277 +    }
278 +
279 +  if (final_report)
280 +    {
281 +      if (best_hits.size() > max_multihits)
282 +        best_hits.erase(best_hits.begin() + max_multihits, best_hits.end());
283 +    }
284 +  
285 +  best_grade.num_alignments = best_hits.size();
286   }
287  
288   enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
# Line 180 | Line 290
290   void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
291                   const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
292                   int num_hits, const BowtieHit* next_hit, int hitIndex) {
293 +  bool XS_found = false;
294 +  if (sam_toks.size()>11) {
295      
296 <    if (sam_toks.size()>11) {
297 <        for (size_t i=11;i<sam_toks.size();++i) {
298 <            if (sam_toks[i].find("NH:i:")==string::npos)
299 <                auxdata.push_back(sam_toks[i]);
300 <        }
301 <        }
302 <    string aux("NH:i:");
303 <    str_appendInt(aux, num_hits);
296 >    for (size_t i=11;i<sam_toks.size();++i) {
297 >      if (sam_toks[i].find("NH:i:")==string::npos &&
298 >          sam_toks[i].find("XS:i:")==string::npos)
299 >        auxdata.push_back(sam_toks[i]);
300 >
301 >      if (sam_toks[i].find("XS:A:")!=string::npos)
302 >        XS_found = true;
303 >    }
304 >  }
305 >  string aux("NH:i:");
306 >  str_appendInt(aux, num_hits);
307 >  auxdata.push_back(aux);
308 >  if (next_hit) {
309 >    const char* nh_ref_name = "=";
310 >    nh_ref_name = rt.get_name(next_hit->ref_id());
311 >    assert (nh_ref_name != NULL);
312 >    bool same_contig=(next_hit->ref_id()==bh.ref_id());
313 >    aux="CC:Z:";
314 >    aux+= (same_contig ? "=" : nh_ref_name);
315 >    auxdata.push_back(aux);
316 >    aux="CP:i:";
317 >    int nh_gpos=next_hit->left() + 1;
318 >    str_appendInt(aux, nh_gpos);
319      auxdata.push_back(aux);
320 <    if (next_hit) {
194 <        const char* nh_ref_name = "=";
195 <        nh_ref_name = rt.get_name(next_hit->ref_id());
196 <        assert (nh_ref_name != NULL);
197 <        bool same_contig=(next_hit->ref_id()==bh.ref_id());
198 <        aux="CC:Z:";
199 <        aux+= (same_contig ? "=" : nh_ref_name);
200 <        auxdata.push_back(aux);
201 <        aux="CP:i:";
202 <        int nh_gpos=next_hit->left() + 1;
203 <        str_appendInt(aux, nh_gpos);
204 <        auxdata.push_back(aux);
205 <    } //has next_hit
320 >  } //has next_hit
321      // FIXME: this code is still a bit brittle, because it contains no
322      // consistency check that the mates are on opposite strands - a current protocol
323      // requirement, and that the strand indicated by the alignment is consistent
324      // with the orientation of the splices (though that should be handled upstream).
325 +  if (!XS_found) {
326      const string xs_f("XS:A:+");
327      const string xs_r("XS:A:-");
328 <    if (bh.contiguous())  {
329 <        if (library_type == FR_FIRSTSTRAND) {
330 <            if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
331 <                if (bh.antisense_align())
332 <                    auxdata.push_back(xs_f);
333 <                else
334 <                    auxdata.push_back(xs_r);
335 <            }
336 <            else {
337 <                if (bh.antisense_align())
338 <                    auxdata.push_back(xs_r);
339 <                else
340 <                    auxdata.push_back(xs_f);
341 <            }
342 <        }
343 <        else if (library_type == FR_SECONDSTRAND)   {
344 <            if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
345 <                if (bh.antisense_align())
346 <                    auxdata.push_back(xs_r);
347 <                else
348 <                    auxdata.push_back(xs_f);
349 <            }
350 <            else
351 <            {
352 <                if (bh.antisense_align())
353 <                    auxdata.push_back(xs_f);
354 <                else
355 <                    auxdata.push_back(xs_r);
356 <            }
357 <        }
358 <    } //bh.contiguous()
243 <    if (hitIndex >= 0)
328 >    if (library_type == FR_FIRSTSTRAND) {
329 >      if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
330 >        if (bh.antisense_align())
331 >          auxdata.push_back(xs_f);
332 >        else
333 >          auxdata.push_back(xs_r);
334 >      }
335 >      else {
336 >        if (bh.antisense_align())
337 >          auxdata.push_back(xs_r);
338 >        else
339 >          auxdata.push_back(xs_f);
340 >      }
341 >    }
342 >    else if (library_type == FR_SECONDSTRAND)   {
343 >      if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
344 >        if (bh.antisense_align())
345 >          auxdata.push_back(xs_r);
346 >        else
347 >          auxdata.push_back(xs_f);
348 >      }
349 >      else
350 >        {
351 >          if (bh.antisense_align())
352 >            auxdata.push_back(xs_f);
353 >          else
354 >            auxdata.push_back(xs_r);
355 >        }
356 >    }
357 >  }
358 >  if (hitIndex >= 0)
359      {
360 <        string aux("HI:i:");
361 <        str_appendInt(aux, hitIndex);
362 <        auxdata.push_back(aux);
360 >      string aux("HI:i:");
361 >      str_appendInt(aux, hitIndex);
362 >      auxdata.push_back(aux);
363      }
364   }
365  
# Line 253 | Line 368
368                          const BowtieHit& bh,
369                          const char* bwt_buf,
370                          const char* read_alt_name,
256                        const FragmentAlignmentGrade& grade,
371                          FragmentType insert_side,
372                          int num_hits,
373                          const BowtieHit* next_hit,
# Line 266 | Line 380
380    tokenize(bwt_buf, "\t", sam_toks);
381  
382    string ref_name = sam_toks[2], ref_name2 = "";
383 <  char rebuf2[2048] = {0};
270 <  char cigar1[256] = {0}, cigar2[256] = {0};
383 >  char cigar1[1024] = {0}, cigar2[1024] = {0};
384    string left_seq, right_seq, left_qual, right_qual;
385    int left = -1, left1 = -1, left2 = -1;
386    bool fusion_alignment = false;
387    size_t XF_index = 0;
388    for (size_t i = 11; i < sam_toks.size(); ++i)
389      {
390 <      const string& tok = sam_toks[i];
390 >      string& tok = sam_toks[i];
391        if (strncmp(tok.c_str(), "XF", 2) == 0)
392          {
393            fusion_alignment = true;
# Line 296 | Line 409
409            
410            break;
411          }
412 +
413 +      else if (strncmp(tok.c_str(), "AS", 2) == 0)
414 +        {
415 +          char AS_score[128] = {0};
416 +          sprintf(AS_score, "AS:i:%d", bh.alignment_score());
417 +          tok = AS_score;
418 +        }
419      }
420    
421    string qname(read_alt_name);
# Line 322 | Line 442
442    
443    int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
444    int mapQ=255;
445 <  if (grade.num_alignments > 1)  {
446 <    double err_prob = 1 - (1.0 / grade.num_alignments);
445 >  if (num_hits > 1)  {
446 >    double err_prob = 1 - (1.0 / num_hits);
447      mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
448    }
449    int tlen =atoi(sam_toks[8].c_str()); //TLEN
# Line 394 | Line 514
514      flag |= 0x100;
515  
516    string ref_name = sam_toks[2], ref_name2 = "";
517 <  char rebuf2[2048] = {0};
398 <  char cigar1[256] = {0}, cigar2[256] = {0};
517 >  char cigar1[1024] = {0}, cigar2[1024] = {0};
518    string left_seq, right_seq, left_qual, right_qual;
519    int left = -1, left1 = -1, left2 = -1;
520    bool fusion_alignment = false;
521    size_t XF_tok_idx = 11;
522    for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx)
523      {
524 <      const string& tok = sam_toks[XF_tok_idx];
524 >      string& tok = sam_toks[XF_tok_idx];
525        if (strncmp(tok.c_str(), "XF", 2) == 0)
526          {
527            fusion_alignment = true;
# Line 423 | Line 542
542            
543            break;
544          }
545 +
546 +      else if (strncmp(tok.c_str(), "AS", 2) == 0)
547 +        {
548 +          char AS_score[128] = {0};
549 +          sprintf(AS_score, "AS:i:%d", bh.alignment_score());
550 +          tok = AS_score;
551 +        }
552      }
553    
554    int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
# Line 519 | Line 645
645  
646   void print_sam_for_single(const RefSequenceTable& rt,
647                            const HitsForRead& hits,
648 <                        const FragmentAlignmentGrade& grade,
649 <                        FragmentType frag_type,
650 <                        //FILE* reads_file,
651 <                        Read& read,
652 <                        GBamWriter& bam_writer,
527 <                        FILE* um_out//write unmapped reads here
528 <                        )
648 >                          FragmentType frag_type,
649 >                          Read& read,
650 >                          GBamWriter& bam_writer,
651 >                          GBamWriter* um_out //write unmapped reads here
652 >                          )
653   {
654      assert(!read.alt_name.empty());
655 +    if (hits.hits.empty())
656 +      return;
657 +    
658      lex_hit_sort s(rt, hits);
659      vector<uint32_t> index_vector;
660  
# Line 537 | Line 664
664      
665      sort(index_vector.begin(), index_vector.end(), s);
666  
667 <    size_t primaryHit = (hits.hits.empty()? 0: random() % hits.hits.size());
667 >    size_t primaryHit = 0;
668 >    if (!report_secondary_alignments)
669 >      primaryHit = random() % hits.hits.size();
670 >    
671      bool multipleHits = (hits.hits.size() > 1);
672      for (i = 0; i < hits.hits.size(); ++i)
673        {
544        bool primary = (i == primaryHit);
674          size_t index = index_vector[i];
675 +        bool primary = (index == primaryHit);
676          const BowtieHit& bh = hits.hits[index];
677          rewrite_sam_record(bam_writer, rt,
678                             bh,
679                             bh.hitfile_rec().c_str(),
680                             read.alt_name.c_str(),
551                           grade,
681                             frag_type,
682                             hits.hits.size(),
683                             (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
# Line 558 | Line 687
687   }
688  
689   void print_sam_for_pair(const RefSequenceTable& rt,
690 <                        const HitsForRead& left_hits,
562 <                        const HitsForRead& right_hits,
690 >                        const vector<pair<BowtieHit, BowtieHit> >& best_hits,
691                          const InsertAlignmentGrade& grade,
692                          ReadStream& left_reads_file,
693                          ReadStream& right_reads_file,
694                          GBamWriter& bam_writer,
695 <                        FILE* left_um_out,
696 <                        FILE* right_um_out,
695 >                        GBamWriter* left_um_out,
696 >                        GBamWriter* right_um_out,
697                          uint64_t begin_id = 0,
698                          uint64_t end_id = std::numeric_limits<uint64_t>::max())
699   {
572    assert (left_hits.insert_id == right_hits.insert_id);
573    
700      Read left_read;
701      Read right_read;
702 <    assert (left_hits.hits.size() == right_hits.hits.size() ||
703 <            (left_hits.hits.empty() || right_hits.hits.empty()));
702 >    if (best_hits.empty())
703 >      return;
704 >
705 >    size_t i;
706 >    HitsForRead right_hits;
707 >    for (i = 0; i < best_hits.size(); ++i)
708 >      right_hits.hits.push_back(best_hits[i].second);
709      
710      size_t primaryHit = 0;
711      vector<uint32_t> index_vector;
712 <    if(right_hits.hits.size() > 0)
713 <    {
714 <      lex_hit_sort s(rt, right_hits);
584 <      for (size_t i = 0; i < right_hits.hits.size(); ++i)
585 <        index_vector.push_back(i);
586 <
587 <          sort(index_vector.begin(), index_vector.end(), s);
588 <          primaryHit = random() % right_hits.hits.size();
589 <      }
590 <    else if (left_hits.hits.size() > 0)
591 <    {
592 <          lex_hit_sort s(rt, left_hits);
593 <          for (size_t i = 0; i < left_hits.hits.size(); ++i)
594 <              index_vector.push_back(i);
595 <
596 <          sort(index_vector.begin(), index_vector.end(), s);
597 <          primaryHit = random() % left_hits.hits.size();
598 <      }
712 >    lex_hit_sort s(rt, right_hits);
713 >    for (i = 0; i < right_hits.hits.size(); ++i)
714 >      index_vector.push_back(i);
715      
716 <    bool got_left_read = left_reads_file.getRead(left_hits.insert_id, left_read,
601 <                                                 reads_format, false, left_um_out,
602 <                                                 false, begin_id, end_id);
603 <    
604 <    bool got_right_read = right_reads_file.getRead(right_hits.insert_id, right_read,
605 <                                                   reads_format, false, right_um_out,
606 <                                                   false, begin_id, end_id);
607 <    
608 <    if (left_hits.hits.size() == right_hits.hits.size())
609 <    {
610 <        assert (got_left_read && got_right_read);
611 <        bool multipleHits = (left_hits.hits.size() > 1);
612 <        for (size_t i = 0; i < right_hits.hits.size(); ++i)
613 <        {
614 <            bool primary = (i == primaryHit);
615 <            size_t index = index_vector[i];
616 <            const BowtieHit& right_bh = right_hits.hits[index];
617 <            const BowtieHit& left_bh = left_hits.hits[index];
618 <            
619 <            rewrite_sam_record(bam_writer, rt,
620 <                               right_bh,
621 <                               right_bh.hitfile_rec().c_str(),
622 <                               right_read.alt_name.c_str(),
623 <                               grade,
624 <                               FRAG_RIGHT,
625 <                               &left_bh,
626 <                               right_hits.hits.size(),
627 <                               (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
628 <                               primary,
629 <                               (multipleHits? i: -1));
630 <            rewrite_sam_record(bam_writer, rt,
631 <                               left_bh,
632 <                               left_bh.hitfile_rec().c_str(),
633 <                               left_read.alt_name.c_str(),
634 <                               grade,
635 <                               FRAG_LEFT,
636 <                               &right_bh,
637 <                               left_hits.hits.size(),
638 <                               (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
639 <                               primary,
640 <                               (multipleHits? i: -1));
641 <        }
642 <    }
643 <    else if (left_hits.hits.empty())
644 <    { //only right read was mapped properly
645 <        if (right_um_out) {
646 <          //write it in the mapped file with the #MAPPED# flag
647 <          fprintf(right_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", right_read.alt_name.c_str(),
648 <                              right_read.seq.c_str(), right_read.qual.c_str());
649 <          }
650 <        for (size_t i = 0; i < right_hits.hits.size(); ++i)
651 <        {
652 <            bool primary = (i == primaryHit);
653 <            size_t index = index_vector[i];
654 <            const BowtieHit& bh = right_hits.hits[index];
655 <            
656 <            rewrite_sam_record(bam_writer, rt,
657 <                               bh,
658 <                               bh.hitfile_rec().c_str(),
659 <                               right_read.alt_name.c_str(),
660 <                               grade,
661 <                               FRAG_RIGHT,
662 <                               NULL,
663 <                               right_hits.hits.size(),
664 <                               (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
665 <                               primary,
666 <                               -1);
667 <            
668 <        }
669 <    }
670 <    else if (right_hits.hits.empty())
671 <    { //only left read was mapped properly
672 <      if (left_um_out) {
673 <        //write it in the mapped file with the #MAPPED# flag
674 <        fprintf(left_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", left_read.alt_name.c_str(), left_read.seq.c_str(),
675 <                            left_read.qual.c_str());
676 <        }
716 >    sort(index_vector.begin(), index_vector.end(), s);
717  
718 <        for (size_t i = 0; i < left_hits.hits.size(); ++i)
719 <        {
720 <            bool primary = (i == primaryHit);
721 <            size_t index = index_vector[i];
722 <            const BowtieHit& bh = left_hits.hits[index];
723 <            rewrite_sam_record(bam_writer, rt,
724 <                               bh,
725 <                               bh.hitfile_rec().c_str(),
726 <                               left_read.alt_name.c_str(),
727 <                               grade,
728 <                               FRAG_LEFT,
729 <                               NULL,
730 <                               left_hits.hits.size(),
731 <                               (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
732 <                               primary,                  
733 <                               -1);
734 <            
735 <        }
736 <    }
737 <    else
738 <    {
739 <        assert (false);
740 <    }
718 >    if (!report_secondary_alignments)
719 >      primaryHit = random() % right_hits.hits.size();
720 >    
721 >    bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), left_read,
722 >                                                 reads_format, false, begin_id, end_id,
723 >                                                 left_um_out, false);
724 >    
725 >    bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), right_read,
726 >                                                   reads_format, false, begin_id, end_id,
727 >                                                   right_um_out, false);
728 >    
729 >    assert (got_left_read && got_right_read);
730 >    bool multipleHits = (best_hits.size() > 1);
731 >    for (i = 0; i < best_hits.size(); ++i)
732 >      {
733 >        size_t index = index_vector[i];
734 >        bool primary = (index == primaryHit);
735 >        const BowtieHit& right_bh = best_hits[index].second;
736 >        const BowtieHit& left_bh = best_hits[index].first;
737 >        
738 >        rewrite_sam_record(bam_writer, rt,
739 >                           right_bh,
740 >                           right_bh.hitfile_rec().c_str(),
741 >                           right_read.alt_name.c_str(),
742 >                           grade,
743 >                           FRAG_RIGHT,
744 >                           &left_bh,
745 >                           best_hits.size(),
746 >                           (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].second) : NULL,
747 >                           primary,
748 >                           (multipleHits? i: -1));
749 >        rewrite_sam_record(bam_writer, rt,
750 >                           left_bh,
751 >                           left_bh.hitfile_rec().c_str(),
752 >                           left_read.alt_name.c_str(),
753 >                           grade,
754 >                           FRAG_LEFT,
755 >                           &right_bh,
756 >                           best_hits.size(),
757 >                           (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].first) : NULL,
758 >                           primary,
759 >                           (multipleHits? i: -1));
760 >      }
761   }
762  
763   /**
# Line 710 | Line 770
770                                       InsertionSet& insertions,
771                                       DeletionSet& deletions)
772   {
773 <        for (size_t i = 0; i < hits.hits.size(); ++i)
773 >  for (size_t i = 0; i < hits.hits.size(); ++i)
774 >    {
775 >      const BowtieHit& bh = hits.hits[i];
776 >      insertions_from_alignment(bh, insertions);
777 >      deletions_from_alignment(bh, deletions);
778 >    }
779 > }
780 >
781 > void update_coverage(const HitsForRead& hits,
782 >                     Coverage& coverage)
783 > {
784 >  for (size_t i = 0; i < hits.hits.size(); ++i)
785 >    {
786 >      const BowtieHit& hit = hits.hits[i];
787 >      const vector<CigarOp>& cigar = hit.cigar();
788 >      unsigned int positionInGenome = hit.left();
789 >      RefID ref_id = hit.ref_id();
790 >      
791 >      for(size_t c = 0; c < cigar.size(); ++c)
792          {
793 <                const BowtieHit& bh = hits.hits[i];
794 <                insertions_from_alignment(bh, insertions);
795 <                deletions_from_alignment(bh, deletions);
793 >          int opcode = cigar[c].opcode;
794 >          int length = cigar[c].length;
795 >          switch(opcode)
796 >            {
797 >            case REF_SKIP:
798 >            case MATCH:
799 >            case DEL:
800 >              if (opcode == MATCH)
801 >                coverage.add_coverage(ref_id, positionInGenome, length);
802 >      
803 >              positionInGenome += length;
804 >              break;
805 >            case rEF_SKIP:
806 >            case mATCH:
807 >            case dEL:
808 >              positionInGenome -= length;
809 >
810 >              if (opcode == mATCH)
811 >                coverage.add_coverage(ref_id, positionInGenome + 1, length);
812 >              break;
813 >            case FUSION_FF:
814 >            case FUSION_FR:
815 >            case FUSION_RF:
816 >            case FUSION_RR:
817 >                positionInGenome = length;
818 >                ref_id = hit.ref_id2();
819 >              break;
820 >            default:
821 >              break;
822 >            }  
823          }
824 +    }
825   }
826  
827  
722 FusionSet empty_fusions;
828   void update_fusions(const HitsForRead& hits,
829                      RefSequenceTable& rt,
830                      FusionSet& fusions,
831 <                    FusionSet& fusions_ref = empty_fusions)
831 >                    const FusionSet& fusions_ref = empty_fusions)
832   {
833 +  if (!fusion_search)
834 +    return;
835 +  
836    if (hits.hits.size() > fusion_multireads)
837      return;
838  
# Line 736 | Line 844
844        if (bh.edit_dist() > fusion_read_mismatches)
845          continue;
846  
739      // daehwan
740 #if 0
741      vector<Fusion> new_fusions;
742      fusions_from_spliced_hit(bh, new_fusions);
743      
744      for(size_t i = 0; i < new_fusions.size(); ++i)
745        {
746          Fusion fusion = new_fusions[i];
747          if (fusion.left == 110343414 && fusion.right == 80211173 && hits.hits.size() > 1)
748            {
749              for (size_t t = 0; t < hits.hits.size(); ++t)
750                {
751                  const BowtieHit& lh = hits.hits[t];
752                  cout << "insert id: " << lh.insert_id() << endl;
753                  cout << (lh.antisense_align() ? "-" : "+") <<  endl;
754                  cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
755                  cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl;
756                }
757            }
758        }
759 #endif
760
847        fusions_from_alignment(bh, fusions, rt, update_stat);
848  
849        if (update_stat)
# Line 768 | Line 854
854   void update_junctions(const HitsForRead& hits,
855                        JunctionSet& junctions)
856   {
857 <        for (size_t i = 0; i < hits.hits.size(); ++i)
858 <        {
859 <                const BowtieHit& bh = hits.hits[i];
860 <                junctions_from_alignment(bh, junctions);
861 <        }
857 >  for (size_t i = 0; i < hits.hits.size(); ++i)
858 >    {
859 >      const BowtieHit& bh = hits.hits[i];
860 >      junctions_from_alignment(bh, junctions);
861 >    }
862   }
863  
864   // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
# Line 785 | Line 871
871    for (size_t i = 0; i < hits.hits.size(); ++i)
872      {
873        BowtieHit& bh = hits.hits[i];
874 +      if (bh.edit_dist() > max_read_mismatches)
875 +        continue;
876 +      
877        bool filter_hit = false;
878        if (!bh.contiguous())
879          {
# Line 846 | Line 935
935            {
936              HitsForRead best_hits;
937              best_hits.insert_id = curr_left_obs_order;
849            FragmentAlignmentGrade grade;
938              
939              // Process hits for left singleton, select best alignments
940 <            read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
940 >            read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions);
941 >            update_coverage(best_hits, *coverage);
942              update_junctions(best_hits, *junctions);
943 +            update_insertions_and_deletions(best_hits, *insertions, *deletions);
944              update_fusions(best_hits, *rt, *fusions);
945 <            
945 >                        
946              // Get next hit group
947              l_hs.next_read_hits(curr_left_hit_group);
948              curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
# Line 867 | Line 957
957              best_hits.insert_id = curr_right_obs_order;
958              if (curr_right_obs_order >= begin_id)
959                {
870                FragmentAlignmentGrade grade;
960                  // Process hit for right singleton, select best alignments
961 <                read_best_alignments(curr_right_hit_group,grade, best_hits, *gtf_junctions);
961 >                read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions);
962 >                update_coverage(best_hits, *coverage);
963                  update_junctions(best_hits, *junctions);
964 +                update_insertions_and_deletions(best_hits, *insertions, *deletions);
965                  update_fusions(best_hits, *rt, *fusions);
966                }
967              
# Line 885 | Line 976
976                 curr_left_obs_order < end_id &&
977                 curr_left_obs_order != VMAXINT32)
978            {
979 <            if (curr_left_hit_group.hits.empty())
980 <              {
981 <                HitsForRead right_best_hits;
982 <                right_best_hits.insert_id = curr_right_obs_order;
983 <                
984 <                FragmentAlignmentGrade grade;
985 <                read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
986 <                update_junctions(right_best_hits, *junctions);
987 <                update_fusions(right_best_hits, *rt, *fusions);
988 <              }
989 <            else if (curr_right_hit_group.hits.empty())
979 >            vector<pair<BowtieHit, BowtieHit> > best_hits;
980 >
981 >            InsertAlignmentGrade grade;
982 >            pair_best_alignments(curr_left_hit_group,
983 >                                 curr_right_hit_group,
984 >                                 grade,
985 >                                 best_hits,
986 >                                 *gtf_junctions);
987 >
988 >            HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
989 >            HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
990 >
991 >            for (size_t i = 0; i < best_hits.size(); ++i)
992                {
993 <                HitsForRead left_best_hits;
994 <                left_best_hits.insert_id = curr_left_obs_order;
902 <                
903 <                FragmentAlignmentGrade grade;
904 <                // Process hits for left singleton, select best alignments
905 <                read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
906 <                update_junctions(left_best_hits, *junctions);
907 <                update_fusions(left_best_hits, *rt, *fusions);
993 >                best_left_hit_group.hits.push_back(best_hits[i].first);
994 >                best_right_hit_group.hits.push_back(best_hits[i].second);
995                }
909            else
910              {
911                HitsForRead left_best_hits;
912                HitsForRead right_best_hits;
913                left_best_hits.insert_id = curr_left_obs_order;
914                right_best_hits.insert_id = curr_right_obs_order;
915                
916                // daehwan - apply gtf_junctions here, too!
996  
997 <                InsertAlignmentGrade grade;
998 <                pair_best_alignments(curr_left_hit_group,
999 <                                     curr_right_hit_group,
1000 <                                     grade,
1001 <                                     left_best_hits,
1002 <                                     right_best_hits);
1003 <                update_junctions(left_best_hits, *junctions);
1004 <                update_junctions(right_best_hits, *junctions);
1005 <                update_fusions(left_best_hits, *rt, *fusions);
1006 <                update_fusions(right_best_hits, *rt, *fusions);
928 <              }
929 <            
997 >            update_coverage(best_left_hit_group, *coverage);
998 >            update_junctions(best_left_hit_group, *junctions);
999 >            update_insertions_and_deletions(best_left_hit_group, *insertions, *deletions);
1000 >            update_fusions(best_left_hit_group, *rt, *fusions);
1001 >
1002 >            update_coverage(best_right_hit_group, *coverage);
1003 >            update_junctions(best_right_hit_group, *junctions);
1004 >            update_insertions_and_deletions(best_right_hit_group, *insertions, *deletions);
1005 >            update_fusions(best_right_hit_group, *rt, *fusions);
1006 >                    
1007              l_hs.next_read_hits(curr_left_hit_group);
1008              curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1009              
# Line 944 | Line 1021
1021    string left_map_fname;
1022    string right_map_fname;
1023  
947  JunctionSet* junctions;
948  JunctionSet* gtf_junctions;
949  FusionSet* fusions;
1024    RefSequenceTable* rt;
1025  
1026 +  JunctionSet* gtf_junctions;
1027 +
1028    uint64_t begin_id;
1029    uint64_t end_id;
1030    int64_t left_map_offset;
1031    int64_t right_map_offset;
1032 +
1033 +  JunctionSet* junctions;
1034 +  InsertionSet* insertions;
1035 +  DeletionSet* deletions;
1036 +  FusionSet* fusions;
1037 +
1038 +  Coverage* coverage;
1039   };
1040  
1041   struct ReportWorker
# Line 965 | Line 1048
1048      ReadStream left_reads_file(left_reads_fname);
1049      if (left_reads_file.file() == NULL)
1050        err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str());
1051 <
1051 >    if (left_reads_file.isBam()) {
1052 >        left_reads_file.use_alt_name();
1053 >        left_reads_file.ignoreQC();
1054 >        }
1055      if (left_reads_offset > 0)
1056        left_reads_file.seek(left_reads_offset);
1057      
1058 <    if (!zpacker.empty()) left_um_fname+=".z";
1059 <    FZPipe left_um_out;
1060 <    if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL)
975 <      err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str());
1058 >    //if (!zpacker.empty()) left_um_fname+=".z";
1059 >    GBamWriter* left_um_out=new GBamWriter(left_um_fname.c_str(), sam_header.c_str());
1060 >    GBamWriter* right_um_out=NULL;
1061      
1062 +    //if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL)
1063 +    //  err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str());
1064      ReadStream right_reads_file(right_reads_fname);
1065      if (right_reads_offset > 0)
1066        right_reads_file.seek(right_reads_offset);
1067      
1068 <    FZPipe right_um_out;
1069 <    if (right_reads_fname != "")
1068 >    //FZPipe right_um_out;
1069 >    if (!right_reads_fname.empty())
1070        {
1071 <        if (!zpacker.empty()) right_um_fname+=".z";
1072 <        if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL)
1073 <          err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str());
1071 >      if (right_reads_file.isBam()) {
1072 >        right_reads_file.use_alt_name();
1073 >        right_reads_file.ignoreQC();
1074 >        right_um_out=new GBamWriter(right_um_fname.c_str(), sam_header.c_str());
1075 >      }
1076 >        //if (!zpacker.empty()) right_um_fname+=".z";
1077 >        //if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL)
1078 >        //  err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str());
1079        }
1080  
1081      vector<BAMHitFactory*> hit_factories;
# Line 1006 | Line 1098
1098      uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1099      uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1100  
1101 +    const bool final_report = true;
1102 +
1103      // While we still have unreported hits...
1104      Read l_read;
1105      Read r_read;
# Line 1019 | Line 1113
1113            {
1114              HitsForRead best_hits;
1115              best_hits.insert_id = curr_left_obs_order;
1022            FragmentAlignmentGrade grade;
1116              bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1117 <                                                  reads_format, false, left_um_out.file,
1118 <                                                  false, begin_id, end_id);
1119 <            assert(got_read);
1117 >                                                  reads_format, false, begin_id, end_id,
1118 >                                                  left_um_out, false);
1119 >            //assert(got_read);
1120  
1121              if (right_reads_file.file()) {
1029              fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1030                      l_read.seq.c_str(), l_read.qual.c_str());
1122                got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1123 <                                                reads_format, false, right_um_out.file,
1124 <                                                true, begin_id, end_id);
1125 <              assert(got_read);
1123 >                                                reads_format, false, begin_id, end_id,
1124 >                                                right_um_out, true);
1125 >              //assert(got_read);
1126              }
1127 <
1127 >    
1128              exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1129  
1130              // Process hits for left singleton, select best alignments
1131 <            read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
1131 >            read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions,
1132 >                                 *junctions, *insertions, *deletions, *fusions, *coverage,
1133 >                                 final_report);
1134  
1135 <            if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
1135 >            if (best_hits.hits.size() > 0)
1136                {
1137                  update_junctions(best_hits, *final_junctions);
1138                  update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
# Line 1047 | Line 1140
1140  
1141                  print_sam_for_single(*rt,
1142                                       best_hits,
1050                                     grade,
1143                                       (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
1144                                       l_read,
1145                                       bam_writer,
1146 <                                     left_um_out.file);
1146 >                                     left_um_out);
1147                }
1148              // Get next hit group
1149              left_hs.next_read_hits(curr_left_hit_group);
# Line 1065 | Line 1157
1157            {
1158              HitsForRead best_hits;
1159              best_hits.insert_id = curr_right_obs_order;
1068            FragmentAlignmentGrade grade;
1160  
1161              if (curr_right_obs_order >=  begin_id)
1162                {
1163                  bool got_read=right_reads_file.getRead(curr_right_obs_order, r_read,
1164 <                                                       reads_format, false, right_um_out.file,
1165 <                                                       false, begin_id, end_id);
1075 <
1076 <                assert(got_read);
1164 >                                                       reads_format, false, begin_id, end_id,
1165 >                                                       right_um_out, false);
1166  
1167 +                //assert(got_read);
1168 +                /*
1169                  fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1170                          r_read.seq.c_str(), r_read.qual.c_str());
1171 +                */
1172                  got_read=left_reads_file.getRead(curr_right_obs_order, l_read,
1173 <                                                 reads_format, false, left_um_out.file,
1174 <                                                 true, begin_id, end_id);
1175 <                assert(got_read);
1173 >                                                 reads_format, false, begin_id, end_id,
1174 >                                                 left_um_out, true);
1175 >                //assert(got_read);
1176  
1177                  exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1178  
1179                  // Process hit for right singleton, select best alignments
1180 <                read_best_alignments(curr_right_hit_group, grade, best_hits, *gtf_junctions);
1181 <                if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
1180 >                read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions,
1181 >                                     *junctions, *insertions, *deletions, *fusions, *coverage,
1182 >                                     final_report);
1183 >
1184 >                if (best_hits.hits.size() > 0)
1185                    {
1186                      update_junctions(best_hits, *final_junctions);
1187                      update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
# Line 1094 | Line 1189
1189                      
1190                      print_sam_for_single(*rt,
1191                                           best_hits,
1097                                         grade,
1192                                           FRAG_RIGHT,
1193                                           r_read,
1194                                           bam_writer,
1195 <                                         right_um_out.file);
1195 >                                         right_um_out);
1196                    }
1197                }
1198  
# Line 1120 | Line 1214
1214                  //write it in the mapped file with the #MAPPED# flag
1215                  
1216                  bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1217 <                                                       reads_format, false, right_um_out.file,
1218 <                                                       false, begin_id, end_id);
1219 <                assert(got_read);
1217 >                                                       reads_format, false, begin_id, end_id,
1218 >                                                       right_um_out, false);
1219 >                //assert(got_read);
1220 >                /*
1221                  fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1222                          r_read.seq.c_str(), r_read.qual.c_str());
1223 +                */
1224                  HitsForRead right_best_hits;
1225                  right_best_hits.insert_id = curr_right_obs_order;
1226                  
1227 <                FragmentAlignmentGrade grade;
1228 <                read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
1229 <                
1230 <                if (right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
1227 >                read_best_alignments(curr_right_hit_group, right_best_hits, *gtf_junctions,
1228 >                                     *junctions, *insertions, *deletions, *fusions, *coverage,
1229 >                                     final_report);
1230 >
1231 >                if (right_best_hits.hits.size() > 0)
1232                    {
1233                      update_junctions(right_best_hits, *final_junctions);
1234                      update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
# Line 1139 | Line 1236
1236                      
1237                      print_sam_for_single(*rt,
1238                                           right_best_hits,
1142                                         grade,
1239                                           FRAG_RIGHT,
1240                                           r_read,
1241                                           bam_writer,
1242 <                                         right_um_out.file);
1242 >                                         right_um_out);
1243                    }
1244                }
1245              else if (curr_right_hit_group.hits.empty())
# Line 1152 | Line 1248
1248                  left_best_hits.insert_id = curr_left_obs_order;
1249                  //only left read mapped
1250                  bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1251 <                                                      reads_format, false, left_um_out.file,
1252 <                                                      false, begin_id, end_id);
1253 <                assert(got_read);
1158 <                fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1159 <                        l_read.seq.c_str(), l_read.qual.c_str());
1160 <                FragmentAlignmentGrade grade;
1251 >                                                      reads_format, false, begin_id, end_id,
1252 >                                                      left_um_out, false);
1253 >
1254                  // Process hits for left singleton, select best alignments
1255 <                read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
1256 <                if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits)
1255 >                read_best_alignments(curr_left_hit_group, left_best_hits, *gtf_junctions,
1256 >                                     *junctions, *insertions, *deletions, *fusions, *coverage,
1257 >                                     final_report);
1258 >                
1259 >                if (left_best_hits.hits.size() > 0)
1260                    {
1261                      update_junctions(left_best_hits, *final_junctions);
1262                      update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
# Line 1168 | Line 1264
1264                      
1265                      print_sam_for_single(*rt,
1266                                           left_best_hits,
1171                                         grade,
1267                                           FRAG_LEFT,
1268                                           l_read,
1269                                           bam_writer,
1270 <                                         left_um_out.file);
1270 >                                         left_um_out);
1271                    }
1272                }
1273              else
1274 <              {   //hits for both left and right reads
1275 <                HitsForRead left_best_hits;
1181 <                HitsForRead right_best_hits;
1182 <                left_best_hits.insert_id = curr_left_obs_order;
1183 <                right_best_hits.insert_id = curr_right_obs_order;
1274 >              {
1275 >                vector<pair<BowtieHit, BowtieHit> > best_hits;
1276  
1277                  InsertAlignmentGrade grade;
1278                  pair_best_alignments(curr_left_hit_group,
1279                                       curr_right_hit_group,
1280 <                                     grade,
1281 <                                     left_best_hits,
1282 <                                     right_best_hits);
1280 >                                     grade, best_hits, *gtf_junctions,
1281 >                                     *junctions, *insertions, *deletions, *fusions, *coverage,
1282 >                                     final_report);
1283 >
1284 >                HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
1285 >                HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
1286                  
1287 <                if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits &&
1193 <                    right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
1287 >                for (size_t i = 0; i < best_hits.size(); ++i)
1288                    {
1289 <                    update_junctions(left_best_hits, *final_junctions);
1290 <                    update_junctions(right_best_hits, *final_junctions);
1291 <                    update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1198 <                    update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1289 >                    best_left_hit_group.hits.push_back(best_hits[i].first);
1290 >                    best_right_hit_group.hits.push_back(best_hits[i].second);
1291 >                  }
1292  
1293 <                    pair_support(left_best_hits, right_best_hits, *final_fusions, *fusions);
1294 <                    update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1295 <                    update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1293 >                if (best_hits.size() > 0)
1294 >                  {
1295 >                    update_junctions(best_left_hit_group, *final_junctions);
1296 >                    update_insertions_and_deletions(best_left_hit_group, *final_insertions, *final_deletions);
1297 >                    update_fusions(best_left_hit_group, *rt, *final_fusions, *fusions);
1298 >
1299 >                    update_junctions(best_right_hit_group, *final_junctions);
1300 >                    update_insertions_and_deletions(best_right_hit_group, *final_insertions, *final_deletions);
1301 >                    update_fusions(best_right_hit_group, *rt, *final_fusions, *fusions);
1302 >
1303 >                    pair_support(best_hits, *final_fusions, *fusions);
1304  
1305                      print_sam_for_pair(*rt,
1306 <                                       left_best_hits,
1206 <                                       right_best_hits,
1306 >                                       best_hits,
1307                                         grade,
1308                                         left_reads_file,
1309                                         right_reads_file,
1310                                         bam_writer,
1311 <                                       left_um_out.file,
1312 <                                       right_um_out.file,
1311 >                                       left_um_out,
1312 >                                       right_um_out,
1313                                         begin_id,
1314                                         end_id);
1315                    }
# Line 1223 | Line 1323
1323            }
1324        } //while we still have unreported hits..
1325      //print the remaining unmapped reads at the end of each reads' stream
1326 +
1327      left_reads_file.getRead(VMAXINT32, l_read,
1328 <                            reads_format, false, left_um_out.file,
1329 <                            false, begin_id, end_id);
1328 >                            reads_format, false, begin_id, end_id,
1329 >                            left_um_out, false);
1330      if (right_reads_file.file())
1331        right_reads_file.getRead(VMAXINT32, r_read,
1332 <                               reads_format, false, right_um_out.file,
1333 <                               false, begin_id, end_id);
1332 >                               reads_format, false, begin_id, end_id,
1333 >                               right_um_out, false);
1334 >
1335 >
1336 >    // pclose (pipe close), which waits for a process to end, seems to conflict with boost::thread::join somehow,
1337 >    // resulting in deadlock like behavior.
1338 >    delete left_um_out;
1339 >    delete right_um_out;
1340  
1234    left_um_out.close();
1235    right_um_out.close();
1341  
1342      for (size_t i = 0; i < hit_factories.size(); ++i)
1343        delete hit_factories[i];
# Line 1252 | Line 1357
1357    string right_um_fname;
1358  
1359    JunctionSet* gtf_junctions;
1360 +  
1361    JunctionSet* junctions;
1362 +  InsertionSet* insertions;
1363 +  DeletionSet* deletions;
1364    FusionSet* fusions;
1365 +  Coverage* coverage;
1366 +
1367    JunctionSet* final_junctions;
1368    InsertionSet* final_insertions;
1369    DeletionSet* final_deletions;
# Line 1293 | Line 1403
1403    JunctionSet gtf_junctions;
1404    if (!gtf_juncs.empty())
1405      {
1406 <      char splice_buf[2048];
1406 >      char splice_buf[4096];
1407        FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
1408        if (splice_coords)
1409          {
# Line 1320 | Line 1430
1430                uint32_t right_coord = atoi(scan_right_coord);
1431                bool antisense = *orientation == '-';
1432  
1433 <              // add 1 to left_coord to meet BED format
1324 <              gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord + 1, right_coord, antisense), JunctionStats()));
1433 >              gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
1434              }
1435          }
1436        fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
# Line 1345 | Line 1454
1454      }
1455  
1456    JunctionSet vjunctions[num_threads];
1457 +  InsertionSet vinsertions[num_threads];
1458 +  DeletionSet vdeletions[num_threads];
1459    FusionSet vfusions[num_threads];
1460 +  Coverage vcoverages[num_threads];
1461 +  
1462    vector<boost::thread*> threads;
1463    for (int i = 0; i < num_threads; ++i)
1464      {
# Line 1353 | Line 1466
1466  
1467        worker.left_map_fname = left_map_fname;
1468        worker.right_map_fname = right_map_fname;
1469 +      worker.rt = &rt;
1470 +      worker.gtf_junctions = &gtf_junctions;
1471 +      
1472        worker.junctions = &vjunctions[i];
1473 +      worker.insertions = &vinsertions[i];
1474 +      worker.deletions = &vdeletions[i];
1475        worker.fusions = &vfusions[i];
1476 <      worker.gtf_junctions = &gtf_junctions;
1359 <      worker.rt = &rt;
1476 >      worker.coverage = &vcoverages[i];
1477  
1478        worker.right_map_offset = 0;
1479        
# Line 1391 | Line 1508
1508      }
1509    threads.clear();
1510  
1511 <  JunctionSet junctions = vjunctions[0];
1512 <  FusionSet fusions = vfusions[0];
1511 >  JunctionSet& junctions = vjunctions[0];
1512 >  InsertionSet& insertions = vinsertions[0];
1513 >  DeletionSet& deletions = vdeletions[0];
1514 >  FusionSet& fusions = vfusions[0];
1515 >  Coverage& coverage = vcoverages[0];
1516    for (size_t i = 1; i < num_threads; ++i)
1517      {
1518        merge_with(junctions, vjunctions[i]);
1519        vjunctions[i].clear();
1520  
1521 +      merge_with(insertions, vinsertions[i]);
1522 +      vinsertions[i].clear();
1523 +      
1524 +      merge_with(deletions, vdeletions[i]);
1525 +      vdeletions[i].clear();
1526 +
1527        merge_with(fusions, vfusions[i]);
1528        vfusions[i].clear();
1529 +
1530 +      coverage.merge_with(vcoverages[i]);
1531 +      vcoverages[i].clear();
1532      }
1533  
1534 +  coverage.calculate_coverage();
1535 +
1536    size_t num_unfiltered_juncs = junctions.size();
1537    fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
1538  
1539    // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
1409
1540    filter_junctions(junctions, gtf_junctions);
1541    
1542    //size_t num_juncs_after_filter = junctions.size();
# Line 1414 | Line 1544
1544    //     num_unfiltered_juncs - num_juncs_after_filter);
1545  
1546    /*
1547 <        size_t small_overhangs = 0;
1548 <        for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
1547 >    size_t small_overhangs = 0;
1548 >    for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
1549      {
1550 <        if (i->second.accepted &&
1551 <            (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
1552 <            {
1553 <            small_overhangs++;
1554 <            }
1550 >    if (i->second.accepted &&
1551 >    (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
1552 >    {
1553 >    small_overhangs++;
1554 >    }
1555      }
1556      
1557 <        if (small_overhangs >0)
1558 <        fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1557 >    if (small_overhangs >0)
1558 >    fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1559    */
1560  
1561    JunctionSet vfinal_junctions[num_threads];
# Line 1437 | Line 1567
1567      {
1568        ReportWorker worker;
1569  
1570 +      worker.sam_header_fname = sam_header;
1571        char filename[1024] = {0};
1572        sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
1573        worker.bam_output_fname = filename;
1574 <      worker.sam_header_fname = sam_header;
1575 <      sprintf(filename, "%s/unmapped_left%d.fq", output_dir.c_str(), i);
1576 <      worker.left_um_fname = filename;
1574 >      string tmpoutdir=getFdir(worker.bam_output_fname);
1575 >      worker.left_um_fname = tmpoutdir;
1576 >      sprintf(filename, "unmapped_left_%d.bam", i);
1577 >      worker.left_um_fname+=filename;
1578        
1579        if (right_reads_fname != "")
1580          {
1581 <          sprintf(filename, "%s/unmapped_right%d.fq", output_dir.c_str(), i);
1582 <          worker.right_um_fname = filename;
1581 >          sprintf(filename, "unmapped_right_%d.bam", i);
1582 >          worker.right_um_fname = tmpoutdir;
1583 >          worker.right_um_fname += filename;
1584          }
1585        
1586        worker.left_reads_fname = left_reads_fname;
# Line 1457 | Line 1590
1590  
1591        worker.gtf_junctions = &gtf_junctions;
1592        worker.junctions = &junctions;
1593 +      worker.insertions = &insertions;
1594 +      worker.deletions = &deletions;
1595        worker.fusions = &fusions;
1596 +      worker.coverage = &coverage;
1597 +      
1598        worker.final_junctions = &vfinal_junctions[i];
1599        worker.final_insertions = &vfinal_insertions[i];
1600        worker.final_deletions = &vfinal_deletions[i];

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines