ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/inserts.cpp
Revision: 233
Committed: Wed Apr 11 21:05:01 2012 UTC (12 years, 4 months ago) by gpertea
File size: 8435 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 29 /*
2     * inserts.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 1/14/09.
6     * Copyright 2009 Cole Trapnell. All rights reserved.
7     *
8     */
9     #ifdef HAVE_CONFIG_H
10     #include <config.h>
11     #endif
12    
13 gpertea 154 #include <iostream>
14 gpertea 29 #include <cassert>
15     #include <map>
16     #include <set>
17     #include "bwt_map.h"
18     #include "common.h"
19     #include "inserts.h"
20 gpertea 154 using namespace std;
21 gpertea 29
22     bool InsertAlignmentGrade::operator<(const InsertAlignmentGrade& rhs)
23     {
24 gpertea 154 if (fusion && !rhs.fusion)
25     return true;
26    
27     if (!fusion && rhs.fusion)
28     return false;
29    
30     // We always prefer a insert alignment with both ends mapped than a
31     // singleton
32     if (num_mapped != rhs.num_mapped)
33     {
34     return num_mapped < rhs.num_mapped;
35     }
36 gpertea 200 else if (num_mapped == 2)
37 gpertea 154 {
38 gpertea 200 // daehwan - I'm testing this!
39     #if 1
40     if (alignment_score != rhs.alignment_score)
41     return alignment_score < rhs.alignment_score;
42    
43     return false;
44     #else
45 gpertea 154 // if significant difference in their inner mate distances
46     if (abs(rhs.inner_dist - inner_dist) >= 30)
47 gpertea 29 {
48 gpertea 154 // Prefer a pair that is too close or perfect to one that is too far
49     if (too_far && !rhs.too_far)
50     return true;
51    
52     // Prefer a pair that is perfect to one that is too close
53     if (too_close && !(rhs.too_close || rhs.too_far))
54     return true;
55    
56     // Prefer closer mates
57     if (rhs.inner_dist < inner_dist)
58     return true;
59 gpertea 29 }
60 gpertea 135
61 gpertea 154 if (edit_dist != rhs.edit_dist)
62     return rhs.edit_dist < edit_dist;
63 gpertea 135
64 gpertea 154 // daehwan - do somethings here
65     if (!bowtie2)
66     {
67     // Prefer shorter introns
68     if (longest_ref_skip != rhs.longest_ref_skip)
69     return rhs.longest_ref_skip < longest_ref_skip;
70 gpertea 29 }
71 gpertea 200 #endif
72 gpertea 154
73     return false;
74     }
75     else
76     {
77     // We prefer a singleton mapping to an insert with neither end mapped
78     if (num_mapped != rhs.num_mapped)
79 gpertea 29 {
80 gpertea 154 return num_mapped < rhs.num_mapped; // if RHS has MORE READS, RHS is BETTER (lhs < rhs)
81 gpertea 29 }
82 gpertea 154 else
83     {
84     if (rhs.num_spliced != num_spliced)
85     return rhs.num_spliced < num_spliced;// if RHS is LESS SPLICED, RHS is BETTER (lhs < rhs)
86    
87     // Prefer shorter introns
88     if (longest_ref_skip != rhs.longest_ref_skip)
89     return rhs.longest_ref_skip < longest_ref_skip; // if RHS intron is SHORTER, RHS is BETTER (lhs < rhs)
90    
91     return rhs.edit_dist < edit_dist; // if RHS edit is LOWER, RHS is BETTER (lhs < rhs)
92     }
93     }
94     return false;
95 gpertea 29 }
96    
97     bool gap_lt(const pair<int, int>& lhs, const pair<int, int>& rhs)
98     {
99 gpertea 154 return abs(lhs.second - lhs.first) < abs(rhs.second - rhs.first);
100 gpertea 29 }
101    
102 gpertea 154 pair<int, int> pair_distances(const BowtieHit& h1, const BowtieHit& h2)
103 gpertea 29 {
104 gpertea 154 int minor_hit_start, major_hit_start;
105     int minor_hit_end, major_hit_end;
106     if (h1.left() < h2.left())
107     {
108     minor_hit_start = (int)h1.left();
109     minor_hit_end = (int)h1.right();
110     major_hit_start = (int)h2.left();
111     major_hit_end = (int)h2.right();
112     }
113     else
114     {
115     minor_hit_start = (int)h2.left();
116     minor_hit_end = (int)h2.right();
117     major_hit_start = (int)h1.left();
118     major_hit_end = (int)h1.right();
119     }
120    
121     int inner_dist = major_hit_start - minor_hit_end;
122     int outer_dist = major_hit_end - minor_hit_start;
123     return make_pair(outer_dist, inner_dist);
124 gpertea 29 }
125    
126     void best_insert_mappings(uint64_t refid,
127 gpertea 154 ReadTable& it,
128     HitList& hits1_in_ref,
129     HitList& hits2_in_ref,
130     BestInsertAlignmentTable& best_status_for_inserts,
131     bool prefer_shorter_pairs)
132 gpertea 29 {
133 gpertea 154 long chucked_for_shorter_pair = 0;
134     std::set<size_t> marked;
135     HitList::iterator last_good = hits2_in_ref.begin();
136 gpertea 29
137 gpertea 154 for (size_t i = 0; i < hits1_in_ref.size(); ++i)
138     {
139     BowtieHit& h1 = hits1_in_ref[i];
140     pair<HitList::iterator, HitList::iterator> range_pair;
141     range_pair = equal_range(last_good, hits2_in_ref.end(),
142     h1, hit_insert_id_lt);
143     bool found_hit = false;
144     if (range_pair.first != range_pair.second)
145     last_good = range_pair.first;
146    
147     uint32_t obs_order = it.observation_order(h1.insert_id());
148    
149     for (HitList::iterator f = range_pair.first;
150     f != range_pair.second;
151     ++f)
152 gpertea 29 {
153 gpertea 154 BowtieHit& h2 = *f;
154    
155     if (h1.insert_id() == h2.insert_id())
156     {
157 gpertea 233 InsertAlignmentGrade s(h1, h2);
158 gpertea 154
159     pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best
160     = best_status_for_inserts[obs_order];
161     InsertAlignmentGrade& current = insert_best.first;
162     // Is the new status better than the current best one?
163     if (current < s)
164 gpertea 29 {
165 gpertea 154 insert_best.second.clear();
166     current = s;
167     insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
168     }
169     else if (!(s < current))
170     {
171     if (prefer_shorter_pairs && current.num_mapped == 2)
172     {
173     pair<int, int> dc = pair_distances(*(insert_best.second[0].left_alignment), *(insert_best.second[0].right_alignment));
174     pair<int, int> ds = pair_distances(h1,h2);
175     if (ds.second < dc.second)
176 gpertea 29 {
177 gpertea 154 chucked_for_shorter_pair += insert_best.second.size();
178     insert_best.second.clear();
179     current = s;
180     insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
181 gpertea 29 }
182 gpertea 154 }
183     else
184     {
185     insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
186     }
187 gpertea 29 }
188 gpertea 154
189     marked.insert(f - hits2_in_ref.begin());
190     found_hit = true;
191     }
192    
193 gpertea 29 }
194 gpertea 154 if (!found_hit)
195 gpertea 29 {
196 gpertea 154 pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best
197     = best_status_for_inserts[obs_order];
198     InsertAlignmentGrade& current = insert_best.first;
199    
200     InsertAlignmentGrade s(h1);
201    
202     if (current < s)
203     {
204     insert_best.second.clear();
205     current = s;
206     insert_best.second.push_back(InsertAlignment(refid, &h1, NULL));
207     }
208     else if (! (s < current))
209     {
210     insert_best.second.push_back(InsertAlignment(refid, &h1, NULL));
211     }
212    
213     }
214     }
215    
216     for (size_t i = 0; i < hits2_in_ref.size(); ++i)
217     {
218     BowtieHit& h2 = hits2_in_ref[i];
219     uint32_t obs_order = it.observation_order(h2.insert_id());
220     pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best
221     = best_status_for_inserts[obs_order];
222     InsertAlignmentGrade& current = insert_best.first;
223    
224     InsertAlignmentGrade s(h2);
225     // Did we include h2 as part of a pairing already, or is this first time
226     // we've seen it? If so, it's a singleton.
227     if (marked.find(i) == marked.end())
228     {
229     if (current < s)
230     {
231     insert_best.second.clear();
232     current = s;
233     insert_best.second.push_back(InsertAlignment(refid, NULL, &h2));
234     }
235     else if (! (s < current))
236     {
237     insert_best.second.push_back(InsertAlignment(refid, NULL, &h2));
238     }
239     }
240     }
241     fprintf(stderr, "Chucked %ld pairs for shorter pairing of same mates\n", chucked_for_shorter_pair);
242 gpertea 29 }
243    
244     int long_spliced = 0;
245     int short_spliced = 0;
246     int singleton_splices = 0;
247    
248     bool valid_insert_alignment(const InsertAlignmentGrade& g, const InsertAlignment& a)
249     {
250 gpertea 154 if (!a.left_alignment || !a.right_alignment)
251     return false;
252     if (g.num_mapped == 2)
253     {
254     // Take all the contiguously mapped pairs
255     if (g.num_spliced == 0)
256     return true;
257    
258     // Take the pairs that include one or more spliced reads as long as
259     // the inner dist isn't too big
260     // if (g.one_spliced || g.both_spliced)
261     // {
262     // if (g.too_far || g.too_close)
263     // return false;
264     // }
265     return true;
266     }
267     return false;
268 gpertea 29 }
269    
270    
271     void insert_best_pairings(RefSequenceTable& rt,
272 gpertea 154 ReadTable& it,
273 gpertea 29 HitTable& hits1,
274     HitTable& hits2,
275     BestInsertAlignmentTable& best_pairings,
276 gpertea 154 bool prefer_shorter_pairs)
277 gpertea 29 {
278 gpertea 154 for(RefSequenceTable::const_iterator ci = rt.begin();
279     ci != rt.end();
280     ++ci)
281 gpertea 29 {
282 gpertea 154
283     // Tracks the number of singleton ALIGNMENTS, not the number of singleton
284     // READS in each Bowtie map.
285     vector<size_t> map1_singletons;
286     vector<size_t> map2_singletons;
287     vector<pair<size_t, size_t> > happy_mates;
288    
289     uint64_t ref_id = ci->second.observation_order;
290     HitList* hits1_in_ref = hits1.get_hits(ref_id);
291     HitList* hits2_in_ref = hits2.get_hits(ref_id);
292    
293     if (!hits1_in_ref || !hits2_in_ref)
294     continue;
295    
296     //if (verbose)
297     // fprintf(stderr, "Looking for best insert mappings in %s\n", name.c_str());
298    
299     best_insert_mappings(ref_id,
300     it,
301     *hits1_in_ref,
302     *hits2_in_ref,
303     best_pairings,
304     prefer_shorter_pairs);
305 gpertea 29 }
306     }