ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/closures.h
Revision: 245
Committed: Wed May 16 21:51:01 2012 UTC (12 years, 3 months ago) by gpertea
File size: 14200 byte(s)
Log Message:
merge-sync attempt

Line File contents
1 #ifndef CLOSURES_H
2 #define CLOSURES_H
3
4 /*
5 * closures.h
6 * TopHat
7 *
8 * Created by Cole Trapnell on 1/14/09.
9 * Copyright 2009 Cole Trapnell. All rights reserved.
10 *
11 */
12
13 #ifdef HAVE_CONFIG_H
14 #include <config.h>
15 #endif
16 #include <set>
17
18 #include "common.h"
19 #include "junctions.h"
20
21 struct SpliceMotif
22 {
23 SpliceMotif(uint32_t p, bool l) : pos(p), left_motif(l) {}
24 int pos : 31;
25 bool left_motif : 1;
26
27 bool operator<(const SpliceMotif& rhs) const
28 {
29 return this->pos < rhs.pos;
30 }
31
32 bool operator<=(const SpliceMotif& rhs) const
33 {
34 return this->pos <= rhs.pos;
35 }
36 };
37
38 // This simply finds all donor and acceptors in a given string,
39 // regardless of whether they are within or near an initially mapped
40 // region.
41 template<typename TStr>
42 class SimpleIntronFinder
43 {
44 vector<SpliceMotif> _hits;
45
46 public:
47 // This should take three TStr, and we should be using some search function
48 // of seqan to look up the hits. Don't ask.
49 SimpleIntronFinder(TStr& ref_str, const string& left_motif, const string& right_motif)
50 {
51 _hits.clear();
52
53 for (size_t i = 0; i < length(ref_str) - 1; ++i)
54 {
55 // Look at a slice of the reference without creating a copy.
56 seqan::Infix<RefSequenceTable::Sequence>::Type curr = seqan::infix(ref_str,i, i + 2);
57 if (curr == left_motif)
58 _hits.push_back(SpliceMotif(i, true));
59 else if (curr == right_motif)
60 _hits.push_back(SpliceMotif(i, false));
61 }
62 }
63
64 const vector<size_t>& hits() const { return _hits; }
65 };
66
67 uint32_t searched = 0;
68 uint32_t closed = 0;
69
70 typedef std::set<Junction, skip_count_lt > ClosureJunctionSet;
71
72 template<typename TStr, typename IntronFinder>
73 class JunctionFinder
74 {
75 public:
76 JunctionFinder(const IntronFinder& intron_finder,
77 uint32_t inner_dist_mean,
78 uint32_t inner_dist_std_dev,
79 uint32_t min_intron_length,
80 uint32_t min_exon_length,
81 uint32_t bowtie_overlap_padding = 0,
82 uint32_t max_gap = 0,
83 uint32_t max_juncs = 7500000 /*this is the max juncs from a single strand*/) :
84 _inner_dist_mean(inner_dist_mean),
85 _inner_dist_std_dev(inner_dist_std_dev),
86 _min_intron_len(min_intron_length),
87 _min_exon_len(min_exon_length),
88 _motif_hits(intron_finder.hits()),
89 _padding(bowtie_overlap_padding),
90 _max_gap(max_gap),
91 _max_juncs(max_juncs)
92 {
93 _left_memo.resize(_motif_hits.size());
94 _right_memo.resize(_motif_hits.size());
95 }
96
97 void possible_junctions(ClosureJunctionSet& potential_splices,
98 const BowtieHit& h1,
99 const BowtieHit& h2)
100 {
101 assert (h1.ref_id() == h2.ref_id());
102
103 int minor_hit_start, major_hit_start;
104 int minor_hit_end, major_hit_end;
105 if (h1.left() < h2.left())
106 {
107 minor_hit_start = (int)h1.left();
108 minor_hit_end = (int)h1.right();
109 major_hit_start = (int)h2.left();
110 major_hit_end = (int)h2.right();
111 }
112 else
113 {
114 minor_hit_start = (int)h2.left();
115 minor_hit_end = (int)h2.right();
116 major_hit_start = (int)h1.left();
117 major_hit_end = (int)h1.right();
118 }
119
120 int inner_dist = major_hit_start - minor_hit_end;
121
122 if (inner_dist > (int)max_closure_intron_length ||
123 inner_dist < (int)min_closure_intron_length)
124 return;
125 // Bowtie allows some mismatches, so we need to allow for the fact that
126 // the donor/acceptor is actually "inside" the alignment for one or both
127 // ends of the insert. That is, intron may be longer than the pair's
128 // inner distance in the genomic coordinate space. This variable
129 // padding number accounts for this fact in a silly way that could be
130 // improved.
131
132 int expected_inner_dist = _inner_dist_mean;
133
134 if (abs(inner_dist - expected_inner_dist) <= (int)_inner_dist_std_dev)
135 return;
136
137 ++searched;
138
139 if (search(potential_splices, h1.ref_id(), minor_hit_end, major_hit_start, expected_inner_dist))
140 ++closed;
141 }
142
143 private:
144
145 vector<SpliceMotif>::const_iterator _motif_upper_bound;
146 vector<SpliceMotif>::const_iterator _motif_lower_bound;
147
148 /* The following computation identifies possible splice sites.
149
150 |||||||||||||||-----------------------------------------||||||||||||||||
151 read_len inner_dist read_len
152 Where r is the length of a read and x is the *internal* distance between
153 mates in the *genomic* coordinate space. We first check to see whether
154 x is too long to be an insert from a single contiguous substring of the
155 genome. That is, if x is longer than you'd expect from a unspliced
156 molecule, than the insert likely spans a splice junction.
157
158 The expected value of x when the insert doesn't span a splice is the
159 library insert size mean (I) minus twice the read length, plus or minus
160 the library insert size standard deviation. If x is longer than that,
161 the insert probably spans a splice.
162
163 For an insert that spans a splice junction, that junction must fall between
164 the ends of the mates. Let the right side of the alignment of the left
165 read be called minor_hit_end, and the left side of the alignment of the right
166 read be major_hit_start. Then an insert's inner_dist = major_hit_start -
167 minor_hit_end. Let the expected inner distance for a insert that doesn't
168 cross a junction be Insert_mean - 2* read_len. Then a splice-crossing insert
169 must have inner_dist >= expected_inner_dist.
170
171 Let the actual splice position = (splice_left, splice_right). Then
172 (splice_left - minor_hit_end) + (major_hit_start - splice_right) =
173 expected_inner_dist +/- std_dev.
174 */
175
176 /*
177
178 The search for possible junctions works as follows. Given two genomic
179 coordinates, start, and ref_target, and an expected inner distance between
180 their projections in the transcriptome coordinate space, we can enumerate
181 paths through the genome that represent possible spliced transcript
182 segments and connect start and ref_target. That is, assuming start and
183 ref_target are both in exonic regions, we can find all possible
184 genomic subsequences formed by concatenating segments flanked by an "AG" on
185 the left and a "GT" on the right. The resulting concatentation is a
186 possible 'closure' if its length is approximately equal to the expected
187 inner distance between mate pairs for this library.
188
189 To actually perform the search, this class uses two recursive functions
190 that call each other in an alternating fashion. search_hop_left_motif takes
191 a genomic coordinate lstart and finds paths to ref_target that start with
192 a "GT" that occurs in (lstart, ref_target]. search_hop_right_motif takes a
193 genomic coordinate rstart and finds paths to ref_target that start with an
194 "AG" that occurs in (rstart, ref_target]. A valid path from start to
195 ref_target must have a "GT" on the left and an "AG" on the right, so valid
196 paths may only be found in a call to search_hop_right_motif.
197
198 */
199
200 bool search(ClosureJunctionSet& splices,
201 uint32_t ref_id,
202 uint32_t start,
203 uint32_t ref_target,
204 int expected_inner_dist)
205 {
206 uint32_t left_start_pos = max((int)start - (int)_padding, 0);
207 uint32_t right_end_pos = ref_target + _padding;
208
209 SpliceMotif left_start(left_start_pos, true);
210 SpliceMotif right_end(right_end_pos, false);
211
212
213 _motif_lower_bound = lower_bound(_motif_hits.begin(),
214 _motif_hits.end(),
215 left_start);
216
217 _motif_upper_bound = upper_bound(_motif_hits.begin(),
218 _motif_hits.end(),
219 right_end);
220
221 //fprintf(stderr, "Curr junctions = %d\n", splices.size());
222
223 /* TODO: consider some kind of long-term memo. Closures found at the
224 top level of recursion here can be saved and potentially reused in
225 future searches on overlapping intervals.
226 */
227 bool c = false;
228 if (search_hop_left_motif(splices,
229 ref_id,
230 _motif_lower_bound,
231 right_end_pos,
232 expected_inner_dist,
233 0,
234 true) != -1)
235 c = true;
236
237 /* Clear left short-term memo */
238 vector<vector<int> >::iterator _lm_begin = _left_memo.begin() +
239 (_motif_lower_bound - _motif_hits.begin());
240
241 vector<vector<int> >::iterator _lm_end = _left_memo.begin() +
242 (_motif_upper_bound - _motif_hits.begin()) + 1;
243
244 _lm_end = min(_lm_end, _left_memo.end());
245
246 /* Clear right short-term memo */
247 vector<vector<int> >::iterator _rm_begin = _right_memo.begin() +
248 (_motif_lower_bound - _motif_hits.begin());
249
250 vector<vector<int> >::iterator _rm_end = _right_memo.begin() +
251 (_motif_upper_bound - _motif_hits.begin()) + 1;
252
253 _rm_end = min(_rm_end, _right_memo.end());
254
255 if (_lm_begin < _lm_end)
256 fill(_lm_begin, _lm_end, vector<int>());
257
258 if (_rm_begin < _rm_end)
259 fill(_rm_begin, _rm_end, vector<int>());
260
261 while(splices.size() > _max_juncs)
262 {
263 splices.erase(*splices.rbegin());
264 }
265
266 return c;
267 }
268
269 inline int _abs(int x)
270 {
271 return (x > 0) ? x : -x;
272 }
273
274 inline int check_memo(const vector<vector<int> >& memo,
275 size_t motif_pos,
276 int curr_path_distance,
277 int target_path_distance,
278 int tolerance)
279 {
280 const vector<int>& distances = memo[motif_pos];
281 for (size_t i = 0; i < distances.size(); ++i)
282 {
283 int dist = distances[i];
284 if (_abs(curr_path_distance + dist - target_path_distance) < tolerance)
285 return dist;
286 }
287 return -1;
288 }
289
290 int search_hop_left_motif(ClosureJunctionSet& splices,
291 uint32_t ref_id,
292 vector<SpliceMotif>::const_iterator start,
293 int ref_target,
294 int expected_inner_dist,
295 uint32_t curr_path_length,
296 bool initial_hop = false)
297 {
298 int in_valid_path = -1;
299 // SpliceMotif left_bound(start + (initial_hop ? 0 : _min_exon_len), true);
300 // SpliceMotif right_bound(start + (expected_inner_dist - curr_path_length) + (2 * _padding) + _max_gap, true);
301
302 vector<SpliceMotif>::const_iterator lb_left = start;
303
304 while (lb_left <= _motif_upper_bound &&
305 lb_left < _motif_hits.end() &&
306 (lb_left->pos - start->pos < (initial_hop ? 0 :(int) _min_exon_len) || !lb_left->left_motif))
307 {
308 lb_left++;
309 }
310
311
312 for (vector<SpliceMotif>::const_iterator li = lb_left;
313 li != _motif_hits.end();
314 ++li)
315 {
316 if (!li->left_motif) //skip over the right motifs
317 continue;
318 //assert (*li >= SpliceMotif(start, true) || initial_hop);
319 int dist = (int)li->pos - (int)start->pos;
320 int next_path_len = curr_path_length + dist;
321
322 int tolerance = _inner_dist_std_dev + 2 * _padding + _max_gap;
323
324 if (next_path_len - expected_inner_dist > tolerance)
325 {
326 return in_valid_path;
327 }
328
329 int next = li->pos;
330 //assert(next + _min_intron_len <= ref_target);
331 if(next + (int)_min_intron_len <= ref_target)
332 {
333 int ret = -1;
334 ret = check_memo(_left_memo,
335 li - _motif_hits.begin(),
336 curr_path_length + dist,
337 expected_inner_dist,
338 tolerance);
339 if (ret != -1)
340 {
341 in_valid_path = ret + dist;
342 _left_memo[li - _motif_hits.begin()].push_back(in_valid_path);
343 }
344 else
345 {
346 ret = search_hop_right_motif(splices,
347 ref_id,
348 li,
349 ref_target,
350 expected_inner_dist,
351 next_path_len);
352 if (ret != -1)
353 {
354 in_valid_path = ret + dist;
355 _left_memo[li - _motif_hits.begin()].push_back(in_valid_path);
356 }
357 }
358 }
359
360 }
361
362 return in_valid_path;
363
364 }
365
366 int search_hop_right_motif(ClosureJunctionSet& splices,
367 uint32_t ref_id,
368 vector<SpliceMotif>::const_iterator start,
369 int ref_target,
370 int expected_inner_dist,
371 uint32_t curr_path_length)
372 {
373 int in_valid_path = -1;
374
375 // SpliceMotif left_start(start + _min_intron_len - 2, false);
376 // SpliceMotif right_start(ref_target, false);
377 if (ref_target < start->pos)
378 {
379 return in_valid_path;
380 }
381 vector<SpliceMotif>::const_iterator lb_right = start;
382 while (lb_right <= _motif_upper_bound &&
383 lb_right < _motif_hits.end() &&
384 (lb_right->pos - start->pos < (int)_min_intron_len - 2 || lb_right->left_motif))
385 {
386 lb_right++;
387 }
388
389 for (vector<SpliceMotif>::const_iterator ri = lb_right;
390 ri != _motif_hits.end();
391 ++ri)
392 {
393 if (ri->left_motif) //skip over the left motifs
394 continue;
395 if (ri->pos > ref_target)
396 {
397 return in_valid_path;
398 }
399 int next = ri->pos + 2;
400 int final_hop_dist = (int)ref_target - (int)next;
401 int final_path_len = curr_path_length + final_hop_dist;
402
403 int tolerance = _inner_dist_std_dev + 2 * _padding + _max_gap;
404
405 if (abs(expected_inner_dist - final_path_len) <= tolerance)
406 {
407 splices.insert(Junction(ref_id,
408 start->pos - 1,
409 next,
410 false,
411 start->pos - 1 - next));
412 in_valid_path = final_hop_dist;
413 _right_memo[ri - _motif_hits.begin()].push_back(final_hop_dist);
414 }
415 else
416 {
417 int ret = -1;
418 ret = check_memo(_right_memo,
419 ri - _motif_hits.begin(),
420 curr_path_length,
421 expected_inner_dist,
422 tolerance);
423 if (ret != -1)
424 {
425 splices.insert(Junction(ref_id,
426 start->pos - 1,
427 next,
428 false,
429 start->pos - 1 - next));
430
431 in_valid_path = ret;
432 _right_memo[ri - _motif_hits.begin()].push_back(ret);
433 }
434 else
435 {
436
437 ret = search_hop_left_motif(splices,
438 ref_id,
439 ri,
440 ref_target,
441 expected_inner_dist,
442 curr_path_length);
443 if (ret != -1)
444 {
445 splices.insert(Junction(ref_id,
446 start->pos - 1,
447 next,
448 false,
449 start->pos - 1 - next));
450
451 in_valid_path = ret;
452 _right_memo[ri - _motif_hits.begin()].push_back(in_valid_path);
453 }
454 }
455 }
456 }
457
458 return in_valid_path;
459 }
460
461 uint32_t _inner_dist_mean;
462 uint32_t _inner_dist_std_dev;
463 uint32_t _min_intron_len;
464 uint32_t _min_exon_len;
465
466 vector<SpliceMotif> _motif_hits;
467 uint32_t _padding;
468 uint32_t _max_gap;
469 uint32_t _max_juncs;
470 vector<vector<int> > _left_memo;
471 vector<vector<int> > _right_memo;
472 };
473
474 typedef std::set<pair<size_t, size_t> > CoordSet;
475 void check_mates(const HitList& hits1_in_ref,
476 const HitList& hits2_in_ref,
477 vector<pair<size_t, size_t> >& happy_mates,
478 vector<size_t>& map1_singletons,
479 vector<size_t>& map2_singletons);
480
481 #endif