ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/junctions.cpp
Revision: 200
Committed: Fri Mar 9 22:25:25 2012 UTC (12 years, 5 months ago) by gpertea
File size: 10194 byte(s)
Log Message:
Line File contents
1 /*
2 * junctions.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 12/12/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include "common.h"
16 #include "junctions.h"
17 #include "bwt_map.h"
18
19 void junctions_from_spliced_hit(const BowtieHit& h,
20 vector<pair<Junction, JunctionStats> >& new_juncs)
21 {
22 const vector<CigarOp>& cigar = h.cigar();
23 int j = h.left();
24
25 bool bSawFusion = false;
26 for (size_t c = 0 ; c < cigar.size(); ++c)
27 {
28 Junction junc;
29 JunctionStats stats;
30
31 int opcode = cigar[c].opcode;
32 int length = cigar[c].length;
33
34 switch(opcode)
35 {
36 case REF_SKIP:
37 case rEF_SKIP:
38 if (bSawFusion)
39 junc.refid = h.ref_id2();
40 else
41 junc.refid = h.ref_id();
42
43 // daehwan - we need to consider indels very next to REF_SKIP,
44 // which is possible due to Bowtie2
45 assert (c > 0 && c < cigar.size() - 1);
46 assert (cigar[c - 1].length);
47 assert (cigar[c + 1].length);
48
49 if (opcode == REF_SKIP)
50 {
51 junc.left = j - 1;
52 junc.right = j + length;
53 stats.left_extent = cigar[c - 1].length;
54 stats.right_extent = cigar[c + 1].length;
55 j += length;
56 }
57 else
58 {
59 junc.right = j + 1;
60 junc.left = j - length;
61 stats.right_extent = cigar[c - 1].length;
62 stats.left_extent = cigar[c + 1].length;
63 j -= length;
64 }
65
66 junc.antisense = h.antisense_splice();
67
68 /*
69 * Note that in valid_hit() in tophat_report.cpp
70 * we have tried to ensure that the REF_SKIP operator
71 * will only be surrounded by match operators
72 */
73
74 stats.min_splice_mms = h.splice_mms();
75 stats.supporting_hits++;
76 new_juncs.push_back(make_pair(junc, stats));
77 break;
78 case MATCH:
79 case DEL:
80 j += cigar[c].length;
81 break;
82 case mATCH:
83 case dEL:
84 j -= cigar[c].length;
85 break;
86 case FUSION_FF:
87 case FUSION_FR:
88 case FUSION_RF:
89 j = cigar[c].length;
90 bSawFusion = true;
91 break;
92 default:
93 break;
94 }
95 }
96 }
97
98 void print_junction(FILE* junctions_out,
99 const char* name,
100 const Junction& j,
101 const JunctionStats& s,
102 uint64_t junc_id)
103 {
104 int left_plus_one = j.left + 1;
105 fprintf(junctions_out,
106 "%s\t%d\t%d\tJUNC%08d\t%d\t%c\t%d\t%d\t255,0,0\t2\t%d,%d\t0,%d\n",
107 name,
108 left_plus_one - s.left_extent,
109 j.right + s.right_extent,
110 (int)junc_id,
111 s.supporting_hits,
112 j.antisense ? '-' : '+',
113 left_plus_one - s.left_extent,
114 j.right + s.right_extent,
115 s.left_extent,
116 s.right_extent,
117 j.right - (left_plus_one - s.left_extent));
118 }
119
120 void junctions_from_alignment(const BowtieHit& spliced_alignment,
121 JunctionSet& junctions)
122 {
123 vector<pair<Junction, JunctionStats> > juncs;
124 junctions_from_spliced_hit(spliced_alignment, juncs);
125
126 for (size_t i = 0; i < juncs.size(); ++i)
127 {
128 pair<Junction, JunctionStats>& junc = juncs[i];
129 JunctionSet::iterator itr = junctions.find(junc.first);
130
131 if (itr != junctions.end())
132 {
133 JunctionStats& j = itr->second;
134 j.merge_with(junc.second);
135 }
136 else
137 {
138 assert(junc.first.refid != VMAXINT32);
139 junctions[junc.first] = junc.second;
140 }
141 }
142 }
143
144 #if !NDEBUG
145 void validate_junctions(const JunctionSet& junctions)
146 {
147 uint32_t invalid_juncs = 0;
148 for (JunctionSet::const_iterator i = junctions.begin();
149 i != junctions.end();
150 ++i)
151 {
152 if (!i->first.valid())
153 invalid_juncs++;
154 }
155 fprintf(stderr, "Found %d invalid junctions\n", invalid_juncs);
156 }
157 #endif
158
159 int rejected = 0;
160 int rejected_spliced = 0;
161 int total_spliced = 0;
162 int total = 0;
163
164 /*
165 void junctions_from_alignments(HitTable& hits,
166 JunctionSet& junctions)
167 {
168 for (HitTable::iterator ci = hits.begin();
169 ci != hits.end();
170 ++ci)
171 {
172 HitList& rh = ci->second;
173 if (rh.size() == 0)
174 continue;
175 for (size_t i = 0; i < rh.size(); ++i)
176 {
177 BowtieHit& bh = rh[i];
178 AlignStatus s = status(&bh);
179 total++;
180 if (s == SPLICED)
181 total_spliced++;
182
183 if (s == SPLICED)
184 {
185 junctions_from_alignment(bh, junctions);
186 }
187 }
188 }
189 }
190 */
191
192 bool accept_if_valid(const Junction& j, JunctionStats& s)
193 {
194 if (min(s.left_extent, s.right_extent) < min_anchor_len)
195 {
196 s.accepted = false;
197 return false;
198 }
199
200 if (s.min_splice_mms > max_splice_mismatches)
201 {
202 s.accepted = false;
203 return false;
204 }
205
206 // uint32_t junc_doc = 0;
207 // uint8_t extent = 0;
208 // if (s.left_exon_doc > s.right_exon_doc)
209 // {
210 // junc_doc = s.left_exon_doc;
211 // extent = s.left_extent;
212 // }
213 // else
214 // {
215 // junc_doc = s.right_exon_doc;
216 // extent = s.right_extent;
217 // }
218 //
219 // double avg_junc_doc = junc_doc / (double)(extent);
220
221 //if (avg_junc_doc / (float) s.num_reads > 100.0)
222 // if (s.supporting_hits / avg_junc_doc < min_isoform_fraction)
223 // {
224 // s.accepted = false;
225 // }
226 // else
227 {
228 //fprintf (stderr, "Junction size = %d\n, support = %d", (int)j.right - (int)j.left, (int)s.supporting_hits.size() );
229 if ((int)j.right - (int)j.left > 50000)
230 {
231 s.accepted = (s.supporting_hits >= 2 &&
232 min(s.left_extent, s.right_extent) > 12);
233 }
234 else
235 {
236 s.accepted = true;
237 }
238 }
239 return s.accepted;
240 }
241
242 void knockout_shadow_junctions(JunctionSet& junctions)
243 {
244 vector<uint32_t> ref_ids;
245
246 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
247 {
248 ref_ids.push_back(i->first.refid);
249 }
250
251 sort(ref_ids.begin(), ref_ids.end());
252 vector<uint32_t>::iterator new_end = unique(ref_ids.begin(), ref_ids.end());
253 ref_ids.erase(new_end, ref_ids.end());
254
255 for(size_t i = 0; i < ref_ids.size(); ++i)
256 {
257 uint32_t refid = ref_ids[i];
258
259 Junction dummy_left(refid, 0, 0, true);
260 Junction dummy_right(refid, VMAXINT32, VMAXINT32, true);
261
262 pair<JunctionSet::iterator, JunctionSet::iterator> r;
263 r.first = junctions.lower_bound(dummy_left);
264 r.second = junctions.upper_bound(dummy_right);
265
266 JunctionSet::iterator itr = r.first;
267
268 while(itr != r.second && itr != junctions.end())
269 {
270 if (itr->second.accepted && !itr->second.gtf_match)
271 {
272 Junction fuzzy_left = itr->first;
273 Junction fuzzy_right = itr->first;
274 fuzzy_left.left -= min_anchor_len;
275 fuzzy_right.right += min_anchor_len;
276 fuzzy_left.antisense = !itr->first.antisense;
277 fuzzy_right.antisense = !itr->first.antisense;
278
279 pair<JunctionSet::iterator, JunctionSet::iterator> s;
280 s.first = junctions.lower_bound(fuzzy_left);
281 s.second = junctions.upper_bound(fuzzy_right);
282 JunctionSet::iterator itr2 = s.first;
283
284 int junc_support = itr->second.supporting_hits;
285
286 while(itr2 != s.second && itr2 != junctions.end())
287 {
288 int left_diff = itr->first.left - itr2->first.left;
289 int right_diff = itr->first.right - itr2->first.right;
290 if (itr != itr2 &&
291 itr->first.antisense != itr2->first.antisense &&
292 (left_diff < min_anchor_len || right_diff < min_anchor_len))
293 {
294 if (junc_support < itr2->second.supporting_hits)
295 itr->second.accepted = false;
296 }
297 ++itr2;
298 }
299 }
300 ++itr;
301 }
302 }
303 }
304
305 void filter_junctions(JunctionSet& junctions, const JunctionSet& gtf_junctions)
306 {
307 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
308 {
309 if (gtf_junctions.find(i->first) == gtf_junctions.end())
310 accept_if_valid(i->first, i->second);
311 else {//automatically accept junctions matching GTF
312 i->second.accepted = true;
313 i->second.gtf_match = true;
314 }
315 }
316
317 knockout_shadow_junctions(junctions);
318 }
319
320 void accept_all_junctions(JunctionSet& junctions,
321 const uint32_t refid)
322 {
323 fprintf(stderr, "Accepting all junctions\n");
324 for (JunctionSet::iterator itr = junctions.begin(); itr != junctions.end(); ++itr)
325 {
326 itr->second.accepted = true;
327 }
328 }
329
330
331 void print_junctions(FILE* junctions_out,
332 const JunctionSet& junctions,
333 RefSequenceTable& ref_sequences)
334 {
335 uint64_t junc_id = 1;
336 fprintf(junctions_out, "track name=junctions description=\"TopHat junctions\"\n");
337 for (JunctionSet::const_iterator i = junctions.begin();
338 i != junctions.end();
339 ++i)
340 {
341 const pair<Junction, JunctionStats>& j_itr = *i;
342 const Junction& j = j_itr.first;
343 const JunctionStats& s = j_itr.second;
344
345 assert(ref_sequences.get_name(j.refid));
346 //fprintf(stdout,"%d\t%d\t%d\t%c\n", j.refid, j.left, j.right, j.antisense ? '-' : '+');
347 print_junction(junctions_out,
348 ref_sequences.get_name(j.refid),
349 j,
350 s,
351 junc_id++);
352 }
353 //fprintf(stderr, "Rejected %d / %d alignments, %d / %d spliced\n", rejected, total, rejected_spliced, total_spliced);
354 }
355
356 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
357 // resets the stream when finished.
358 void get_junctions_from_hits(HitStream& hit_stream,
359 ReadTable& it,
360 JunctionSet& junctions)
361 {
362 HitsForRead curr_hit_group;
363 hit_stream.next_read_hits(curr_hit_group);
364
365 uint32_t curr_obs_order = it.observation_order(curr_hit_group.insert_id);
366
367 while(curr_obs_order != VMAXINT32)
368 {
369 for (size_t i = 0; i < curr_hit_group.hits.size(); ++i)
370 {
371 BowtieHit& bh = curr_hit_group.hits[i];
372 if (!bh.contiguous())
373 {
374 junctions_from_alignment(bh, junctions);
375 }
376 hit_stream.next_read_hits(curr_hit_group);
377 curr_obs_order = it.observation_order(curr_hit_group.insert_id);
378 }
379 }
380
381 hit_stream.reset();
382 }
383
384 void merge_with(JunctionSet& juncs, const JunctionSet& other_juncs)
385 {
386 for (JunctionSet::const_iterator junc = other_juncs.begin(); junc != other_juncs.end(); ++junc)
387 {
388 JunctionSet::iterator itr = juncs.find(junc->first);
389 if (itr != juncs.end())
390 {
391 JunctionStats& curr = itr->second;
392 curr.merge_with(junc->second);
393 }
394 else
395 {
396 juncs[junc->first] = junc->second;
397 }
398 }
399 }