ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/junctions.cpp
Revision: 200
Committed: Fri Mar 9 22:25:25 2012 UTC (12 years, 5 months ago) by gpertea
File size: 10194 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 29 /*
2     * junctions.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 12/12/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include "common.h"
16     #include "junctions.h"
17 gpertea 200 #include "bwt_map.h"
18 gpertea 29
19     void junctions_from_spliced_hit(const BowtieHit& h,
20     vector<pair<Junction, JunctionStats> >& new_juncs)
21     {
22     const vector<CigarOp>& cigar = h.cigar();
23     int j = h.left();
24 gpertea 154
25     bool bSawFusion = false;
26 gpertea 29 for (size_t c = 0 ; c < cigar.size(); ++c)
27     {
28     Junction junc;
29     JunctionStats stats;
30 gpertea 200
31     int opcode = cigar[c].opcode;
32     int length = cigar[c].length;
33    
34     switch(opcode)
35 gpertea 29 {
36     case REF_SKIP:
37 gpertea 154 case rEF_SKIP:
38     if (bSawFusion)
39     junc.refid = h.ref_id2();
40     else
41     junc.refid = h.ref_id();
42    
43 gpertea 200 // daehwan - we need to consider indels very next to REF_SKIP,
44     // which is possible due to Bowtie2
45     assert (c > 0 && c < cigar.size() - 1);
46     assert (cigar[c - 1].length);
47     assert (cigar[c + 1].length);
48    
49     if (opcode == REF_SKIP)
50 gpertea 154 {
51 gpertea 200 junc.left = j - 1;
52     junc.right = j + length;
53     stats.left_extent = cigar[c - 1].length;
54     stats.right_extent = cigar[c + 1].length;
55     j += length;
56 gpertea 154 }
57     else
58     {
59     junc.right = j + 1;
60 gpertea 200 junc.left = j - length;
61     stats.right_extent = cigar[c - 1].length;
62     stats.left_extent = cigar[c + 1].length;
63     j -= length;
64 gpertea 154 }
65 gpertea 29
66     junc.antisense = h.antisense_splice();
67 gpertea 154
68 gpertea 29 /*
69     * Note that in valid_hit() in tophat_report.cpp
70     * we have tried to ensure that the REF_SKIP operator
71     * will only be surrounded by match operators
72     */
73 gpertea 200
74 gpertea 29 stats.min_splice_mms = h.splice_mms();
75     stats.supporting_hits++;
76     new_juncs.push_back(make_pair(junc, stats));
77     break;
78     case MATCH:
79     case DEL:
80     j += cigar[c].length;
81     break;
82 gpertea 154 case mATCH:
83     case dEL:
84     j -= cigar[c].length;
85     break;
86     case FUSION_FF:
87     case FUSION_FR:
88     case FUSION_RF:
89     j = cigar[c].length;
90     bSawFusion = true;
91     break;
92 gpertea 29 default:
93     break;
94     }
95     }
96     }
97    
98     void print_junction(FILE* junctions_out,
99     const char* name,
100     const Junction& j,
101     const JunctionStats& s,
102     uint64_t junc_id)
103     {
104 gpertea 200 int left_plus_one = j.left + 1;
105 gpertea 29 fprintf(junctions_out,
106     "%s\t%d\t%d\tJUNC%08d\t%d\t%c\t%d\t%d\t255,0,0\t2\t%d,%d\t0,%d\n",
107     name,
108 gpertea 200 left_plus_one - s.left_extent,
109 gpertea 29 j.right + s.right_extent,
110     (int)junc_id,
111     s.supporting_hits,
112     j.antisense ? '-' : '+',
113 gpertea 200 left_plus_one - s.left_extent,
114 gpertea 29 j.right + s.right_extent,
115     s.left_extent,
116     s.right_extent,
117 gpertea 200 j.right - (left_plus_one - s.left_extent));
118 gpertea 29 }
119    
120     void junctions_from_alignment(const BowtieHit& spliced_alignment,
121     JunctionSet& junctions)
122     {
123     vector<pair<Junction, JunctionStats> > juncs;
124     junctions_from_spliced_hit(spliced_alignment, juncs);
125 gpertea 154
126 gpertea 29 for (size_t i = 0; i < juncs.size(); ++i)
127     {
128     pair<Junction, JunctionStats>& junc = juncs[i];
129     JunctionSet::iterator itr = junctions.find(junc.first);
130    
131     if (itr != junctions.end())
132     {
133     JunctionStats& j = itr->second;
134 gpertea 154 j.merge_with(junc.second);
135 gpertea 29 }
136     else
137     {
138 gpertea 135 assert(junc.first.refid != VMAXINT32);
139 gpertea 29 junctions[junc.first] = junc.second;
140     }
141     }
142     }
143    
144     #if !NDEBUG
145     void validate_junctions(const JunctionSet& junctions)
146     {
147     uint32_t invalid_juncs = 0;
148     for (JunctionSet::const_iterator i = junctions.begin();
149     i != junctions.end();
150     ++i)
151     {
152     if (!i->first.valid())
153     invalid_juncs++;
154     }
155     fprintf(stderr, "Found %d invalid junctions\n", invalid_juncs);
156     }
157     #endif
158    
159     int rejected = 0;
160     int rejected_spliced = 0;
161     int total_spliced = 0;
162     int total = 0;
163    
164     /*
165     void junctions_from_alignments(HitTable& hits,
166     JunctionSet& junctions)
167     {
168     for (HitTable::iterator ci = hits.begin();
169     ci != hits.end();
170     ++ci)
171     {
172     HitList& rh = ci->second;
173     if (rh.size() == 0)
174     continue;
175     for (size_t i = 0; i < rh.size(); ++i)
176     {
177     BowtieHit& bh = rh[i];
178     AlignStatus s = status(&bh);
179     total++;
180     if (s == SPLICED)
181     total_spliced++;
182    
183     if (s == SPLICED)
184     {
185     junctions_from_alignment(bh, junctions);
186     }
187     }
188     }
189     }
190     */
191    
192     bool accept_if_valid(const Junction& j, JunctionStats& s)
193     {
194     if (min(s.left_extent, s.right_extent) < min_anchor_len)
195     {
196     s.accepted = false;
197     return false;
198     }
199    
200     if (s.min_splice_mms > max_splice_mismatches)
201     {
202     s.accepted = false;
203     return false;
204     }
205    
206     // uint32_t junc_doc = 0;
207     // uint8_t extent = 0;
208     // if (s.left_exon_doc > s.right_exon_doc)
209     // {
210     // junc_doc = s.left_exon_doc;
211     // extent = s.left_extent;
212     // }
213     // else
214     // {
215     // junc_doc = s.right_exon_doc;
216     // extent = s.right_extent;
217     // }
218     //
219     // double avg_junc_doc = junc_doc / (double)(extent);
220    
221     //if (avg_junc_doc / (float) s.num_reads > 100.0)
222     // if (s.supporting_hits / avg_junc_doc < min_isoform_fraction)
223     // {
224     // s.accepted = false;
225     // }
226     // else
227     {
228     //fprintf (stderr, "Junction size = %d\n, support = %d", (int)j.right - (int)j.left, (int)s.supporting_hits.size() );
229     if ((int)j.right - (int)j.left > 50000)
230     {
231     s.accepted = (s.supporting_hits >= 2 &&
232     min(s.left_extent, s.right_extent) > 12);
233     }
234     else
235     {
236     s.accepted = true;
237     }
238     }
239     return s.accepted;
240     }
241    
242     void knockout_shadow_junctions(JunctionSet& junctions)
243     {
244     vector<uint32_t> ref_ids;
245    
246     for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
247     {
248     ref_ids.push_back(i->first.refid);
249     }
250    
251     sort(ref_ids.begin(), ref_ids.end());
252     vector<uint32_t>::iterator new_end = unique(ref_ids.begin(), ref_ids.end());
253     ref_ids.erase(new_end, ref_ids.end());
254    
255     for(size_t i = 0; i < ref_ids.size(); ++i)
256     {
257     uint32_t refid = ref_ids[i];
258    
259     Junction dummy_left(refid, 0, 0, true);
260 gpertea 135 Junction dummy_right(refid, VMAXINT32, VMAXINT32, true);
261 gpertea 29
262     pair<JunctionSet::iterator, JunctionSet::iterator> r;
263     r.first = junctions.lower_bound(dummy_left);
264     r.second = junctions.upper_bound(dummy_right);
265    
266     JunctionSet::iterator itr = r.first;
267    
268     while(itr != r.second && itr != junctions.end())
269 gpertea 135 {
270     if (itr->second.accepted && !itr->second.gtf_match)
271     {
272     Junction fuzzy_left = itr->first;
273     Junction fuzzy_right = itr->first;
274     fuzzy_left.left -= min_anchor_len;
275     fuzzy_right.right += min_anchor_len;
276     fuzzy_left.antisense = !itr->first.antisense;
277     fuzzy_right.antisense = !itr->first.antisense;
278    
279     pair<JunctionSet::iterator, JunctionSet::iterator> s;
280     s.first = junctions.lower_bound(fuzzy_left);
281     s.second = junctions.upper_bound(fuzzy_right);
282     JunctionSet::iterator itr2 = s.first;
283    
284     int junc_support = itr->second.supporting_hits;
285    
286     while(itr2 != s.second && itr2 != junctions.end())
287     {
288     int left_diff = itr->first.left - itr2->first.left;
289     int right_diff = itr->first.right - itr2->first.right;
290     if (itr != itr2 &&
291     itr->first.antisense != itr2->first.antisense &&
292     (left_diff < min_anchor_len || right_diff < min_anchor_len))
293     {
294     if (junc_support < itr2->second.supporting_hits)
295     itr->second.accepted = false;
296     }
297     ++itr2;
298     }
299     }
300     ++itr;
301     }
302 gpertea 29 }
303     }
304    
305     void filter_junctions(JunctionSet& junctions, const JunctionSet& gtf_junctions)
306     {
307     for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
308     {
309     if (gtf_junctions.find(i->first) == gtf_junctions.end())
310 gpertea 135 accept_if_valid(i->first, i->second);
311     else {//automatically accept junctions matching GTF
312     i->second.accepted = true;
313     i->second.gtf_match = true;
314     }
315 gpertea 29 }
316    
317     knockout_shadow_junctions(junctions);
318     }
319    
320     void accept_all_junctions(JunctionSet& junctions,
321     const uint32_t refid)
322     {
323     fprintf(stderr, "Accepting all junctions\n");
324     for (JunctionSet::iterator itr = junctions.begin(); itr != junctions.end(); ++itr)
325     {
326     itr->second.accepted = true;
327     }
328     }
329    
330    
331     void print_junctions(FILE* junctions_out,
332     const JunctionSet& junctions,
333     RefSequenceTable& ref_sequences)
334     {
335     uint64_t junc_id = 1;
336     fprintf(junctions_out, "track name=junctions description=\"TopHat junctions\"\n");
337     for (JunctionSet::const_iterator i = junctions.begin();
338     i != junctions.end();
339     ++i)
340     {
341     const pair<Junction, JunctionStats>& j_itr = *i;
342     const Junction& j = j_itr.first;
343     const JunctionStats& s = j_itr.second;
344    
345     assert(ref_sequences.get_name(j.refid));
346     //fprintf(stdout,"%d\t%d\t%d\t%c\n", j.refid, j.left, j.right, j.antisense ? '-' : '+');
347     print_junction(junctions_out,
348     ref_sequences.get_name(j.refid),
349     j,
350     s,
351     junc_id++);
352     }
353     //fprintf(stderr, "Rejected %d / %d alignments, %d / %d spliced\n", rejected, total, rejected_spliced, total_spliced);
354     }
355    
356     // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
357     // resets the stream when finished.
358     void get_junctions_from_hits(HitStream& hit_stream,
359     ReadTable& it,
360     JunctionSet& junctions)
361     {
362     HitsForRead curr_hit_group;
363     hit_stream.next_read_hits(curr_hit_group);
364    
365     uint32_t curr_obs_order = it.observation_order(curr_hit_group.insert_id);
366    
367 gpertea 135 while(curr_obs_order != VMAXINT32)
368 gpertea 29 {
369     for (size_t i = 0; i < curr_hit_group.hits.size(); ++i)
370     {
371     BowtieHit& bh = curr_hit_group.hits[i];
372     if (!bh.contiguous())
373     {
374     junctions_from_alignment(bh, junctions);
375     }
376     hit_stream.next_read_hits(curr_hit_group);
377     curr_obs_order = it.observation_order(curr_hit_group.insert_id);
378     }
379     }
380    
381     hit_stream.reset();
382     }
383 gpertea 154
384     void merge_with(JunctionSet& juncs, const JunctionSet& other_juncs)
385     {
386     for (JunctionSet::const_iterator junc = other_juncs.begin(); junc != other_juncs.end(); ++junc)
387     {
388     JunctionSet::iterator itr = juncs.find(junc->first);
389     if (itr != juncs.end())
390     {
391     JunctionStats& curr = itr->second;
392     curr.merge_with(junc->second);
393     }
394     else
395     {
396     juncs[junc->first] = junc->second;
397     }
398     }
399     }