ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/insertions.cpp
Revision: 239
Committed: Mon May 14 21:01:08 2012 UTC (12 years, 3 months ago) by gpertea
File size: 5798 byte(s)
Log Message:
wip prep_reads.cpp, tophat

Line File contents
1 /*
2 * insertions.cpp
3 * TopHat
4 *
5 * Created by Ryan Kelley on 11/04/2010.
6 *
7 */
8
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #else
12 #define PACKAGE_VERSION "INTERNAL"
13 #define SVN_REVISION "XXX"
14 #endif
15
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include "common.h"
34 #include "bwt_map.h"
35 #include "junctions.h"
36 #include "insertions.h"
37 #include "fragments.h"
38 #include "wiggles.h"
39 #include "tokenize.h"
40 #include "reads.h"
41
42 #include "inserts.h"
43
44 /**
45 * Add insertions from an alignment to an InsertionSet.
46 * This will look for insertion in the alignment specified by bh. If the
47 * insertion is already in insertions, it will updated the count. Otherwise,
48 * it will add the insertion to the set and initialize the count to 1.
49 * @param bh The bowtie hit to be used to specify alignment infromation.
50 * @param insertions The InsertionSet that will be updated with the insertion information from teh alignment.
51 */
52 void insertions_from_alignment(const BowtieHit& bh, InsertionSet& insertions) {
53 vector<pair<Insertion, InsertionStats> > new_insertions;
54 insertions_from_spliced_hit(bh, new_insertions);
55
56 for(size_t i = 0; i < new_insertions.size(); ++i) {
57 const pair<Insertion, InsertionStats>& insertion = new_insertions[i];
58 InsertionSet::iterator itr = insertions.find(insertion.first);
59 if (itr != insertions.end()) {
60 itr->second.supporting_hits += 1;
61 itr->second.left_extent = max(itr->second.left_extent, insertion.second.left_extent);
62 itr->second.right_extent = max(itr->second.right_extent, insertion.second.right_extent);
63 }
64 else {
65 assert(insertion.first.refid != VMAXINT32);
66 insertions[insertion.first] = insertion.second;
67 }
68 }
69 return;
70 }
71
72 /**
73 * Print insertions in BED format.
74 * Note, as per the BED-standard (http://genome.ucsc.edu/FAQ/FAQformat)
75 * -The coordinates should be 0-based
76 * -The chromEnd field should not include the actual feature
77 * -The name will be the inserted sequence
78 * -The score will be the number of supporing counts, which is capped at 1,000
79 * By (my) convention, the chromStart will be the last genome postion
80 * before hte insertio.
81 *
82 * <chrom>\t<left>\t<left>\t<inserted sequence>\t<read count>\n
83 * @param insertions_out The output file
84 * @param insertions Maps from insertions to number of supporting reads
85 * @param ref_sequences The table of reference sequences
86 */
87 void print_insertions(FILE* insertions_out, const InsertionSet& insertions, RefSequenceTable& ref_sequences) {
88 fprintf(insertions_out, "track name=insertions description=\"TopHat insertions\"\n");
89 for(InsertionSet::const_iterator i = insertions.begin(); i != insertions.end(); ++i){
90 int counts = i->second.supporting_hits;
91 if(counts > 1000){
92 counts = 1000;
93 }
94 fprintf(insertions_out, "%s\t%d\t%d\t%s\t%d\n",
95 ref_sequences.get_name(i->first.refid),
96 i->first.left,
97 i->first.left,
98 (i->first.sequence).c_str(),
99 counts);
100 }
101 }
102
103 /**
104 * Extract a list of insertions from a bowtie hit.
105 * Given a bowtie hit, extract a vector of insertions.
106 * @param bh The bowtie hit to use for alignment information.
107 * @param insertions Used to store the resultant vector of insertions.
108 */
109 void insertions_from_spliced_hit(const BowtieHit& bh, vector<pair<Insertion, InsertionStats> >& insertions){
110 const vector<CigarOp>& cigar = bh.cigar();
111 unsigned int positionInGenome = bh.left();
112 unsigned int positionInRead = 0;
113
114 bool bSawFusion = false;
115 for(size_t c = 0; c < cigar.size(); ++c){
116 switch(cigar[c].opcode){
117 case REF_SKIP:
118 positionInGenome += cigar[c].length;
119 break;
120 case rEF_SKIP:
121 positionInGenome -= cigar[c].length;
122 break;
123 case MATCH:
124 case mATCH:
125 if (cigar[c].opcode == MATCH)
126 positionInGenome += cigar[c].length;
127 else
128 positionInGenome -= cigar[c].length;
129 positionInRead += cigar[c].length;
130 break;
131 case DEL:
132 positionInGenome += cigar[c].length;
133 break;
134 case dEL:
135 positionInGenome -= cigar[c].length;
136 break;
137 case INS:
138 case iNS:
139 {
140 Insertion insertion;
141 InsertionStats stats;
142
143 /*
144 * Note that the reported position in the genome from the SAM
145 * alignment is 1-based, since the insertion object is expecting
146 * a 0-based co-ordinate, we need to subtract 1
147 */
148 if (bSawFusion)
149 insertion.refid = bh.ref_id2();
150 else
151 insertion.refid = bh.ref_id();
152
153 if (cigar[c].opcode == INS)
154 insertion.left = positionInGenome - 1;
155 else
156 insertion.left = positionInGenome + 1;
157
158 insertion.sequence = bh.seq().substr(positionInRead, cigar[c].length);
159 stats.supporting_hits = 1;
160 if (c > 0)
161 stats.left_extent = cigar[c-1].length;
162 if (c + 1 < cigar.size())
163 stats.right_extent = cigar[c+1].length;
164
165 insertions.push_back(make_pair(insertion, stats));
166 positionInRead += cigar[c].length;
167 }
168 break;
169 case FUSION_FF:
170 case FUSION_FR:
171 case FUSION_RF:
172 bSawFusion = true;
173 positionInGenome = cigar[c].length;
174 break;
175 default:
176 break;
177 }
178 }
179 return;
180 }
181
182 void merge_with(InsertionSet& insertions, const InsertionSet& other)
183 {
184 for (InsertionSet::const_iterator insertion = other.begin(); insertion != other.end(); ++insertion)
185 {
186 InsertionSet::iterator itr = insertions.find(insertion->first);
187 if (itr != insertions.end())
188 {
189 itr->second.merge_with(insertion->second);
190 }
191 else
192 {
193 insertions[insertion->first] = insertion->second;
194 }
195 }
196 }