ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/map2gtf.cpp
Revision: 236
Committed: Fri May 4 22:20:02 2012 UTC (12 years, 4 months ago) by gpertea
File size: 10545 byte(s)
Log Message:
sync, wip prep_reads

Line File contents
1 /*
2 * Author: Harold Pimentel
3 * Contact: http://cs.berkeley.edu/~pimentel
4 * Date: June 10, 2011
5 */
6 #include "map2gtf.h"
7 #include "tokenize.h"
8
9 void m2g_print_usage()
10 {
11 std::cerr << "Usage: map2gtf\tannotation.gtf "
12 << " alignments.bwtout out_file.sam" << std::endl;
13 }
14
15 Map2GTF::Map2GTF(const std::string& gtf_fname, const std::string& in_fname) :
16 gtf_fname_(gtf_fname), in_fname_(in_fname), refSeqTable_(true)
17 {
18 gtf_fhandle_ = fopen(gtf_fname_.c_str(), "r");
19 if (gtf_fhandle_ == NULL)
20 {
21 std::cerr << "FATAL: Couldn't open annotation: " << gtf_fname_
22 << std::endl;
23 exit(1);
24 }
25 std::cout << "Reading the annotation file: " << gtf_fname_ << std::endl;
26 gtfReader_.init(gtf_fhandle_, true); //only recognizable transcripts will be loaded
27 gtfReader_.readAll();
28
29 in_fhandle_=samopen(in_fname_.c_str(), "rb", 0);
30 if (in_fhandle_ == NULL)
31 {
32 std::cerr << "FATAL: Couldn't open input bam file: " << in_fname_
33 << std::endl;
34 exit(1);
35 }
36 in_sam_header_ = in_fhandle_->header;
37
38 std::cout << "Done with initializtion. " << std::endl;
39 }
40
41 Map2GTF::~Map2GTF()
42 {
43 std::cout << "map2gtf has completed. Cleaning up." << std::endl;
44 if (gtf_fhandle_ != NULL && fclose(gtf_fhandle_))
45 {
46 std::cerr << "Warning: Error closing annotation: " << gtf_fname_
47 << std::endl;
48 }
49
50 if (in_fhandle_ != NULL)
51 {
52 samclose(in_fhandle_);
53 }
54
55 std::cout << "Done. Thanks!" << std::endl;
56 }
57
58 //
59 bool Map2GTF::next_read_hits(vector<bam1_t*>& hits, size_t& num_hits)
60 {
61 long read_id = 0;
62 if (hits.size() > num_hits)
63 {
64 bam1_t* temp = hits[num_hits];
65 hits[num_hits] = hits.front();
66 hits.front() = temp;
67
68 num_hits = 1;
69 char* name = bam1_qname(hits.front());
70 read_id = atol(name);
71 }
72 else
73 num_hits = 0;
74
75 while (true)
76 {
77 bam1_t* hit = NULL;
78 if (num_hits >= hits.size())
79 hits.push_back(bam_init1());
80
81 hit = hits[num_hits];
82
83 if (samread(in_fhandle_, hit) <= 0)
84 {
85 for (size_t i = num_hits; i < hits.size(); ++i)
86 bam_destroy1(hits[i]);
87
88 hits.erase(hits.begin() + num_hits, hits.end());
89 break;
90 }
91
92 char* name = bam1_qname(hit);
93 long temp_read_id = atol(name);
94 if (num_hits == 0)
95 {
96 read_id = temp_read_id;
97 }
98 else if (read_id != temp_read_id)
99 {
100 break;
101 }
102
103 ++num_hits;
104 }
105
106 return num_hits > 0;
107 }
108
109 void Map2GTF::convert_coords(const std::string& out_fname, const std::string& sam_header)
110 {
111 samfile_t* out_sam_header_file = samopen(sam_header.c_str(), "r", 0);
112 if (out_sam_header_file == NULL)
113 std::cerr << "Error opening sam header: " << sam_header << std::endl;
114
115 out_sam_header_ = out_sam_header_file->header;
116
117 samfile_t* bam_writer = samopen(out_fname.c_str(), "wb", out_sam_header_);
118 if (bam_writer == NULL)
119 std::cerr << "Error opening bam file for writing: " << out_fname << std::endl;
120
121 ref_to_id_.clear();
122 for (int i = 0; i < out_sam_header_->n_targets; ++i)
123 {
124 ref_to_id_[out_sam_header_->target_name[i]] = i;
125 }
126
127 std::vector<TranscriptomeHit> read_list;
128 GffObj* p_trans = NULL;
129
130 HitsForRead hit_group;
131 std::vector<TranscriptomeHit>::iterator bh_it;
132 std::vector<TranscriptomeHit>::iterator bh_unique_it;
133 BowtieHit bwt_hit;
134
135 vector<bam1_t*> hits;
136 size_t num_hits = 0;
137 // a hit group is a set of reads with the same name
138 while (next_read_hits(hits, num_hits))
139 {
140 for (size_t i = 0; i < num_hits; ++i)
141 {
142 bam1_t* hit = hits[i];
143 const char* target_name = in_sam_header_->target_name[hit->core.tid];
144 size_t trans_idx = atoi(target_name);
145
146 p_trans = gtfReader_.gflst.Get(trans_idx);
147 TranscriptomeHit converted_out(hit, p_trans);
148 bool success = trans_to_genomic_coords(converted_out);
149
150 if (success)
151 read_list.push_back(converted_out);
152 }
153
154 // XXX: Fine for now... should come up with a more efficient way though
155 // FIXME: Take frag length into consideration when filtering
156 std::sort(read_list.begin(), read_list.end());
157 bh_unique_it = std::unique(read_list.begin(), read_list.end());
158 for (bh_it = read_list.begin(); bh_it != bh_unique_it; ++bh_it)
159 {
160 samwrite(bam_writer, bh_it->hit);
161 }
162 read_list.clear();
163 }
164
165 for (size_t i = 0; i < hits.size(); ++i)
166 {
167 bam_destroy1(hits[i]);
168 }
169 hits.clear();
170
171 samclose(bam_writer);
172 }
173
174 bool Map2GTF::trans_to_genomic_coords(TranscriptomeHit& hit)
175 //out.trans must already have the corresponding GffObj*
176 {
177 // read start in genomic coords
178 size_t read_start = 0;
179
180 GList<GffExon>& exon_list = hit.trans->exons;
181 GffExon* cur_exon;
182 GffExon* next_exon;
183 int cur_pos;
184 int match_length;
185 int miss_length;
186 int cur_intron_len = 0;
187 int i = 0;
188
189 static const int MAX_CIGARS = 1024;
190 int cigars[MAX_CIGARS];
191 int num_cigars = 0;
192
193 // TODO: Check this return value
194 bool ret_val = get_read_start(&exon_list, hit.hit->core.pos, read_start, i);
195 cur_pos = read_start;
196 for (int c = 0; c < hit.hit->core.n_cigar; ++c)
197 {
198 int opcode = bam1_cigar(hit.hit)[c] & BAM_CIGAR_MASK;
199 int length = bam1_cigar(hit.hit)[c] >> BAM_CIGAR_SHIFT;
200
201 if (opcode == BAM_CINS)
202 {
203 cigars[num_cigars] = opcode | (length << BAM_CIGAR_SHIFT);
204 ++num_cigars;
205 }
206
207 if (opcode != BAM_CMATCH && opcode != BAM_CDEL)
208 continue;
209
210 int remaining_length = length;
211 for (; i < exon_list.Count(); ++i)
212 {
213 cur_exon = exon_list.Get(i);
214 if (cur_pos >= (int)cur_exon->start &&
215 cur_pos + remaining_length - 1 <= (int)cur_exon->end) // read ends in this exon
216 {
217 cigars[num_cigars] = opcode | (remaining_length << BAM_CIGAR_SHIFT);
218 ++num_cigars;
219 cur_pos += remaining_length;
220 break;
221 }
222
223 // shouldn't need the check... can switch to a regular "else"
224 else if (cur_pos >= (int)cur_exon->start &&
225 cur_pos + remaining_length - 1 > (int)cur_exon->end)// read is spliced and overlaps this exon
226 {
227 // XXX: This should _never_ go out of range.
228 // get the max length that fits in this exon, go to next exon
229 // cur_pos should be the next exon start
230 // set assertion to check this
231
232 // TODO: check this
233 match_length = cur_exon->end - cur_pos + 1;
234 if (match_length > 0)
235 {
236 cigars[num_cigars] = opcode | (match_length << BAM_CIGAR_SHIFT);
237 ++num_cigars;
238 }
239
240 // XXX: DEBUG
241 if (i + 1 >= exon_list.Count())
242 {
243 std::cerr << "trying to access: " << i + 2 << " when size is: "
244 << exon_list.Count() << std::endl;
245 print_trans(hit.trans, hit.hit, remaining_length, match_length, cur_pos,
246 read_start);
247 return false;
248 }
249
250 else
251 next_exon = exon_list.Get(i + 1);
252
253 // and this
254 miss_length = next_exon->start - cur_exon->end - 1;
255 cur_intron_len += miss_length;
256
257 cigars[num_cigars] = BAM_CREF_SKIP | (miss_length << BAM_CIGAR_SHIFT);
258 ++num_cigars;
259
260 cur_pos += match_length + miss_length;
261 remaining_length -= match_length;
262 assert(cur_pos == next_exon->start);
263 }
264 }
265 }
266
267 hit.hit->core.tid = ref_to_id_[hit.trans->getRefName()];
268 hit.hit->core.pos = read_start - 1;
269 hit.hit->core.flag &= ~BAM_FSECONDARY;
270
271 int old_n_cigar = hit.hit->core.n_cigar;
272 if (num_cigars != old_n_cigar)
273 {
274 int data_len = hit.hit->data_len + 4 * (num_cigars - old_n_cigar);
275 int m_data = max(data_len, hit.hit->m_data);
276 kroundup32(m_data);
277
278 uint8_t* data = (uint8_t*)calloc(m_data, 1);
279
280 int copy1_len = (uint8_t*)bam1_cigar(hit.hit) - hit.hit->data;
281 memcpy(data, hit.hit->data, copy1_len);
282
283 int copy2_len = num_cigars * 4;
284 memcpy(data + copy1_len, cigars, copy2_len);
285
286 int copy3_len = hit.hit->data_len - copy1_len - (old_n_cigar * 4);
287 memcpy(data + copy1_len + copy2_len, bam1_seq(hit.hit), copy3_len);
288
289 hit.hit->core.n_cigar = num_cigars;
290
291 free(hit.hit->data);
292 hit.hit->data = data;
293 hit.hit->data_len = data_len;
294 hit.hit->m_data = m_data;
295 }
296
297 char strand = hit.trans->strand;
298 uint8_t* ptr = bam_aux_get(hit.hit, "XS");
299 if (ptr)
300 bam_aux_del(hit.hit, ptr);
301
302 if (strand == '+' || strand == '-')
303 bam_aux_append(hit.hit, "XS", 'A', 1, (uint8_t*)&strand);
304
305 return true;
306 }
307
308 void print_trans(GffObj* trans, const bam1_t* in, size_t rem_len,
309 size_t match_len, size_t cur_pos, size_t start_pos)
310 {
311 GffExon* p_exon;
312 std::cerr << "\tCur_pos: " << cur_pos << " remaining: " << rem_len
313 << " match_len: " << match_len << std::endl;
314 std::cerr << "\tTranscript:\t" << trans->start << "\t" << trans->end
315 << std::endl;
316 for (int i = 0; i < trans->exons.Count(); ++i)
317 {
318 p_exon = trans->exons.Get(i);
319 std::cerr << "\t\t" << p_exon->start << "\t" << p_exon->end
320 << std::endl;
321 }
322 std::cerr << std::endl;
323
324 std::cerr << "Read_id: " << bam1_qname(in) << std::endl;
325 std::cerr << "\tgff_start: " << in->core.pos << "\tgenome_start: "
326 << start_pos << std::endl;
327 }
328
329 // Returns false if not in this exon list
330 bool get_read_start(GList<GffExon>* exon_list, size_t gtf_start,
331 size_t& genome_start, int& exon_idx)
332 {
333 GffExon* cur_exon;
334 size_t cur_intron_dist = 0;
335 size_t trans_start = exon_list->First()->start;
336 int trans_offset = 0;
337 for (int i = 0; i < exon_list->Count(); ++i)
338 {
339 cur_exon = exon_list->Get(i);
340 trans_offset = trans_start + cur_intron_dist;
341
342 if (gtf_start >= cur_exon->start - trans_offset && gtf_start
343 <= cur_exon->end - trans_offset)
344 {
345 genome_start = gtf_start + trans_start + cur_intron_dist;
346 exon_idx = i;
347 return true;
348 }
349 else
350 {
351 if (i + 1 < exon_list->Count())
352 cur_intron_dist += exon_list->Get(i + 1)->start - cur_exon->end
353 - 1;
354 else
355 return false;
356 }
357 }
358
359 return false;
360 }
361
362 int main(int argc, char *argv[])
363 {
364 int parse_ret = parse_options(argc, argv, m2g_print_usage);
365 if (parse_ret)
366 return parse_ret;
367
368 if (optind >= argc)
369 {
370 m2g_print_usage();
371 return 1;
372 }
373
374 std::string gtf_file(argv[optind++]);
375 std::string in_fname(argv[optind++]);
376 std::string out_fname(argv[optind++]);
377
378 if (gtf_file == "" || in_fname == "" || out_fname == "")
379 {
380 m2g_print_usage();
381 exit(1);
382 }
383
384 Map2GTF gtfMapper(gtf_file, in_fname);
385 gtfMapper.convert_coords(out_fname, sam_header);
386
387 return 0;
388 }