ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/map2gtf.cpp
Revision: 259
Committed: Fri Jun 29 19:01:59 2012 UTC (12 years, 2 months ago) by gpertea
File size: 10448 byte(s)
Log Message:
updated to v2.0.4

Line File contents
1 /*
2 * Author: Harold Pimentel
3 * Contact: http://cs.berkeley.edu/~pimentel
4 * Date: June 10, 2011
5 */
6 #include "map2gtf.h"
7 #include "tokenize.h"
8
9 void m2g_print_usage()
10 {
11 std::cerr << "Usage: map2gtf\tannotation.gtf "
12 << " alignments.bam out_file.bam" << std::endl;
13 }
14
15 Map2GTF::Map2GTF(const std::string& gtf_fname, const std::string& in_fname) :
16 gtf_fname_(gtf_fname), in_fname_(in_fname), refSeqTable_(true)
17 {
18 gtf_fhandle_ = fopen(gtf_fname_.c_str(), "r");
19 if (gtf_fhandle_ == NULL)
20 {
21 std::cerr << "FATAL: Couldn't open annotation: " << gtf_fname_
22 << std::endl;
23 exit(1);
24 }
25 std::cout << "Reading the annotation file: " << gtf_fname_ << std::endl;
26 gtfReader_.init(gtf_fhandle_, true); //only recognizable transcripts will be loaded
27 gtfReader_.readAll();
28 if (in_fname_=="-") {
29 in_fhandle_=samopen(in_fname_.c_str(), "rbu", 0);
30 }
31 else {
32 in_fhandle_=samopen(in_fname_.c_str(), "rb", 0);
33 }
34 if (in_fhandle_ == NULL)
35 {
36 std::cerr << "FATAL: Couldn't open input bam file: " << in_fname_
37 << std::endl;
38 exit(1);
39 }
40 in_sam_header_ = in_fhandle_->header;
41
42 std::cout << "Done with initializtion. " << std::endl;
43 }
44
45 Map2GTF::~Map2GTF()
46 {
47 std::cout << "map2gtf has completed. Cleaning up." << std::endl;
48 if (gtf_fhandle_ != NULL && fclose(gtf_fhandle_))
49 {
50 std::cerr << "Warning: Error closing annotation: " << gtf_fname_
51 << std::endl;
52 }
53
54 if (in_fhandle_ != NULL)
55 {
56 samclose(in_fhandle_);
57 }
58
59 std::cout << "Done. Thanks!" << std::endl;
60 }
61
62 //
63 bool Map2GTF::next_read_hits(vector<bam1_t*>& hits, size_t& num_hits, long& read_id)
64 {
65 if (hits.size() > num_hits)
66 {
67 bam1_t* temp = hits[num_hits];
68 hits[num_hits] = hits.front();
69 hits.front() = temp;
70
71 num_hits = 1;
72 char* name = bam1_qname(hits.front());
73 read_id = atol(name);
74 }
75 else
76 num_hits = 0;
77
78 while (true)
79 {
80 bam1_t* hit = NULL;
81 if (num_hits >= hits.size())
82 hits.push_back(bam_init1());
83
84 hit = hits[num_hits];
85
86 if (samread(in_fhandle_, hit) <= 0)
87 {
88 for (size_t i = num_hits; i < hits.size(); ++i)
89 bam_destroy1(hits[i]);
90
91 hits.erase(hits.begin() + num_hits, hits.end());
92 break;
93 }
94
95 char* name = bam1_qname(hit);
96 long temp_read_id = atol(name);
97 if (num_hits == 0)
98 {
99 read_id = temp_read_id;
100 }
101 else if (read_id != temp_read_id)
102 {
103 break;
104 }
105
106 ++num_hits;
107 }
108
109 return num_hits > 0;
110 }
111
112 void Map2GTF::convert_coords(const std::string& out_fname, const std::string& sam_header)
113 {
114 samfile_t* out_sam_header_file = samopen(sam_header.c_str(), "r", 0);
115 if (out_sam_header_file == NULL)
116 std::cerr << "Error opening sam header: " << sam_header << std::endl;
117
118 out_sam_header_ = out_sam_header_file->header;
119
120 string index_out_fname = out_fname + ".index";
121 GBamWriter bam_writer(out_fname.c_str(), out_sam_header_, index_out_fname);
122 ref_to_id_.clear();
123 for (int i = 0; i < out_sam_header_->n_targets; ++i)
124 {
125 ref_to_id_[out_sam_header_->target_name[i]] = i;
126 }
127
128 std::vector<TranscriptomeHit> read_list;
129 GffObj* p_trans = NULL;
130
131 HitsForRead hit_group;
132 std::vector<TranscriptomeHit>::iterator bh_it;
133 std::vector<TranscriptomeHit>::iterator bh_unique_it;
134 BowtieHit bwt_hit;
135
136 vector<bam1_t*> hits;
137 size_t num_hits = 0;
138 long read_id = 0;
139 // a hit group is a set of reads with the same name
140 while (next_read_hits(hits, num_hits, read_id))
141 {
142 for (size_t i = 0; i < num_hits; ++i)
143 {
144 bam1_t* hit = hits[i];
145 const char* target_name = in_sam_header_->target_name[hit->core.tid];
146 size_t trans_idx = atoi(target_name);
147
148 p_trans = gtfReader_.gflst.Get(trans_idx);
149 TranscriptomeHit converted_out(hit, p_trans);
150 bool success = trans_to_genomic_coords(converted_out);
151
152 if (success)
153 read_list.push_back(converted_out);
154 }
155
156 // XXX: Fine for now... should come up with a more efficient way though
157 // FIXME: Take frag length into consideration when filtering
158 std::sort(read_list.begin(), read_list.end());
159 bh_unique_it = std::unique(read_list.begin(), read_list.end());
160 for (bh_it = read_list.begin(); bh_it != bh_unique_it; ++bh_it)
161 {
162 bam_writer.write(bh_it->hit, read_id);
163 }
164 read_list.clear();
165 }
166
167 for (size_t i = 0; i < hits.size(); ++i)
168 {
169 bam_destroy1(hits[i]);
170 }
171 hits.clear();
172 }
173
174 bool Map2GTF::trans_to_genomic_coords(TranscriptomeHit& hit)
175 //out.trans must already have the corresponding GffObj*
176 {
177 // read start in genomic coords
178 size_t read_start = 0;
179
180 GList<GffExon>& exon_list = hit.trans->exons;
181 GffExon* cur_exon;
182 GffExon* next_exon;
183 int cur_pos;
184 int match_length;
185 int miss_length;
186 int cur_intron_len = 0;
187 int i = 0;
188
189 static const int MAX_CIGARS = 1024;
190 int cigars[MAX_CIGARS];
191 int num_cigars = 0;
192
193 // TODO: Check this return value
194 bool ret_val = get_read_start(&exon_list, hit.hit->core.pos, read_start, i);
195 if (!ret_val)
196 {
197 }
198
199 cur_pos = read_start;
200 for (int c = 0; c < hit.hit->core.n_cigar; ++c)
201 {
202 int opcode = bam1_cigar(hit.hit)[c] & BAM_CIGAR_MASK;
203 int length = bam1_cigar(hit.hit)[c] >> BAM_CIGAR_SHIFT;
204
205 if (opcode == BAM_CINS)
206 {
207 cigars[num_cigars] = opcode | (length << BAM_CIGAR_SHIFT);
208 ++num_cigars;
209 }
210
211 if (opcode != BAM_CMATCH && opcode != BAM_CDEL)
212 continue;
213
214 int remaining_length = length;
215 for (; i < exon_list.Count(); ++i)
216 {
217 cur_exon = exon_list.Get(i);
218 if (cur_pos >= (int)cur_exon->start &&
219 cur_pos + remaining_length - 1 <= (int)cur_exon->end) // read ends in this exon
220 {
221 cigars[num_cigars] = opcode | (remaining_length << BAM_CIGAR_SHIFT);
222 ++num_cigars;
223 cur_pos += remaining_length;
224 break;
225 }
226
227 // shouldn't need the check... can switch to a regular "else"
228 else if (cur_pos >= (int)cur_exon->start &&
229 cur_pos + remaining_length - 1 > (int)cur_exon->end)// read is spliced and overlaps this exon
230 {
231 // XXX: This should _never_ go out of range.
232 // get the max length that fits in this exon, go to next exon
233 // cur_pos should be the next exon start
234 // set assertion to check this
235
236 // TODO: check this
237 match_length = cur_exon->end - cur_pos + 1;
238 if (match_length > 0)
239 {
240 cigars[num_cigars] = opcode | (match_length << BAM_CIGAR_SHIFT);
241 ++num_cigars;
242 }
243
244 // XXX: DEBUG
245 if (i + 1 >= exon_list.Count())
246 {
247 std::cerr << "trying to access: " << i + 2 << " when size is: "
248 << exon_list.Count() << std::endl;
249 print_trans(hit.trans, hit.hit, remaining_length, match_length, cur_pos,
250 read_start);
251 return false;
252 }
253
254 else
255 next_exon = exon_list.Get(i + 1);
256
257 // and this
258 miss_length = next_exon->start - cur_exon->end - 1;
259 cur_intron_len += miss_length;
260
261 cigars[num_cigars] = BAM_CREF_SKIP | (miss_length << BAM_CIGAR_SHIFT);
262 ++num_cigars;
263
264 cur_pos += match_length + miss_length;
265 remaining_length -= match_length;
266 assert(cur_pos == (int)next_exon->start);
267 }
268 }
269 }
270
271 hit.hit->core.tid = ref_to_id_[hit.trans->getRefName()];
272 hit.hit->core.pos = read_start - 1;
273 hit.hit->core.flag &= ~BAM_FSECONDARY;
274
275 int old_n_cigar = hit.hit->core.n_cigar;
276 if (num_cigars != old_n_cigar)
277 {
278 int data_len = hit.hit->data_len + 4 * (num_cigars - old_n_cigar);
279 int m_data = max(data_len, hit.hit->m_data);
280 kroundup32(m_data);
281
282 uint8_t* data = (uint8_t*)calloc(m_data, 1);
283
284 int copy1_len = (uint8_t*)bam1_cigar(hit.hit) - hit.hit->data;
285 memcpy(data, hit.hit->data, copy1_len);
286
287 int copy2_len = num_cigars * 4;
288 memcpy(data + copy1_len, cigars, copy2_len);
289
290 int copy3_len = hit.hit->data_len - copy1_len - (old_n_cigar * 4);
291 memcpy(data + copy1_len + copy2_len, bam1_seq(hit.hit), copy3_len);
292
293 hit.hit->core.n_cigar = num_cigars;
294
295 free(hit.hit->data);
296 hit.hit->data = data;
297 hit.hit->data_len = data_len;
298 hit.hit->m_data = m_data;
299 }
300
301 char strand = hit.trans->strand;
302 uint8_t* ptr = bam_aux_get(hit.hit, "XS");
303 if (ptr)
304 bam_aux_del(hit.hit, ptr);
305
306 if (strand == '+' || strand == '-')
307 bam_aux_append(hit.hit, "XS", 'A', 1, (uint8_t*)&strand);
308
309 return true;
310 }
311
312 void print_trans(GffObj* trans, const bam1_t* in, size_t rem_len,
313 size_t match_len, size_t cur_pos, size_t start_pos)
314 {
315 GffExon* p_exon;
316 std::cerr << "\tCur_pos: " << cur_pos << " remaining: " << rem_len
317 << " match_len: " << match_len << std::endl;
318 std::cerr << "\tTranscript:\t" << trans->start << "\t" << trans->end
319 << std::endl;
320 for (int i = 0; i < trans->exons.Count(); ++i)
321 {
322 p_exon = trans->exons.Get(i);
323 std::cerr << "\t\t" << p_exon->start << "\t" << p_exon->end
324 << std::endl;
325 }
326 std::cerr << std::endl;
327
328 std::cerr << "Read_id: " << bam1_qname(in) << std::endl;
329 std::cerr << "\tgff_start: " << in->core.pos << "\tgenome_start: "
330 << start_pos << std::endl;
331 }
332
333 // Returns false if not in this exon list
334 bool get_read_start(GList<GffExon>* exon_list, size_t gtf_start,
335 size_t& genome_start, int& exon_idx)
336 {
337 GffExon* cur_exon;
338 size_t cur_intron_dist = 0;
339 size_t trans_start = exon_list->First()->start;
340 int trans_offset = 0;
341 for (int i = 0; i < exon_list->Count(); ++i)
342 {
343 cur_exon = exon_list->Get(i);
344 trans_offset = trans_start + cur_intron_dist;
345
346 if (gtf_start >= cur_exon->start - trans_offset && gtf_start
347 <= cur_exon->end - trans_offset)
348 {
349 genome_start = gtf_start + trans_start + cur_intron_dist;
350 exon_idx = i;
351 return true;
352 }
353 else
354 {
355 if (i + 1 < exon_list->Count())
356 cur_intron_dist += exon_list->Get(i + 1)->start - cur_exon->end
357 - 1;
358 else
359 return false;
360 }
361 }
362
363 return false;
364 }
365
366 int main(int argc, char *argv[])
367 {
368 int parse_ret = parse_options(argc, argv, m2g_print_usage);
369 if (parse_ret)
370 return parse_ret;
371
372 if (optind >= argc)
373 {
374 m2g_print_usage();
375 return 1;
376 }
377
378 std::string gtf_file(argv[optind++]);
379 std::string in_fname(argv[optind++]);
380 std::string out_fname(argv[optind++]);
381
382 if (gtf_file == "" || in_fname == "" || out_fname == "")
383 {
384 m2g_print_usage();
385 exit(1);
386 }
387
388 Map2GTF gtfMapper(gtf_file, in_fname);
389 gtfMapper.convert_coords(out_fname, sam_header);
390
391 return 0;
392 }