ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/reads.cpp
Revision: 217
Committed: Wed Mar 14 19:58:18 2012 UTC (12 years, 5 months ago) by gpertea
File size: 17764 byte(s)
Log Message:
Line File contents
1 /*
2 * reads.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 9/2/48.
6 * Copyright 2448 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <iostream>
15 #include <cassert>
16 #include <string>
17 #include <algorithm>
18 #include <cstring>
19 #include <cstdlib>
20
21 #include <seqan/find.h>
22 #include <seqan/file.h>
23 #include <seqan/modifier.h>
24
25 #include "reads.h"
26 #include "bwt_map.h"
27 #include "tokenize.h"
28
29 using namespace std;
30
31 char* FLineReader::nextLine() {
32 if(!file) return NULL;
33 if (pushed) { pushed=false; return buf; }
34 //reads a char at a time until \n and/or \r are encountered
35 len=0;
36 int c=0;
37 while ((c=getc(file))!=EOF) {
38 if (len>=allocated-1) {
39 allocated+=512;
40 buf=(char*)realloc(buf,allocated);
41 }
42 if (c=='\n' || c=='\r') {
43 buf[len]='\0';
44 if (c=='\r') { //DOS file: double-char line terminator, skip the second one
45 if ((c=getc(file))!='\n')
46 ungetc(c,file);
47 }
48 lcount++;
49 return buf;
50 }
51 buf[len]=(char)c;
52 len++;
53 }
54 if (c==EOF) {
55 isEOF=true;
56 if (len==0) return NULL;
57 }
58 buf[len]='\0';
59 lcount++;
60 return buf;
61 }
62
63 void skip_lines(FLineReader& fr)
64 {
65 if (fr.fhandle() == NULL) return;
66 char* buf = NULL;
67 while ((buf = fr.nextLine()) != NULL) {
68 if (buf[0] == '\0') continue;
69 if (buf[0] == '>' || buf[0] == '@')
70 {
71 fr.pushBack();
72 break;
73 }
74 }
75 }
76
77 bool next_fasta_record(FLineReader& fr,
78 string& defline,
79 string& seq,
80 ReadFormat reads_format)
81
82 {
83 seq.clear();
84 defline.clear();
85 char* buf=NULL;
86 while ((buf=fr.nextLine())!=NULL) {
87 if (buf[0]==0) continue; //skip empty lines
88 if ((reads_format == FASTA && buf[0] == '>') || (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
89 if (seq.length()>0) { //current record ending
90 fr.pushBack();
91 return true;
92 }
93 defline=buf+1;
94 string::size_type space_pos = defline.find_first_of(" \t");
95 if (space_pos != string::npos) {
96 defline.resize(space_pos);
97 }
98 continue;
99 } //defline
100 // sequence line
101 seq.append(buf);
102 } //line reading loop
103
104 replace(seq.begin(), seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
105 return !(seq.empty());
106 }
107
108 bool next_fastq_record(FLineReader& fr,
109 const string& seq,
110 string& alt_name,
111 string& qual,
112 ReadFormat reads_format)
113 {
114 alt_name.clear();
115 qual.clear();
116 char* fline=fr.nextLine();
117 if (fline==NULL) return false;
118 while (fline[0]==0) { //skip empty lines
119 fline=fr.nextLine();
120 if (fline==NULL) return false;
121 }
122 //must be on '+' line here
123 if (fline==NULL || (reads_format == FASTQ && fline[0] != '+') ||
124 (reads_format == FASTA && quals && fline[0] != '>')) {
125 err_exit("Error: '+' not found for fastq record %s\n",fline);
126 return false;
127 }
128 alt_name=fline+1;
129 string::size_type space_pos = alt_name.find_first_of(" \t");
130 if (space_pos != string::npos) alt_name.resize(space_pos);
131 //read qv line(s) now:
132 while ((fline=fr.nextLine())!=NULL) {
133 if (integer_quals)
134 {
135 vector<string> integer_qual_values;
136 tokenize(string(fline), " ", integer_qual_values);
137
138 string temp_qual;
139 for (size_t i = 0; i < integer_qual_values.size(); ++i)
140 {
141 int qual_value = atoi(integer_qual_values[i].c_str());
142 if (qual_value < 0) qual_value = 0;
143 temp_qual.push_back((char)(qual_value + 33));
144 }
145
146 qual.append(temp_qual);
147 }
148 else
149 qual.append(fline);
150 if (qual.length()>=seq.length()-1) break;
151 }
152 // final check
153 if (color) {
154 if (seq.length()==qual.length()) {
155 //discard first qv
156 qual=qual.substr(1);
157 }
158 if (seq.length()!=qual.length()+1) {
159 err_exit("Error: length of quality string does not match seq length (%d) for color read %s!\n",
160 seq.length(), alt_name.c_str());
161 }
162 }
163 else {
164 if (seq.length()!=qual.length()) {
165 err_exit("Error: qual string length (%d) differs from seq length (%d) for read %s!\n",
166 qual.length(), seq.length(), alt_name.c_str());
167 //return false;
168 }
169 }
170 //
171 return !(qual.empty());
172 }
173
174 bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format,
175 FLineReader* frq) {
176 /*
177 if (fr.pushed_read)
178 {
179 read = fr.last_read;
180 fr.pushed_read = false;
181 return true;
182 }
183 */
184 read.clear();
185 char* buf=NULL;
186 while ((buf=fr.nextLine())!=NULL) {
187 if (buf[0]==0) continue; //skip empty lines
188 if ((reads_format == FASTA && buf[0] == '>') ||
189 (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
190 if (read.seq.length()>0) { //current record ending
191 fr.pushBack();
192 break;
193 }
194 read.name=buf+1;
195 string::size_type space_pos = read.name.find_first_of(" \t");
196 if (space_pos != string::npos) {
197 read.name.resize(space_pos);
198 }
199 continue;
200 } //defline
201 // sequence line
202 read.seq.append(buf);
203 } //line reading loop
204
205 replace(read.seq.begin(), read.seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
206 if (reads_format != FASTQ && frq==NULL)
207 return (!read.seq.empty());
208 if (frq==NULL) frq=&fr; //FASTQ
209 //FASTQ or quals in a separate file -- now read quality values
210 buf=frq->nextLine();
211 if (buf==NULL) return false;
212 while (buf[0]==0) { //skip empty lines
213 buf=frq->nextLine();
214 if (buf==NULL) return false;
215 }
216 //must be on '+' line here
217 if (buf==NULL || (reads_format == FASTQ && buf[0] != '+') ||
218 (reads_format == FASTA && buf[0] != '>')) {
219 err_exit("Error: beginning of quality values record not found! (%s)\n",buf);
220 return false;
221 }
222 read.alt_name=buf+1;
223 string::size_type space_pos = read.alt_name.find_first_of(" \t");
224 if (space_pos != string::npos) read.alt_name.resize(space_pos);
225 //read qv line(s) now:
226 while ((buf=frq->nextLine())!=NULL) {
227 if (integer_quals)
228 {
229 vector<string> integer_qual_values;
230 tokenize(string(buf), " ", integer_qual_values);
231 string temp_qual;
232 for (size_t i = 0; i < integer_qual_values.size(); ++i)
233 {
234 int qual_value = atoi(integer_qual_values[i].c_str());
235 if (qual_value < 0) qual_value = 0;
236 temp_qual.push_back((char)(qual_value + 33));
237 }
238 read.qual.append(temp_qual);
239 }
240 else {
241 read.qual.append(buf);
242 }
243 if (read.qual.length()>=read.seq.length()-1)
244 break;
245 } //while qv lines
246
247 // final check
248 if (color) {
249 if (read.seq.length()==read.qual.length()) {
250 //discard first qv
251 read.qual=read.qual.substr(1);
252 }
253 if (read.seq.length()!=read.qual.length()+1) {
254 err_exit("Error: length of quality string does not match sequence length (%d) for color read %s!\n",
255 read.seq.length(), read.alt_name.c_str());
256 }
257 }
258 else {
259 if (read.seq.length()!=read.qual.length()) {
260 err_exit("Error: qual length (%d) differs from seq length (%d) for fastq record %s!\n",
261 read.qual.length(), read.seq.length(), read.alt_name.c_str());
262 return false;
263 }
264 }
265
266 //fr.last_read = read;
267 return !(read.seq.empty());
268 }
269
270 // This could be faster.
271 void reverse_complement(string& seq)
272 {
273 //fprintf(stderr,"fwd: %s\n", seq.c_str());
274 for (string::size_type i = 0; i < seq.length(); ++i)
275 {
276 switch(seq[i])
277 {
278 case 'A' : seq[i] = 'T'; break;
279 case 'T' : seq[i] = 'A'; break;
280 case 'C' : seq[i] = 'G'; break;
281 case 'G' : seq[i] = 'C'; break;
282 default: seq[i] = 'N'; break;
283 }
284 }
285 reverse(seq.begin(), seq.end());
286 //fprintf(stderr, "rev: %s\n", seq.c_str());
287 }
288
289 string convert_color_to_bp(const string& color)
290 {
291 if (color.length() <= 0)
292 return "";
293
294 char base = color[0];
295 string bp;
296 for (string::size_type i = 1; i < color.length(); ++i)
297 {
298 char next = color[i];
299 switch(base)
300 {
301 // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N',
302 case 'A':
303 {
304 switch(next)
305 {
306 case '0': next = 'A'; break;
307 case '1': next = 'C'; break;
308 case '2': next = 'G'; break;
309 case '3': next = 'T'; break;
310 default: next = 'N'; break;
311 }
312 }
313 break;
314 case 'C':
315 {
316 // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N',
317 switch(next)
318 {
319 case '0': next = 'C'; break;
320 case '1': next = 'A'; break;
321 case '2': next = 'T'; break;
322 case '3': next = 'G'; break;
323 default: next = 'N'; break;
324 }
325 }
326 break;
327 case 'G':
328 {
329 // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N',
330 switch(next)
331 {
332 case '0': next = 'G'; break;
333 case '1': next = 'T'; break;
334 case '2': next = 'A'; break;
335 case '3': next = 'C'; break;
336 default: next = 'N'; break;
337 }
338 }
339 break;
340 case 'T':
341 {
342 // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N',
343 switch(next)
344 {
345 case '0': next = 'T'; break;
346 case '1': next = 'G'; break;
347 case '2': next = 'C'; break;
348 case '3': next = 'A'; break;
349 default: next = 'N'; break;
350 }
351 }
352 break;
353 default: next = 'N'; break;
354 }
355
356 bp.push_back(next);
357 base = next;
358 }
359
360 return bp;
361 }
362
363 // daehwan - reduce code redundancy!
364 seqan::String<char> convert_color_to_bp(char base, const seqan::String<char>& color)
365 {
366 if (seqan::length(color) <= 0)
367 return "";
368
369 string bp;
370 for (string::size_type i = 0; i < seqan::length(color); ++i)
371 {
372 char next = color[i];
373 switch(base)
374 {
375 // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N',
376 case 'A':
377 {
378 switch(next)
379 {
380 case '0': next = 'A'; break;
381 case '1': next = 'C'; break;
382 case '2': next = 'G'; break;
383 case '3': next = 'T'; break;
384 default: next = 'N'; break;
385 }
386 }
387 break;
388 case 'C':
389 {
390 // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N',
391 switch(next)
392 {
393 case '0': next = 'C'; break;
394 case '1': next = 'A'; break;
395 case '2': next = 'T'; break;
396 case '3': next = 'G'; break;
397 default: next = 'N'; break;
398 }
399 }
400 break;
401 case 'G':
402 {
403 // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N',
404 switch(next)
405 {
406 case '0': next = 'G'; break;
407 case '1': next = 'T'; break;
408 case '2': next = 'A'; break;
409 case '3': next = 'C'; break;
410 default: next = 'N'; break;
411 }
412 }
413 break;
414 case 'T':
415 {
416 // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N',
417 switch(next)
418 {
419 case '0': next = 'T'; break;
420 case '1': next = 'G'; break;
421 case '2': next = 'C'; break;
422 case '3': next = 'A'; break;
423 default: next = 'N'; break;
424 }
425 }
426 break;
427 default: next = 'N'; break;
428 }
429
430 bp.push_back(next);
431 base = next;
432 }
433
434 return bp;
435 }
436
437
438 #define check_color(b1, b2, c1, c2) ((b1 == c1 && b2 == c2) || (b1 == c2 && b2 == c1))
439
440 #define two_bps_to_color(b1, b2, c) \
441 if (((b1) == 'A' || (b1) == 'G' || (b1) == 'C' || (b1) == 'T') && (b1) == (b2)) \
442 c = '0'; \
443 else if (check_color((b1), (b2), 'A', 'C') || check_color((b1), (b2), 'G', 'T')) \
444 c = '1'; \
445 else if (check_color((b1), (b2), 'A', 'G') || check_color((b1), (b2), 'C', 'T')) \
446 c = '2'; \
447 else if (check_color((b1), (b2), 'A', 'T') || check_color((b1), (b2), 'C', 'G')) \
448 c = '3'; \
449 else \
450 c = '4';
451
452
453 string convert_bp_to_color(const string& bp, bool remove_primer)
454 {
455 if (bp.length() <= 1)
456 return "";
457
458 char base = toupper(bp[0]);
459 string color;
460 if (!remove_primer)
461 color.push_back(base);
462
463 for (string::size_type i = 1; i < bp.length(); ++i)
464 {
465 char next = toupper(bp[i]);
466
467 char c = '0';
468 two_bps_to_color(base, next, c);
469 color.push_back(c);
470
471 base = next;
472 }
473
474 return color;
475 }
476
477 // daehwan - check this -
478 seqan::String<char> convert_bp_to_color(const seqan::String<char>& bp, bool remove_primer)
479 {
480 if (seqan::length(bp) <= 1)
481 return "";
482
483 char base = toupper(bp[0]);
484 string color;
485 if (!remove_primer)
486 color.push_back(base);
487
488 for (string::size_type i = 1; i < seqan::length(bp); ++i)
489 {
490 char next = toupper(bp[i]);
491
492 char c = '0';
493 two_bps_to_color(base, next, c);
494 color.push_back(c);
495
496 base = next;
497 }
498
499 return color;
500 }
501
502 /*
503 */
504 void BWA_decode(const string& color, const string& qual, const string& ref, string& decode)
505 {
506 assert(color.length() == ref.length() - 1);
507
508 static const size_t max_length = MAX_READ_LEN;
509 const unsigned int max_value = max_length * 0xff;
510 size_t length = color.length();
511 if (length < 1 || length + 1 > max_length)
512 {
513 return;
514 }
515
516 unsigned int f[max_length * 4];
517 char ptr[max_length * 4];
518
519 unsigned int q_prev = 0;
520 for (unsigned int i = 0; i < length + 1; ++i)
521 {
522 unsigned int q = (unsigned int) (qual.length() <= i ? 'I' : qual[i]) - 33;
523 for (unsigned int j = 0; j < 4; ++j)
524 {
525 size_t i_j = i * 4 + j;
526 if (i == 0)
527 {
528 f[i_j] = "ACGT"[j] == ref[i] ? 0 : q;
529 ptr[i_j] = 4;
530 continue;
531 }
532
533 f[i_j] = max_value;
534 char base = "ACGT"[j];
535 for (unsigned int k = 0; k < 4; ++k)
536 {
537 char base_prev = "ACGT"[k];
538 char ref_color;
539 two_bps_to_color(base_prev, base, ref_color);
540
541 char base_prev_prev = "ACGTN"[ptr[(i-1)*4 + k]];
542 char ref_color_prev;
543 two_bps_to_color(base_prev_prev, base_prev, ref_color_prev);
544
545 char color_curr = color[i-1];
546 char color_prev = i >= 2 ? color[i-2] : '4';
547
548 int q_hat = 0;
549 if (color_prev == ref_color_prev && color_prev != '4')
550 {
551 if (color_curr == ref_color)
552 q_hat = q + q_prev;
553 else
554 q_hat = q_prev - q;
555 }
556 else if (color_curr == ref_color)
557 {
558 q_hat = q - q_prev;
559 }
560
561 unsigned int f_k = f[(i-1) * 4 + k] +
562 (base == ref[i] ? 0 : q_hat) +
563 (color_curr == ref_color ? 0 : q);
564
565 if (f_k < f[i_j])
566 {
567 f[i_j] = f_k;
568 ptr[i_j] = k;
569 }
570 }
571 }
572
573 q_prev = q;
574 }
575
576 unsigned int min_index = 0;
577 unsigned int min_f = f[length * 4];
578 for (unsigned int i = 1; i < 4; ++i)
579 {
580 unsigned int temp_f = f[length * 4 + i];
581 if (temp_f < min_f)
582 {
583 min_f = temp_f;
584 min_index = i;
585 }
586 }
587
588 decode.resize(length + 1);
589 decode[length] = "ACGT"[min_index];
590 for (unsigned int i = length; i > 0; --i)
591 {
592 min_index = ptr[i * 4 + min_index];
593 decode[i-1] = "ACGT"[min_index];
594 }
595 }
596
597
598 void bam2Read(bam1_t *b, Read& rd, bool alt_name=false) {
599 GBamRecord bamrec(b);
600 rd.clear();
601 rd.seq=bamrec.seqData(&rd.qual);
602 rd.name=bam1_qname(b);
603 if (alt_name)
604 rd.alt_name=bamrec.tag_str("ZN");
605 }
606
607
608 bool ReadStream::next_read(Read& r, ReadFormat read_format) {
609 while (read_pq.size()<ReadBufSize && !r_eof) {
610 //keep the queue topped off
611 Read rf;
612 if (fstream.is_bam) {
613 if (samread(fstream.bam_file, b) < 0) {
614 r_eof=true;
615 break;
616 }
617 bam2Read(b, rf, bam_alt_name);
618 }
619 else {
620 if (!next_fastx_read(*flseqs, rf, read_format, flquals)) {
621 r_eof=true;
622 break;
623 }
624 }
625 uint64_t id = (uint64_t)atol(rf.name.c_str());
626 read_pq.push(make_pair(id, rf));
627 }
628 if (read_pq.size()==0)
629 return false;
630 const pair<uint64_t, Read>& t = read_pq.top();
631 r=t.second; //copy strings
632 //free(t.second);
633 read_pq.pop();
634 return true;
635 }
636
637 bool ReadStream::get_direct(Read& r, ReadFormat read_format) {
638 if (fstream.file==NULL) return false;
639 if (fstream.is_bam) {
640 if (samread(fstream.bam_file, b) < 0) {
641 r_eof=true;
642 return false;
643 }
644 bam2Read(b, r, bam_alt_name);
645 return true;
646 }
647 if (!next_fastx_read(*flseqs, r, read_format, flquals)) {
648 r_eof=true;
649 return false;
650 }
651 return true;
652 }
653
654 // reads must ALWAYS be requested in increasing order of their ID
655 bool ReadStream::getRead(uint64_t r_id,
656 Read& read,
657 ReadFormat read_format,
658 bool strip_slash,
659 uint64_t begin_id,
660 uint64_t end_id,
661 FILE* um_out, //unmapped reads output
662 bool um_write_found //write the found ones
663 ) {
664 if (!fstream.file)
665 err_die("Error: calling ReadStream::getRead() with no file handle!");
666 if (r_id<last_id)
667 err_die("Error: ReadStream::getRead() called with out-of-order id#!");
668 last_id=r_id;
669 bool found=false;
670 while (!found) {
671 read.clear();
672 // Get the next read from the file
673 if (!next_read(read, read_format))
674 break;
675
676 if (strip_slash) {
677 string::size_type slash = read.name.rfind("/");
678 if (slash != string::npos)
679 read.name.resize(slash);
680 }
681 uint64_t id = (uint64_t)atol(read.name.c_str());
682 if (id >= end_id)
683 return false;
684
685 if (id < begin_id)
686 continue;
687
688 if (id == r_id)
689 {
690 found=true;
691 }
692 else if (id > r_id)
693 { //can't find it, went too far
694 //only happens when reads [mates] were removed for some reason
695 read_pq.push(make_pair(id, read));
696 break;
697 }
698 if (um_out && ((um_write_found && found) ||
699 (!um_write_found && !found))) {
700 //write unmapped reads
701 fprintf(um_out, "@%s\n%s\n+\n%s\n", read.alt_name.c_str(),
702 read.seq.c_str(), read.qual.c_str());
703 }
704 } //while reads
705 return found;
706 }