ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/reads.cpp
Revision: 220
Committed: Thu Mar 22 19:33:00 2012 UTC (12 years, 5 months ago) by gpertea
File size: 18405 byte(s)
Log Message:
working, writing rejected.bam

Line User Rev File contents
1 gpertea 29 /*
2     * reads.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 9/2/48.
6     * Copyright 2448 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <iostream>
15     #include <cassert>
16     #include <string>
17     #include <algorithm>
18     #include <cstring>
19     #include <cstdlib>
20    
21     #include <seqan/find.h>
22     #include <seqan/file.h>
23     #include <seqan/modifier.h>
24    
25     #include "reads.h"
26     #include "bwt_map.h"
27     #include "tokenize.h"
28    
29     using namespace std;
30    
31     char* FLineReader::nextLine() {
32     if(!file) return NULL;
33     if (pushed) { pushed=false; return buf; }
34     //reads a char at a time until \n and/or \r are encountered
35     len=0;
36     int c=0;
37     while ((c=getc(file))!=EOF) {
38     if (len>=allocated-1) {
39     allocated+=512;
40     buf=(char*)realloc(buf,allocated);
41     }
42     if (c=='\n' || c=='\r') {
43     buf[len]='\0';
44     if (c=='\r') { //DOS file: double-char line terminator, skip the second one
45     if ((c=getc(file))!='\n')
46     ungetc(c,file);
47     }
48     lcount++;
49     return buf;
50     }
51     buf[len]=(char)c;
52     len++;
53     }
54     if (c==EOF) {
55     isEOF=true;
56     if (len==0) return NULL;
57     }
58     buf[len]='\0';
59     lcount++;
60     return buf;
61     }
62    
63     void skip_lines(FLineReader& fr)
64     {
65     if (fr.fhandle() == NULL) return;
66     char* buf = NULL;
67     while ((buf = fr.nextLine()) != NULL) {
68     if (buf[0] == '\0') continue;
69     if (buf[0] == '>' || buf[0] == '@')
70     {
71     fr.pushBack();
72     break;
73     }
74     }
75     }
76    
77     bool next_fasta_record(FLineReader& fr,
78     string& defline,
79     string& seq,
80     ReadFormat reads_format)
81    
82     {
83     seq.clear();
84     defline.clear();
85     char* buf=NULL;
86     while ((buf=fr.nextLine())!=NULL) {
87     if (buf[0]==0) continue; //skip empty lines
88     if ((reads_format == FASTA && buf[0] == '>') || (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
89     if (seq.length()>0) { //current record ending
90     fr.pushBack();
91     return true;
92     }
93     defline=buf+1;
94     string::size_type space_pos = defline.find_first_of(" \t");
95     if (space_pos != string::npos) {
96     defline.resize(space_pos);
97     }
98     continue;
99     } //defline
100     // sequence line
101     seq.append(buf);
102     } //line reading loop
103    
104     replace(seq.begin(), seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
105     return !(seq.empty());
106     }
107    
108     bool next_fastq_record(FLineReader& fr,
109     const string& seq,
110     string& alt_name,
111     string& qual,
112     ReadFormat reads_format)
113     {
114     alt_name.clear();
115     qual.clear();
116     char* fline=fr.nextLine();
117     if (fline==NULL) return false;
118     while (fline[0]==0) { //skip empty lines
119     fline=fr.nextLine();
120     if (fline==NULL) return false;
121     }
122     //must be on '+' line here
123 gpertea 135 if (fline==NULL || (reads_format == FASTQ && fline[0] != '+') ||
124     (reads_format == FASTA && quals && fline[0] != '>')) {
125 gpertea 29 err_exit("Error: '+' not found for fastq record %s\n",fline);
126     return false;
127     }
128     alt_name=fline+1;
129     string::size_type space_pos = alt_name.find_first_of(" \t");
130     if (space_pos != string::npos) alt_name.resize(space_pos);
131     //read qv line(s) now:
132     while ((fline=fr.nextLine())!=NULL) {
133     if (integer_quals)
134     {
135 gpertea 135 vector<string> integer_qual_values;
136     tokenize(string(fline), " ", integer_qual_values);
137 gpertea 29
138 gpertea 135 string temp_qual;
139     for (size_t i = 0; i < integer_qual_values.size(); ++i)
140     {
141     int qual_value = atoi(integer_qual_values[i].c_str());
142     if (qual_value < 0) qual_value = 0;
143     temp_qual.push_back((char)(qual_value + 33));
144     }
145 gpertea 29
146 gpertea 135 qual.append(temp_qual);
147 gpertea 29 }
148     else
149     qual.append(fline);
150 gpertea 135 if (qual.length()>=seq.length()-1) break;
151 gpertea 29 }
152     // final check
153 gpertea 135 if (color) {
154     if (seq.length()==qual.length()) {
155     //discard first qv
156     qual=qual.substr(1);
157 gpertea 68 }
158 gpertea 135 if (seq.length()!=qual.length()+1) {
159     err_exit("Error: length of quality string does not match seq length (%d) for color read %s!\n",
160     seq.length(), alt_name.c_str());
161     }
162     }
163     else {
164     if (seq.length()!=qual.length()) {
165 gpertea 68 err_exit("Error: qual string length (%d) differs from seq length (%d) for read %s!\n",
166 gpertea 29 qual.length(), seq.length(), alt_name.c_str());
167 gpertea 135 //return false;
168 gpertea 29 }
169 gpertea 135 }
170 gpertea 29 //
171     return !(qual.empty());
172     }
173    
174     bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format,
175     FLineReader* frq) {
176 gpertea 207 /*
177 gpertea 154 if (fr.pushed_read)
178     {
179     read = fr.last_read;
180     fr.pushed_read = false;
181     return true;
182     }
183 gpertea 207 */
184 gpertea 29 read.clear();
185     char* buf=NULL;
186     while ((buf=fr.nextLine())!=NULL) {
187     if (buf[0]==0) continue; //skip empty lines
188     if ((reads_format == FASTA && buf[0] == '>') ||
189     (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
190     if (read.seq.length()>0) { //current record ending
191     fr.pushBack();
192     break;
193     }
194     read.name=buf+1;
195     string::size_type space_pos = read.name.find_first_of(" \t");
196     if (space_pos != string::npos) {
197     read.name.resize(space_pos);
198     }
199     continue;
200     } //defline
201     // sequence line
202     read.seq.append(buf);
203     } //line reading loop
204    
205     replace(read.seq.begin(), read.seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
206     if (reads_format != FASTQ && frq==NULL)
207     return (!read.seq.empty());
208     if (frq==NULL) frq=&fr; //FASTQ
209     //FASTQ or quals in a separate file -- now read quality values
210     buf=frq->nextLine();
211     if (buf==NULL) return false;
212     while (buf[0]==0) { //skip empty lines
213     buf=frq->nextLine();
214     if (buf==NULL) return false;
215     }
216     //must be on '+' line here
217     if (buf==NULL || (reads_format == FASTQ && buf[0] != '+') ||
218     (reads_format == FASTA && buf[0] != '>')) {
219     err_exit("Error: beginning of quality values record not found! (%s)\n",buf);
220     return false;
221     }
222     read.alt_name=buf+1;
223     string::size_type space_pos = read.alt_name.find_first_of(" \t");
224     if (space_pos != string::npos) read.alt_name.resize(space_pos);
225     //read qv line(s) now:
226     while ((buf=frq->nextLine())!=NULL) {
227     if (integer_quals)
228     {
229     vector<string> integer_qual_values;
230     tokenize(string(buf), " ", integer_qual_values);
231     string temp_qual;
232     for (size_t i = 0; i < integer_qual_values.size(); ++i)
233     {
234     int qual_value = atoi(integer_qual_values[i].c_str());
235     if (qual_value < 0) qual_value = 0;
236     temp_qual.push_back((char)(qual_value + 33));
237     }
238     read.qual.append(temp_qual);
239     }
240     else {
241 gpertea 135 read.qual.append(buf);
242 gpertea 29 }
243 gpertea 135 if (read.qual.length()>=read.seq.length()-1)
244     break;
245 gpertea 29 } //while qv lines
246    
247     // final check
248 gpertea 135 if (color) {
249     if (read.seq.length()==read.qual.length()) {
250     //discard first qv
251     read.qual=read.qual.substr(1);
252 gpertea 68 }
253 gpertea 135 if (read.seq.length()!=read.qual.length()+1) {
254     err_exit("Error: length of quality string does not match sequence length (%d) for color read %s!\n",
255     read.seq.length(), read.alt_name.c_str());
256     }
257     }
258     else {
259     if (read.seq.length()!=read.qual.length()) {
260 gpertea 29 err_exit("Error: qual length (%d) differs from seq length (%d) for fastq record %s!\n",
261     read.qual.length(), read.seq.length(), read.alt_name.c_str());
262     return false;
263     }
264 gpertea 135 }
265 gpertea 154
266 gpertea 207 //fr.last_read = read;
267 gpertea 135 return !(read.seq.empty());
268 gpertea 29 }
269    
270     // This could be faster.
271     void reverse_complement(string& seq)
272     {
273     //fprintf(stderr,"fwd: %s\n", seq.c_str());
274     for (string::size_type i = 0; i < seq.length(); ++i)
275     {
276     switch(seq[i])
277     {
278     case 'A' : seq[i] = 'T'; break;
279     case 'T' : seq[i] = 'A'; break;
280     case 'C' : seq[i] = 'G'; break;
281     case 'G' : seq[i] = 'C'; break;
282     default: seq[i] = 'N'; break;
283     }
284     }
285     reverse(seq.begin(), seq.end());
286     //fprintf(stderr, "rev: %s\n", seq.c_str());
287     }
288    
289     string convert_color_to_bp(const string& color)
290     {
291     if (color.length() <= 0)
292     return "";
293    
294     char base = color[0];
295     string bp;
296     for (string::size_type i = 1; i < color.length(); ++i)
297     {
298     char next = color[i];
299     switch(base)
300     {
301     // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N',
302     case 'A':
303     {
304     switch(next)
305     {
306     case '0': next = 'A'; break;
307     case '1': next = 'C'; break;
308     case '2': next = 'G'; break;
309     case '3': next = 'T'; break;
310     default: next = 'N'; break;
311     }
312     }
313     break;
314     case 'C':
315     {
316     // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N',
317     switch(next)
318     {
319     case '0': next = 'C'; break;
320     case '1': next = 'A'; break;
321     case '2': next = 'T'; break;
322     case '3': next = 'G'; break;
323     default: next = 'N'; break;
324     }
325     }
326     break;
327     case 'G':
328     {
329     // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N',
330     switch(next)
331     {
332     case '0': next = 'G'; break;
333     case '1': next = 'T'; break;
334     case '2': next = 'A'; break;
335     case '3': next = 'C'; break;
336     default: next = 'N'; break;
337     }
338     }
339     break;
340     case 'T':
341     {
342     // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N',
343     switch(next)
344     {
345     case '0': next = 'T'; break;
346     case '1': next = 'G'; break;
347     case '2': next = 'C'; break;
348     case '3': next = 'A'; break;
349     default: next = 'N'; break;
350     }
351     }
352     break;
353     default: next = 'N'; break;
354     }
355    
356     bp.push_back(next);
357     base = next;
358     }
359    
360     return bp;
361     }
362    
363     // daehwan - reduce code redundancy!
364     seqan::String<char> convert_color_to_bp(char base, const seqan::String<char>& color)
365     {
366     if (seqan::length(color) <= 0)
367     return "";
368    
369     string bp;
370     for (string::size_type i = 0; i < seqan::length(color); ++i)
371     {
372     char next = color[i];
373     switch(base)
374     {
375     // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N',
376     case 'A':
377     {
378     switch(next)
379     {
380     case '0': next = 'A'; break;
381     case '1': next = 'C'; break;
382     case '2': next = 'G'; break;
383     case '3': next = 'T'; break;
384     default: next = 'N'; break;
385     }
386     }
387     break;
388     case 'C':
389     {
390     // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N',
391     switch(next)
392     {
393     case '0': next = 'C'; break;
394     case '1': next = 'A'; break;
395     case '2': next = 'T'; break;
396     case '3': next = 'G'; break;
397     default: next = 'N'; break;
398     }
399     }
400     break;
401     case 'G':
402     {
403     // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N',
404     switch(next)
405     {
406     case '0': next = 'G'; break;
407     case '1': next = 'T'; break;
408     case '2': next = 'A'; break;
409     case '3': next = 'C'; break;
410     default: next = 'N'; break;
411     }
412     }
413     break;
414     case 'T':
415     {
416     // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N',
417     switch(next)
418     {
419     case '0': next = 'T'; break;
420     case '1': next = 'G'; break;
421     case '2': next = 'C'; break;
422     case '3': next = 'A'; break;
423     default: next = 'N'; break;
424     }
425     }
426     break;
427     default: next = 'N'; break;
428     }
429    
430     bp.push_back(next);
431     base = next;
432     }
433    
434     return bp;
435     }
436    
437    
438     #define check_color(b1, b2, c1, c2) ((b1 == c1 && b2 == c2) || (b1 == c2 && b2 == c1))
439    
440     #define two_bps_to_color(b1, b2, c) \
441     if (((b1) == 'A' || (b1) == 'G' || (b1) == 'C' || (b1) == 'T') && (b1) == (b2)) \
442     c = '0'; \
443     else if (check_color((b1), (b2), 'A', 'C') || check_color((b1), (b2), 'G', 'T')) \
444     c = '1'; \
445     else if (check_color((b1), (b2), 'A', 'G') || check_color((b1), (b2), 'C', 'T')) \
446     c = '2'; \
447     else if (check_color((b1), (b2), 'A', 'T') || check_color((b1), (b2), 'C', 'G')) \
448     c = '3'; \
449     else \
450     c = '4';
451    
452    
453     string convert_bp_to_color(const string& bp, bool remove_primer)
454     {
455     if (bp.length() <= 1)
456     return "";
457    
458     char base = toupper(bp[0]);
459     string color;
460     if (!remove_primer)
461     color.push_back(base);
462    
463     for (string::size_type i = 1; i < bp.length(); ++i)
464     {
465     char next = toupper(bp[i]);
466    
467     char c = '0';
468     two_bps_to_color(base, next, c);
469     color.push_back(c);
470    
471     base = next;
472     }
473    
474     return color;
475     }
476    
477     // daehwan - check this -
478     seqan::String<char> convert_bp_to_color(const seqan::String<char>& bp, bool remove_primer)
479     {
480     if (seqan::length(bp) <= 1)
481     return "";
482    
483     char base = toupper(bp[0]);
484     string color;
485     if (!remove_primer)
486     color.push_back(base);
487    
488     for (string::size_type i = 1; i < seqan::length(bp); ++i)
489     {
490     char next = toupper(bp[i]);
491    
492     char c = '0';
493     two_bps_to_color(base, next, c);
494     color.push_back(c);
495    
496     base = next;
497     }
498    
499     return color;
500     }
501    
502     /*
503     */
504     void BWA_decode(const string& color, const string& qual, const string& ref, string& decode)
505     {
506     assert(color.length() == ref.length() - 1);
507    
508 gpertea 200 static const size_t max_length = MAX_READ_LEN;
509 gpertea 29 const unsigned int max_value = max_length * 0xff;
510     size_t length = color.length();
511     if (length < 1 || length + 1 > max_length)
512     {
513     return;
514     }
515    
516     unsigned int f[max_length * 4];
517     char ptr[max_length * 4];
518    
519     unsigned int q_prev = 0;
520     for (unsigned int i = 0; i < length + 1; ++i)
521     {
522     unsigned int q = (unsigned int) (qual.length() <= i ? 'I' : qual[i]) - 33;
523     for (unsigned int j = 0; j < 4; ++j)
524     {
525     size_t i_j = i * 4 + j;
526     if (i == 0)
527     {
528     f[i_j] = "ACGT"[j] == ref[i] ? 0 : q;
529     ptr[i_j] = 4;
530     continue;
531     }
532    
533     f[i_j] = max_value;
534     char base = "ACGT"[j];
535     for (unsigned int k = 0; k < 4; ++k)
536     {
537     char base_prev = "ACGT"[k];
538     char ref_color;
539     two_bps_to_color(base_prev, base, ref_color);
540    
541     char base_prev_prev = "ACGTN"[ptr[(i-1)*4 + k]];
542     char ref_color_prev;
543     two_bps_to_color(base_prev_prev, base_prev, ref_color_prev);
544    
545     char color_curr = color[i-1];
546     char color_prev = i >= 2 ? color[i-2] : '4';
547    
548     int q_hat = 0;
549     if (color_prev == ref_color_prev && color_prev != '4')
550     {
551     if (color_curr == ref_color)
552     q_hat = q + q_prev;
553     else
554     q_hat = q_prev - q;
555     }
556     else if (color_curr == ref_color)
557     {
558     q_hat = q - q_prev;
559     }
560    
561     unsigned int f_k = f[(i-1) * 4 + k] +
562     (base == ref[i] ? 0 : q_hat) +
563     (color_curr == ref_color ? 0 : q);
564    
565     if (f_k < f[i_j])
566     {
567     f[i_j] = f_k;
568     ptr[i_j] = k;
569     }
570     }
571     }
572    
573     q_prev = q;
574     }
575    
576     unsigned int min_index = 0;
577     unsigned int min_f = f[length * 4];
578     for (unsigned int i = 1; i < 4; ++i)
579     {
580     unsigned int temp_f = f[length * 4 + i];
581     if (temp_f < min_f)
582     {
583     min_f = temp_f;
584     min_index = i;
585     }
586     }
587    
588     decode.resize(length + 1);
589     decode[length] = "ACGT"[min_index];
590     for (unsigned int i = length; i > 0; --i)
591     {
592     min_index = ptr[i * 4 + min_index];
593     decode[i-1] = "ACGT"[min_index];
594     }
595     }
596    
597 gpertea 135
598 gpertea 215 void bam2Read(bam1_t *b, Read& rd, bool alt_name=false) {
599 gpertea 207 GBamRecord bamrec(b);
600     rd.clear();
601 gpertea 210 rd.seq=bamrec.seqData(&rd.qual);
602     rd.name=bam1_qname(b);
603 gpertea 215 if (alt_name)
604 gpertea 217 rd.alt_name=bamrec.tag_str("ZN");
605 gpertea 207 }
606    
607    
608 gpertea 219 bool ReadStream::next_read(QReadData& rdata, ReadFormat read_format) {
609 gpertea 207 while (read_pq.size()<ReadBufSize && !r_eof) {
610 gpertea 135 //keep the queue topped off
611     Read rf;
612 gpertea 219 if (get_direct(rf, read_format)) {
613     uint64_t id = (uint64_t)atol(rf.name.c_str());
614     QReadData rdata(id, rf, last_b());
615     read_pq.push(rdata);
616 gpertea 210 }
617     }
618 gpertea 135 if (read_pq.size()==0)
619     return false;
620 gpertea 219 const QReadData& t = read_pq.top();
621     rdata=t; //copy strings and duplicate b pointer!
622 gpertea 135 read_pq.pop();
623     return true;
624     }
625    
626 gpertea 207 bool ReadStream::get_direct(Read& r, ReadFormat read_format) {
627 gpertea 211 if (fstream.file==NULL) return false;
628 gpertea 219 if (fstream.is_bam) {
629     bool got_read=false;
630     while (!got_read) {
631     if (samread(fstream.bam_file, b) < 0) {
632     r_eof=true;
633     return false;
634 gpertea 207 }
635 gpertea 219 else {
636     if (bam_ignoreQC || (b->core.flag & BAM_FQCFAIL)==0)
637     got_read=true;
638     }
639     }
640 gpertea 215 bam2Read(b, r, bam_alt_name);
641     return true;
642 gpertea 219 }
643 gpertea 207 if (!next_fastx_read(*flseqs, r, read_format, flquals)) {
644     r_eof=true;
645     return false;
646     }
647     return true;
648     }
649    
650 gpertea 217 // reads must ALWAYS be requested in increasing order of their ID
651 gpertea 135 bool ReadStream::getRead(uint64_t r_id,
652 gpertea 154 Read& read,
653     ReadFormat read_format,
654     bool strip_slash,
655 gpertea 159 uint64_t begin_id,
656     uint64_t end_id,
657 gpertea 219 GBamWriter* um_out, //unmapped reads output
658 gpertea 159 bool um_write_found //write the found ones
659     ) {
660 gpertea 135 if (!fstream.file)
661     err_die("Error: calling ReadStream::getRead() with no file handle!");
662     if (r_id<last_id)
663     err_die("Error: ReadStream::getRead() called with out-of-order id#!");
664     last_id=r_id;
665     bool found=false;
666 gpertea 219 read.clear();
667 gpertea 135 while (!found) {
668 gpertea 219 QReadData rdata;
669     if (!next_read(rdata, read_format))
670 gpertea 135 break;
671 gpertea 219 /*
672 gpertea 135 if (strip_slash) {
673 gpertea 219 string::size_type slash = rdata.read.name.rfind("/");
674 gpertea 135 if (slash != string::npos)
675 gpertea 219 rdata.read.name.resize(slash);
676 gpertea 135 }
677 gpertea 154 uint64_t id = (uint64_t)atol(read.name.c_str());
678 gpertea 219 */
679     if (rdata.id >= end_id)
680 gpertea 154 return false;
681 gpertea 135
682 gpertea 219 if (rdata.id < begin_id)
683 gpertea 154 continue;
684 gpertea 135
685 gpertea 219 if (rdata.id == r_id)
686 gpertea 135 {
687 gpertea 219 read=rdata.read;
688 gpertea 163 found=true;
689 gpertea 135 }
690 gpertea 219 else if (rdata.id > r_id)
691 gpertea 163 { //can't find it, went too far
692     //only happens when reads [mates] were removed for some reason
693 gpertea 219 //read_pq.push(make_pair(id, read));
694     read_pq.push(rdata);
695 gpertea 159 break;
696 gpertea 135 }
697 gpertea 163 if (um_out && ((um_write_found && found) ||
698     (!um_write_found && !found))) {
699 gpertea 219 //write unmapped reads
700     //fprintf(um_out, "@%s\n%s\n+\n%s\n", read.alt_name.c_str(),
701     // read.seq.c_str(), read.qual.c_str());
702     string rname(rdata.read.alt_name);
703    
704     GBamRecord bamrec(rname.c_str(), -1, 0, false, rdata.read.seq.c_str(),
705     NULL, rdata.read.qual.c_str());
706 gpertea 220 if (rdata.matenum) {
707     bamrec.set_flag(BAM_FPAIRED);
708     if (rdata.matenum==1) bamrec.set_flag(BAM_FREAD1);
709 gpertea 219 else bamrec.set_flag(BAM_FREAD2);
710     }
711     if (rdata.trashCode) {
712     if (rdata.trashCode!='M') {
713     //multi-mapped reads did not really QC-fail
714     bamrec.set_flag(BAM_FQCFAIL);
715     }
716     bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&rdata.trashCode);
717     }
718     um_out->write(&bamrec);
719 gpertea 135 }
720     } //while reads
721     return found;
722     }