ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/reads.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (12 years, 8 months ago) by gpertea
File size: 18715 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line File contents
1 /*
2 * reads.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 9/2/48.
6 * Copyright 2448 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <iostream>
15 #include <cassert>
16 #include <string>
17 #include <algorithm>
18 #include <cstring>
19 #include <cstdlib>
20
21 #include <seqan/find.h>
22 #include <seqan/file.h>
23 #include <seqan/modifier.h>
24
25 #include "reads.h"
26 #include "bwt_map.h"
27 #include "tokenize.h"
28
29 using namespace std;
30
31 char* FLineReader::nextLine() {
32 if(!file) return NULL;
33 if (pushed) { pushed=false; return buf; }
34 //reads a char at a time until \n and/or \r are encountered
35 len=0;
36 int c=0;
37 while ((c=getc(file))!=EOF) {
38 if (len>=allocated-1) {
39 allocated+=512;
40 buf=(char*)realloc(buf,allocated);
41 }
42 if (c=='\n' || c=='\r') {
43 buf[len]='\0';
44 if (c=='\r') { //DOS file: double-char line terminator, skip the second one
45 if ((c=getc(file))!='\n')
46 ungetc(c,file);
47 }
48 lcount++;
49 return buf;
50 }
51 buf[len]=(char)c;
52 len++;
53 }
54 if (c==EOF) {
55 isEOF=true;
56 if (len==0) return NULL;
57 }
58 buf[len]='\0';
59 lcount++;
60 return buf;
61 }
62
63 void skip_lines(FLineReader& fr)
64 {
65 if (fr.fhandle() == NULL) return;
66 char* buf = NULL;
67 while ((buf = fr.nextLine()) != NULL) {
68 if (buf[0] == '\0') continue;
69 if (buf[0] == '>' || buf[0] == '@')
70 {
71 fr.pushBack();
72 break;
73 }
74 }
75 }
76
77 bool next_fasta_record(FLineReader& fr,
78 string& defline,
79 string& seq,
80 ReadFormat reads_format)
81
82 {
83 seq.clear();
84 defline.clear();
85 char* buf=NULL;
86 while ((buf=fr.nextLine())!=NULL) {
87 if (buf[0]==0) continue; //skip empty lines
88 if ((reads_format == FASTA && buf[0] == '>') || (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
89 if (seq.length()>0) { //current record ending
90 fr.pushBack();
91 return true;
92 }
93 defline=buf+1;
94 string::size_type space_pos = defline.find_first_of(" \t");
95 if (space_pos != string::npos) {
96 defline.resize(space_pos);
97 }
98 continue;
99 } //defline
100 // sequence line
101 seq.append(buf);
102 } //line reading loop
103
104 replace(seq.begin(), seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
105 return !(seq.empty());
106 }
107
108 bool next_fastq_record(FLineReader& fr,
109 const string& seq,
110 string& alt_name,
111 string& qual,
112 ReadFormat reads_format)
113 {
114 alt_name.clear();
115 qual.clear();
116 char* fline=fr.nextLine();
117 if (fline==NULL) return false;
118 while (fline[0]==0) { //skip empty lines
119 fline=fr.nextLine();
120 if (fline==NULL) return false;
121 }
122 //must be on '+' line here
123 if (fline==NULL || (reads_format == FASTQ && fline[0] != '+') ||
124 (reads_format == FASTA && quals && fline[0] != '>')) {
125 err_exit("Error: '+' not found for fastq record %s\n",fline);
126 return false;
127 }
128 alt_name=fline+1;
129 string::size_type space_pos = alt_name.find_first_of(" \t");
130 if (space_pos != string::npos) alt_name.resize(space_pos);
131 //read qv line(s) now:
132 while ((fline=fr.nextLine())!=NULL) {
133 if (integer_quals)
134 {
135 vector<string> integer_qual_values;
136 tokenize(string(fline), " ", integer_qual_values);
137
138 string temp_qual;
139 for (size_t i = 0; i < integer_qual_values.size(); ++i)
140 {
141 int qual_value = atoi(integer_qual_values[i].c_str());
142 if (qual_value < 0) qual_value = 0;
143 temp_qual.push_back((char)(qual_value + 33));
144 }
145
146 qual.append(temp_qual);
147 }
148 else
149 qual.append(fline);
150 if (qual.length()>=seq.length()-1) break;
151 }
152 // final check
153 if (color) {
154 if (seq.length()==qual.length()) {
155 //discard first qv
156 qual=qual.substr(1);
157 }
158 if (seq.length()!=qual.length()+1) {
159 err_exit("Error: length of quality string does not match seq length (%d) for color read %s!\n",
160 seq.length(), alt_name.c_str());
161 }
162 }
163 else {
164 if (seq.length()!=qual.length()) {
165 err_exit("Error: qual string length (%d) differs from seq length (%d) for read %s!\n",
166 qual.length(), seq.length(), alt_name.c_str());
167 //return false;
168 }
169 }
170 //
171 return !(qual.empty());
172 }
173
174 bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format,
175 FLineReader* frq) {
176 read.clear();
177 char* buf=NULL;
178 while ((buf=fr.nextLine())!=NULL) {
179 if (buf[0]==0) continue; //skip empty lines
180 if ((reads_format == FASTA && buf[0] == '>') ||
181 (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
182 if (read.seq.length()>0) { //current record ending
183 fr.pushBack();
184 break;
185 }
186 read.name=buf+1;
187 string::size_type space_pos = read.name.find_first_of(" \t");
188 if (space_pos != string::npos) {
189 read.name.resize(space_pos);
190 }
191 continue;
192 } //defline
193 // sequence line
194 read.seq.append(buf);
195 } //line reading loop
196
197 replace(read.seq.begin(), read.seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
198 if (reads_format != FASTQ && frq==NULL)
199 return (!read.seq.empty());
200 if (frq==NULL) frq=&fr; //FASTQ
201 //FASTQ or quals in a separate file -- now read quality values
202 buf=frq->nextLine();
203 if (buf==NULL) return false;
204 while (buf[0]==0) { //skip empty lines
205 buf=frq->nextLine();
206 if (buf==NULL) return false;
207 }
208 //must be on '+' line here
209 if (buf==NULL || (reads_format == FASTQ && buf[0] != '+') ||
210 (reads_format == FASTA && buf[0] != '>')) {
211 err_exit("Error: beginning of quality values record not found! (%s)\n",buf);
212 return false;
213 }
214 read.alt_name=buf+1;
215 string::size_type space_pos = read.alt_name.find_first_of(" \t");
216 if (space_pos != string::npos) read.alt_name.resize(space_pos);
217 //read qv line(s) now:
218 while ((buf=frq->nextLine())!=NULL) {
219 if (integer_quals)
220 {
221 vector<string> integer_qual_values;
222 tokenize(string(buf), " ", integer_qual_values);
223 string temp_qual;
224 for (size_t i = 0; i < integer_qual_values.size(); ++i)
225 {
226 int qual_value = atoi(integer_qual_values[i].c_str());
227 if (qual_value < 0) qual_value = 0;
228 temp_qual.push_back((char)(qual_value + 33));
229 }
230 read.qual.append(temp_qual);
231 }
232 else {
233 read.qual.append(buf);
234 }
235 if (read.qual.length()>=read.seq.length()-1)
236 break;
237 } //while qv lines
238
239 // final check
240 if (color) {
241 if (read.seq.length()==read.qual.length()) {
242 //discard first qv
243 read.qual=read.qual.substr(1);
244 }
245 if (read.seq.length()!=read.qual.length()+1) {
246 err_exit("Error: length of quality string does not match sequence length (%d) for color read %s!\n",
247 read.seq.length(), read.alt_name.c_str());
248 }
249 }
250 else {
251 if (read.seq.length()!=read.qual.length()) {
252 err_exit("Error: qual length (%d) differs from seq length (%d) for fastq record %s!\n",
253 read.qual.length(), read.seq.length(), read.alt_name.c_str());
254 return false;
255 }
256 }
257 return !(read.seq.empty());
258 }
259
260 // This could be faster.
261 void reverse_complement(string& seq)
262 {
263 //fprintf(stderr,"fwd: %s\n", seq.c_str());
264 for (string::size_type i = 0; i < seq.length(); ++i)
265 {
266 switch(seq[i])
267 {
268 case 'A' : seq[i] = 'T'; break;
269 case 'T' : seq[i] = 'A'; break;
270 case 'C' : seq[i] = 'G'; break;
271 case 'G' : seq[i] = 'C'; break;
272 default: seq[i] = 'N'; break;
273 }
274 }
275 reverse(seq.begin(), seq.end());
276 //fprintf(stderr, "rev: %s\n", seq.c_str());
277 }
278
279 string convert_color_to_bp(const string& color)
280 {
281 if (color.length() <= 0)
282 return "";
283
284 char base = color[0];
285 string bp;
286 for (string::size_type i = 1; i < color.length(); ++i)
287 {
288 char next = color[i];
289 switch(base)
290 {
291 // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N',
292 case 'A':
293 {
294 switch(next)
295 {
296 case '0': next = 'A'; break;
297 case '1': next = 'C'; break;
298 case '2': next = 'G'; break;
299 case '3': next = 'T'; break;
300 default: next = 'N'; break;
301 }
302 }
303 break;
304 case 'C':
305 {
306 // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N',
307 switch(next)
308 {
309 case '0': next = 'C'; break;
310 case '1': next = 'A'; break;
311 case '2': next = 'T'; break;
312 case '3': next = 'G'; break;
313 default: next = 'N'; break;
314 }
315 }
316 break;
317 case 'G':
318 {
319 // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N',
320 switch(next)
321 {
322 case '0': next = 'G'; break;
323 case '1': next = 'T'; break;
324 case '2': next = 'A'; break;
325 case '3': next = 'C'; break;
326 default: next = 'N'; break;
327 }
328 }
329 break;
330 case 'T':
331 {
332 // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N',
333 switch(next)
334 {
335 case '0': next = 'T'; break;
336 case '1': next = 'G'; break;
337 case '2': next = 'C'; break;
338 case '3': next = 'A'; break;
339 default: next = 'N'; break;
340 }
341 }
342 break;
343 default: next = 'N'; break;
344 }
345
346 bp.push_back(next);
347 base = next;
348 }
349
350 return bp;
351 }
352
353 // daehwan - reduce code redundancy!
354 seqan::String<char> convert_color_to_bp(char base, const seqan::String<char>& color)
355 {
356 if (seqan::length(color) <= 0)
357 return "";
358
359 string bp;
360 for (string::size_type i = 0; i < seqan::length(color); ++i)
361 {
362 char next = color[i];
363 switch(base)
364 {
365 // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N',
366 case 'A':
367 {
368 switch(next)
369 {
370 case '0': next = 'A'; break;
371 case '1': next = 'C'; break;
372 case '2': next = 'G'; break;
373 case '3': next = 'T'; break;
374 default: next = 'N'; break;
375 }
376 }
377 break;
378 case 'C':
379 {
380 // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N',
381 switch(next)
382 {
383 case '0': next = 'C'; break;
384 case '1': next = 'A'; break;
385 case '2': next = 'T'; break;
386 case '3': next = 'G'; break;
387 default: next = 'N'; break;
388 }
389 }
390 break;
391 case 'G':
392 {
393 // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N',
394 switch(next)
395 {
396 case '0': next = 'G'; break;
397 case '1': next = 'T'; break;
398 case '2': next = 'A'; break;
399 case '3': next = 'C'; break;
400 default: next = 'N'; break;
401 }
402 }
403 break;
404 case 'T':
405 {
406 // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N',
407 switch(next)
408 {
409 case '0': next = 'T'; break;
410 case '1': next = 'G'; break;
411 case '2': next = 'C'; break;
412 case '3': next = 'A'; break;
413 default: next = 'N'; break;
414 }
415 }
416 break;
417 default: next = 'N'; break;
418 }
419
420 bp.push_back(next);
421 base = next;
422 }
423
424 return bp;
425 }
426
427
428 #define check_color(b1, b2, c1, c2) ((b1 == c1 && b2 == c2) || (b1 == c2 && b2 == c1))
429
430 #define two_bps_to_color(b1, b2, c) \
431 if (((b1) == 'A' || (b1) == 'G' || (b1) == 'C' || (b1) == 'T') && (b1) == (b2)) \
432 c = '0'; \
433 else if (check_color((b1), (b2), 'A', 'C') || check_color((b1), (b2), 'G', 'T')) \
434 c = '1'; \
435 else if (check_color((b1), (b2), 'A', 'G') || check_color((b1), (b2), 'C', 'T')) \
436 c = '2'; \
437 else if (check_color((b1), (b2), 'A', 'T') || check_color((b1), (b2), 'C', 'G')) \
438 c = '3'; \
439 else \
440 c = '4';
441
442
443 string convert_bp_to_color(const string& bp, bool remove_primer)
444 {
445 if (bp.length() <= 1)
446 return "";
447
448 char base = toupper(bp[0]);
449 string color;
450 if (!remove_primer)
451 color.push_back(base);
452
453 for (string::size_type i = 1; i < bp.length(); ++i)
454 {
455 char next = toupper(bp[i]);
456
457 char c = '0';
458 two_bps_to_color(base, next, c);
459 color.push_back(c);
460
461 base = next;
462 }
463
464 return color;
465 }
466
467 // daehwan - check this -
468 seqan::String<char> convert_bp_to_color(const seqan::String<char>& bp, bool remove_primer)
469 {
470 if (seqan::length(bp) <= 1)
471 return "";
472
473 char base = toupper(bp[0]);
474 string color;
475 if (!remove_primer)
476 color.push_back(base);
477
478 for (string::size_type i = 1; i < seqan::length(bp); ++i)
479 {
480 char next = toupper(bp[i]);
481
482 char c = '0';
483 two_bps_to_color(base, next, c);
484 color.push_back(c);
485
486 base = next;
487 }
488
489 return color;
490 }
491
492 /*
493 */
494 void BWA_decode(const string& color, const string& qual, const string& ref, string& decode)
495 {
496 assert(color.length() == ref.length() - 1);
497
498 const size_t max_length = 256;
499 const unsigned int max_value = max_length * 0xff;
500 size_t length = color.length();
501 if (length < 1 || length + 1 > max_length)
502 {
503 return;
504 }
505
506 unsigned int f[max_length * 4];
507 char ptr[max_length * 4];
508
509 unsigned int q_prev = 0;
510 for (unsigned int i = 0; i < length + 1; ++i)
511 {
512 unsigned int q = (unsigned int) (qual.length() <= i ? 'I' : qual[i]) - 33;
513 for (unsigned int j = 0; j < 4; ++j)
514 {
515 size_t i_j = i * 4 + j;
516 if (i == 0)
517 {
518 f[i_j] = "ACGT"[j] == ref[i] ? 0 : q;
519 ptr[i_j] = 4;
520 continue;
521 }
522
523 f[i_j] = max_value;
524 char base = "ACGT"[j];
525 for (unsigned int k = 0; k < 4; ++k)
526 {
527 char base_prev = "ACGT"[k];
528 char ref_color;
529 two_bps_to_color(base_prev, base, ref_color);
530
531 char base_prev_prev = "ACGTN"[ptr[(i-1)*4 + k]];
532 char ref_color_prev;
533 two_bps_to_color(base_prev_prev, base_prev, ref_color_prev);
534
535 char color_curr = color[i-1];
536 char color_prev = i >= 2 ? color[i-2] : '4';
537
538 int q_hat = 0;
539 if (color_prev == ref_color_prev && color_prev != '4')
540 {
541 if (color_curr == ref_color)
542 q_hat = q + q_prev;
543 else
544 q_hat = q_prev - q;
545 }
546 else if (color_curr == ref_color)
547 {
548 q_hat = q - q_prev;
549 }
550
551 unsigned int f_k = f[(i-1) * 4 + k] +
552 (base == ref[i] ? 0 : q_hat) +
553 (color_curr == ref_color ? 0 : q);
554
555 if (f_k < f[i_j])
556 {
557 f[i_j] = f_k;
558 ptr[i_j] = k;
559 }
560 }
561 }
562
563 q_prev = q;
564 }
565
566 unsigned int min_index = 0;
567 unsigned int min_f = f[length * 4];
568 for (unsigned int i = 1; i < 4; ++i)
569 {
570 unsigned int temp_f = f[length * 4 + i];
571 if (temp_f < min_f)
572 {
573 min_f = temp_f;
574 min_index = i;
575 }
576 }
577
578 decode.resize(length + 1);
579 decode[length] = "ACGT"[min_index];
580 for (unsigned int i = length; i > 0; --i)
581 {
582 min_index = ptr[i * 4 + min_index];
583 decode[i-1] = "ACGT"[min_index];
584 }
585 }
586
587
588 bool ReadStream::next_read(Read& r, ReadFormat read_format) {
589 FLineReader fr(fstream.file);
590 while (read_pq.size()<100000 && !r_eof) {
591 //keep the queue topped off
592 Read rf;
593 if (!next_fastx_read(fr, rf, read_format)) {
594 r_eof=true;
595 break;
596 }
597 //Read read=Read(rf);
598 uint64_t id = (uint64_t)atol(rf.name.c_str());
599 read_pq.push(make_pair(id, rf));
600 }
601 if (read_pq.size()==0)
602 return false;
603 const pair<uint64_t, Read>& t = read_pq.top();
604 r=t.second; //copy strings
605 //free(t.second);
606 read_pq.pop();
607 return true;
608 }
609
610 // reads must ALWAYS requested in increasing order of their ID
611 bool ReadStream::getRead(uint64_t r_id,
612 Read& read,
613 ReadFormat read_format,
614 bool strip_slash,
615 FILE* um_out, //unmapped reads output
616 bool um_write_found) {
617 if (!fstream.file)
618 err_die("Error: calling ReadStream::getRead() with no file handle!");
619 if (r_id<last_id)
620 err_die("Error: ReadStream::getRead() called with out-of-order id#!");
621 last_id=r_id;
622 bool found=false;
623 while (!found) {
624 read.clear();
625 // Get the next read from the file
626 if (!next_read(read, read_format))
627 break;
628 if (strip_slash) {
629 string::size_type slash = read.name.rfind("/");
630 if (slash != string::npos)
631 read.name.resize(slash);
632 }
633 if ((uint64_t)atoi(read.name.c_str()) == r_id) {
634 found=true;
635 }
636 if (um_out && (um_write_found || !found)) {
637 //write unmapped reads
638 fprintf(um_out, "@%s\n%s\n+\n%s\n", read.alt_name.c_str(),
639 read.seq.c_str(), read.qual.c_str());
640 }
641 //rt.get_id(read.name, ref_str);
642 } //while reads
643 return found;
644 }
645
646
647 bool get_read_from_stream(uint64_t insert_id,
648 FILE* reads_file,
649 ReadFormat reads_format,
650 bool strip_slash,
651 Read& read,
652 FILE* um_out,
653 bool um_write_found
654 ) {
655 FLineReader fr(reads_file);
656 bool found=false;
657 while(!found && !fr.isEof())
658 {
659 read.clear();
660 // Get the next read from the file
661 if (!next_fastx_read(fr, read, reads_format))
662 break;
663 if (strip_slash)
664 {
665 string::size_type slash = read.name.rfind("/");
666 if (slash != string::npos)
667 read.name.resize(slash);
668 }
669 if ((uint64_t)atoi(read.name.c_str()) == insert_id)
670 {
671 //return true;
672 found=true;
673 }
674 if (um_out && (um_write_found || !found)) {
675 //write unmapped reads
676 fprintf(um_out, "@%s\n%s\n+\n%s\n", read.alt_name.c_str(),
677 read.seq.c_str(), read.qual.c_str());
678 }
679 //rt.get_id(read.name, ref_str);
680 } //while reads
681 return found;
682 }
683
684
685 bool get_read_from_stream(uint64_t insert_id,
686 FILE* reads_file,
687 ReadFormat reads_format,
688 bool strip_slash,
689 char read_name [],
690 char read_seq [],
691 char read_alt_name [],
692 char read_qual [],
693 FILE* um_out)
694 {
695 Read read;
696 FLineReader fr(reads_file);
697 while(!fr.isEof())
698 {
699 read.clear();
700
701 // Get the next read from the file
702 if (!next_fastx_read(fr, read, reads_format))
703 break;
704
705 if (strip_slash)
706 {
707 string::size_type slash = read.name.rfind("/");
708 if (slash != string::npos)
709 read.name.resize(slash);
710 }
711
712 if ((uint64_t)atoi(read.name.c_str()) == insert_id)
713 {
714 if (read_name) strcpy(read_name, read.name.c_str());
715 if (read_seq) strcpy(read_seq, read.seq.c_str());
716 if (read_alt_name) strcpy(read_alt_name, read.alt_name.c_str());
717 if (read_qual) strcpy(read_qual, read.qual.c_str());
718 return true;
719 }
720 else if (um_out!=NULL) {
721 //write unmapped reads
722 fprintf(um_out, "@%s\n%s\n+\n%s\n", read.alt_name.c_str(), read.seq.c_str(),
723 read.qual.c_str());
724 }
725
726 //rt.get_id(read.name, ref_str);
727 } //while reads
728
729 return false;
730 }