ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
Revision: 244
Committed: Wed May 16 20:38:28 2012 UTC (12 years, 3 months ago) by gpertea
File size: 18450 byte(s)
Log Message:
seems to work, before sync

Line File contents
1 /*
2 * prep_reads.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 9/2/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 * Derived from maq "catfilter", by Heng Li at Sanger
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <stdio.h>
15 #include <cassert>
16 #include <vector>
17 #include <cstring>
18 #include <cstdlib>
19
20 #include "common.h"
21 #include "reads.h"
22 #include "tokenize.h"
23 #include "qual.h"
24
25 using namespace std;
26
27 void format_qual_string(string& qual_str)
28 {
29 for (size_t i = 0; i < qual_str.size(); ++i)
30 {
31 qual_str[i] = charToPhred33(qual_str[i],
32 solexa_quals,
33 phred64_quals);
34 }
35 }
36
37 vector<string> flt_reads_fnames;
38 bool readmap_loaded=false;
39 vector<bool> readmap; //for filter_multihits
40 vector<bool> mate_readmap; //for filter_multihits
41
42 void load_readmap(string& flt_fname, vector<bool>& rmap) {
43 //readmap_loaded=false;
44 if (flt_fname.empty()) return;
45 ReadStream rdstream(flt_fname, NULL, true);
46 Read read;
47 while (rdstream.get_direct(read, reads_format)) {
48 uint32_t rnum=(uint32_t)atol(read.name.c_str());
49 if (rnum>=(uint32_t) rmap.size())
50 rmap.resize(rnum+1, false);
51 rmap[rnum] = true;
52 }
53 readmap_loaded=true;
54 }
55
56 bool check_readmap(vector<bool>& rmap, uint32_t read_id) {
57 if (read_id>=rmap.size()) return false;
58 else return rmap[read_id];
59 }
60
61 void flt_reads_and_hits(vector<string>& reads_files) {
62 if (!readmap_loaded)
63 err_die("Error: filtering reads not enabled, aborting.");
64 if (aux_outfile.empty())
65 err_die("Error: auxiliary output file not provided.");
66 // -- filter mappings:
67 string fext=getFext(flt_mappings);
68 if (fext=="bam") {
69 samfile_t* fbam=samopen(flt_mappings.c_str(), "rb", 0);
70 if (fbam==NULL)
71 err_die("Error opening BAM file %s!\n", flt_mappings.c_str());
72 bam1_t *b = bam_init1();
73 string aux_index(aux_outfile);
74 aux_index+=".index";
75 GBamWriter wbam(aux_outfile.c_str(), fbam->header, aux_index);
76 while (samread(fbam, b) > 0) {
77 char* rname = bam1_qname(b);
78 uint32_t rid=(uint32_t)atol(rname);
79 if (check_readmap(readmap, rid)) {
80 //write this mapping into the output file
81 wbam.write(b, rid);
82 }
83 }
84 bam_destroy1(b);
85 samclose(fbam);
86 }
87 else {
88 bool is_sam=false;
89 string s(flt_mappings);
90 for(size_t i=0; i!=s.length(); ++i)
91 s[i] = std::tolower(s[i]);
92 if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
93 string unzcmd=getUnpackCmd(flt_mappings);
94 FZPipe hitsfile(flt_mappings, unzcmd);
95 FLineReader fh(hitsfile);
96 FZPipe outfile;
97 outfile.openWrite(aux_outfile.c_str(), zpacker);
98 if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
99 const char* line;
100 while ((line=fh.nextLine())) {
101 if (is_sam && line[0]=='@') {
102 //copy the header
103 fprintf(outfile.file, "%s\n", line);
104 continue;
105 }
106 char* tab=strchr((char*)line, '\t');
107 if (tab==NULL)
108 err_die("Error: cannot find tab character in %s mappings line:\n%s",
109 flt_mappings.c_str(),line);
110 *tab=0;
111 uint32_t rid = (uint32_t) atol(line);
112 if (rid==0)
113 err_die("Error: invalid read ID (%s) parsed from mapping file %s",
114 line, flt_mappings.c_str());
115 *tab='\t';
116 if (check_readmap(readmap, rid)) {
117 fprintf(outfile.file, "%s\n", line);
118 }
119 }
120 outfile.close();
121 hitsfile.close();
122 }
123 // -- now filter reads
124 //FILE* findex = NULL;
125 GBamWriter* wbam=NULL;
126 FILE* fout=NULL;
127 if (std_outfile.empty()) {
128 fout=stdout;
129 }
130 else { //output file name explicitely given
131 if (getFext(std_outfile)=="bam") {
132 if (sam_header.empty()) err_die("Error: sam header file not provided.\n");
133 wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile);
134 //wbam = new GBamWriter(std_outfile, index_outfile);
135 }
136 else {
137 fout = fopen(std_outfile.c_str(), "w");
138 if (fout==NULL)
139 err_die("Error: cannot create file %s\n", std_outfile.c_str());
140 }
141 }
142 /*
143 if (wbam==NULL && !index_outfile.empty()) {
144 findex = fopen(index_outfile.c_str(), "w");
145 if (findex == NULL)
146 err_die("Error: cannot create file %s\n", index_outfile.c_str());
147 }
148 */
149 for (size_t fi = 0; fi < reads_files.size(); ++fi) {
150 //only one file expected here, this is not the initial prep_reads
151 Read read;
152 ReadStream readstream(reads_files[fi], NULL, true);
153 //skip_lines(fr);
154 while (readstream.get_direct(read)) {
155 uint32_t rnum=(uint32_t)atol(read.name.c_str());
156 if (check_readmap(readmap, rnum)) {
157 if (wbam) {
158 GBamRecord bamrec(read.name.c_str(), -1, 0, false,
159 read.seq.c_str(), NULL, read.qual.c_str());
160 wbam->write(bamrec.get_b(), rnum);
161 }
162 else {
163 fprintf(fout, "@%s\n%s\n+%s\n%s\n",
164 read.name.c_str(),
165 read.seq.c_str(),
166 read.alt_name.c_str(),
167 read.qual.c_str());
168 }
169 }
170 }
171 }
172 if (wbam) delete wbam;
173 if (fout && fout!=stdout) fclose(fout);
174 }
175
176
177 void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0, int matenum=0) {
178 if (wbam==NULL) return;
179 string rnum;
180 str_appendUInt(rnum,rid);
181 string rname(read.name);
182 size_t pl=rname.length()-1;
183 GBamRecord bamrec(rnum.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str());
184 if (matenum) {
185 if (matenum==1) bamrec.set_flag(BAM_FREAD1);
186 else if (matenum==2) bamrec.set_flag(BAM_FREAD2);
187 }
188 /*
189 int matenum=0;
190 if (rname[pl-1]=='/') {
191 if (rname[pl]=='1') {
192 bamrec.set_flag(BAM_FREAD1);
193 matenum=1;
194 }
195 else if (rname[pl]=='2') {
196 bamrec.set_flag(BAM_FREAD2);
197 matenum=2;
198 }
199 */
200 if (matenum && rname[pl-1]=='/')
201 rname.resize(pl-1);
202 // }
203
204 bamrec.add_aux("ZN", 'Z', rname.length(), (uint8_t*)rname.c_str());
205
206 if (trashcode) {
207 bamrec.set_flag(BAM_FQCFAIL);
208 bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&trashcode);
209 }
210 wbam->write(bamrec.get_b(), rid);
211 }
212
213 bool processRead(int matenum, Read& read, uint32_t next_id, int& num_reads_chucked,
214 int& multimap_chucked, GBamWriter* wbam, FILE* fout, FILE* fqindex,
215 int& min_read_len, int& max_read_len, uint64_t& fout_offset, vector<bool>& rmap) {
216 if (read.seq.length()<12) {
217 ++num_reads_chucked;
218 writePrepBam(wbam, read, next_id, 'S', matenum);
219 return false;
220 }
221 if ((int)read.seq.length()<min_read_len)
222 min_read_len=read.seq.length();
223 if ((int)read.seq.length()>max_read_len)
224 max_read_len=read.seq.length();
225
226 // daehwan - check this later, it's due to bowtie
227 if (color && read.seq[1] == '4') {
228 ++num_reads_chucked;
229 writePrepBam(wbam, read, next_id, 'c', matenum);
230 return false;
231 }
232
233 if (readmap_loaded && check_readmap(rmap, next_id)) {
234 ++num_reads_chucked;
235 ++multimap_chucked;
236 writePrepBam(wbam, read, next_id, 'M', matenum);
237 return false;
238 }
239 format_qual_string(read.qual);
240 std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
241 char counts[256];
242 memset(counts, 0, sizeof(counts));
243 // Count up the bad characters
244 for (unsigned int i = 0; i != read.seq.length(); ++i)
245 {
246 char c = (char)toupper(read.seq[i]);
247 counts[(size_t)c]++;
248 }
249
250 double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
251 double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
252 double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
253 double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
254 double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
255 double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
256
257 // Chuck the read if there are at least 5 'N's or if it's mostly
258 // (>90%) 'N's and 'A's
259 char trash_code=0;
260 if (percent_A > 0.9 ||
261 percent_C > 0.9 ||
262 percent_G > 0.9 ||
263 percent_T > 0.9)
264 trash_code='L';
265 else if (percent_N >= 0.1 || percent_4 >=0.1)
266 trash_code='N';
267 if (trash_code) {
268 ++num_reads_chucked;
269 writePrepBam(wbam, read, next_id, trash_code, matenum);
270 return false;
271 }
272
273 if (wbam) {
274 if (reads_format == FASTA)
275 {
276 if (color)
277 read.qual = string(read.seq.length()-1, 'I').c_str();
278 else
279 read.qual = string(read.seq.length(), 'I').c_str();
280 }
281 writePrepBam(wbam, read, next_id, 0, matenum);
282 }
283 else {
284 // daehwan - we should not use buf in printf function
285 // because it may contain some control characters such as "\" from quality values.
286 // Here, buf is only used for calculating the file offset
287 char buf[2048] = {0};
288 if (reads_format == FASTQ or (reads_format == FASTA && quals))
289 {
290 sprintf(buf, "@%u\n%s\n+%s\n%s\n",
291 next_id,
292 read.seq.c_str(),
293 read.name.c_str(),
294 read.qual.c_str());
295
296 fprintf(fout, "@%u\n%s\n+%s\n%s\n",
297 next_id,
298 read.seq.c_str(),
299 read.name.c_str(),
300 read.qual.c_str());
301 }
302 else if (reads_format == FASTA)
303 {
304 string qual;
305 if (color)
306 qual = string(read.seq.length()-1, 'I').c_str();
307 else
308 qual = string(read.seq.length(), 'I').c_str();
309
310 sprintf(buf, "@%u\n%s\n+%s\n%s\n",
311 next_id,
312 read.seq.c_str(),
313 read.name.c_str(),
314 qual.c_str());
315
316 fprintf(fout, "@%u\n%s\n+%s\n%s\n",
317 next_id,
318 read.seq.c_str(),
319 read.name.c_str(),
320 qual.c_str());
321 }
322 else
323 {
324 assert(0);
325 }
326
327 if (fqindex != NULL)
328 {
329 if ((next_id - num_reads_chucked) % INDEX_REC_COUNT == 0)
330 fprintf(fqindex, "%d\t%lu\n", next_id, fout_offset);
331 }
332
333 fout_offset += strlen(buf);
334 }
335 return true;
336 } //validate read
337
338
339 const char* ERR_FILE_CREATE="Error: cannot create file %s\n";
340
341 void process_reads(vector<string>& reads_fnames, vector<FZPipe>& quals_files,
342 vector<string>& mate_fnames, vector<FZPipe>& mate_quals_files)
343 {
344 //TODO: add the option to write the garbage reads into separate file(s)
345 int num_reads_chucked = 0;
346 int multimap_chucked = 0;
347 int mate_num_reads_chucked = 0;
348 int mate_multimap_chucked = 0;
349 int min_read_len = 20000000;
350 int max_read_len = 0;
351 int mate_min_read_len = 20000000;
352 int mate_max_read_len = 0;
353 uint32_t next_id = 0;
354 FILE* fw=NULL; //aux output file
355 string outfname; //std_outfile after instancing template
356 string mate_outfname;
357 string idxfname; //index_outfile after instancing template
358 string mate_idxfname;
359 bool have_mates = (mate_fnames.size() > 0);
360
361 if (!aux_outfile.empty()) {
362 fw=fopen(aux_outfile.c_str(), "w");
363 if (fw==NULL)
364 err_die(ERR_FILE_CREATE,aux_outfile.c_str());
365 }
366
367 FILE* fqindex = NULL; //fastq index
368 FILE* mate_fqindex = NULL;
369 GBamWriter* wbam=NULL;
370 GBamWriter* mate_wbam=NULL;
371 FILE* fout=NULL;
372 FILE* mate_fout=NULL;
373 uint64_t fout_offset = 0;
374 uint64_t mate_fout_offset = 0;
375 if (std_outfile.empty()) {
376 fout=stdout;
377 //for PE reads, flt_side will decide which side is printed (can't be both)
378 if (have_mates && flt_side==2)
379 err_die("Error: --flt-side option required for PE reads directed to stdout!\n");
380 }
381 else { //output file name explicitely given
382 //could be a template
383 if (std_outfile.find("%side%") != string::npos) {
384 outfname=str_replace(std_outfile, "%side%", "left");
385 if (have_mates)
386 mate_outfname=str_replace(std_outfile, "%side%", "right");
387 }
388 else {
389 outfname=std_outfile;
390 }
391 if (index_outfile.find("%side%") != string::npos) {
392 idxfname=str_replace(index_outfile, "%side%", "left");
393 if (have_mates)
394 mate_idxfname=str_replace(index_outfile, "%side%", "right");
395 }
396 else {
397 idxfname=index_outfile;
398 }
399 if (getFext(outfname)=="bam") {
400 if (sam_header.empty()) err_die("Error: sam header file not provided.\n");
401 wbam = new GBamWriter(outfname.c_str(), sam_header.c_str(), idxfname);
402 if (!mate_outfname.empty()) {
403 mate_wbam = new GBamWriter(mate_outfname.c_str(), sam_header.c_str(), mate_idxfname);
404 }
405 }
406 else { //fastq output
407 fout = fopen(outfname.c_str(), "w");
408 if (fout==NULL)
409 err_die(ERR_FILE_CREATE, outfname.c_str());
410 mate_fout = fopen(mate_outfname.c_str(), "w");
411 if (mate_fout==NULL)
412 err_die(ERR_FILE_CREATE, mate_outfname.c_str());
413 }
414 }
415 if (wbam==NULL && !idxfname.empty()) {
416 //fastq file output, indexed
417 fqindex = fopen(idxfname.c_str(), "w");
418 if (fqindex == NULL)
419 err_die(ERR_FILE_CREATE, idxfname.c_str());
420 if (!mate_idxfname.empty()) {
421 mate_fqindex = fopen(mate_idxfname.c_str(), "w");
422 if (mate_fqindex == NULL)
423 err_die(ERR_FILE_CREATE, mate_idxfname.c_str());
424 }
425
426 }
427 for (size_t fi = 0; fi < reads_fnames.size(); ++fi)
428 {
429 Read read;
430 Read mate_read;
431 FZPipe* fq=NULL;
432 FZPipe* mate_fq=NULL;
433 if (quals)
434 fq = & quals_files[fi];
435 ReadStream reads(reads_fnames[fi], fq, true);
436 ReadStream* mate_reads=NULL;
437 if (have_mates) {
438 if (quals)
439 mate_fq = & mate_quals_files[fi];
440 mate_reads=new ReadStream(mate_fnames[fi], mate_fq, true);
441 }
442 while (reads.get_direct(read, reads_format)) {
443 // Get the next read from the file
444 //if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
445 // break;
446 ++next_id;
447 //IMPORTANT: to keep paired reads in sync, this must be
448 //incremented BEFORE any reads are chucked !
449 int matenum=0;
450 if (have_mates) {
451 if (!mate_reads->get_direct(mate_read, reads_format))
452 err_die("Error: could not retrieve mate for read '%s' !\n",read.name.c_str());
453 //check if reads are paired correctly
454 matenum=1;
455 size_t nl=read.name.length()-1;
456 size_t mate_nl=mate_read.name.length()-1;
457 bool mate_match=(nl==mate_nl);
458 if (mate_match && read.name[nl-1]=='/' ) --nl;
459 if (mate_match && strncmp(read.name.data(), mate_read.name.data(), nl+1)!=0)
460 mate_match=false;
461 if (!mate_match) {
462 err_die("Error: read #%d (%s) does not have a matching mate in the same order (%s found instead)\n",
463 next_id, read.name.c_str(), mate_read.name.c_str());
464 }
465 }
466 processRead(matenum, read, next_id, num_reads_chucked, multimap_chucked,
467 wbam, fout, fqindex, min_read_len, max_read_len, fout_offset, readmap);
468 if (mate_reads) {
469 matenum=2;
470 processRead(matenum, mate_read, next_id, mate_num_reads_chucked, mate_multimap_chucked,
471 mate_wbam, mate_fout, mate_fqindex, mate_min_read_len, mate_max_read_len,
472 mate_fout_offset, mate_readmap);
473 }
474
475 } //while !fr.isEof()
476 } //for each input file
477 fprintf(stderr, "%u out of %u reads have been filtered out\n",
478 num_reads_chucked, next_id);
479 if (readmap_loaded)
480 fprintf(stderr, "\t(%u filtered out due to %s)\n",
481 multimap_chucked, flt_reads_fnames[0].c_str());
482 if (have_mates)
483 fprintf(stderr, "%u out of %u read mates have been filtered out\n",
484 mate_num_reads_chucked, next_id);
485 if (readmap_loaded && have_mates && mate_multimap_chucked)
486 fprintf(stderr, "\t(%u mates filtered out due to %s)\n",
487 mate_multimap_chucked, flt_reads_fnames[1].c_str());
488
489 if (wbam) { delete wbam; }
490 if (mate_wbam) { delete mate_wbam; }
491 if (fout && fout!=stdout) fclose(fout);
492 if (mate_fout) fclose(mate_fout);
493 if (fw!=NULL) {
494 string side("");
495 if (have_mates)
496 side="left_";
497 fprintf(fw, "%smin_read_len=%d\n", side.c_str(), min_read_len - (color ? 1 : 0));
498 fprintf(fw, "%smax_read_len=%d\n", side.c_str(), max_read_len - (color ? 1 : 0));
499 fprintf(fw, "%sreads_in =%d\n", side.c_str(), next_id);
500 fprintf(fw, "%sreads_out=%d\n", side.c_str(), next_id-num_reads_chucked);
501 if (have_mates) {
502 side="right_";
503 fprintf(fw, "%smin_read_len=%d\n", side.c_str(), mate_min_read_len - (color ? 1 : 0));
504 fprintf(fw, "%smax_read_len=%d\n", side.c_str(), mate_max_read_len - (color ? 1 : 0));
505 fprintf(fw, "%sreads_in =%d\n", side.c_str(), next_id);
506 fprintf(fw, "%sreads_out=%d\n", side.c_str(), next_id-mate_num_reads_chucked);
507 }
508 fclose(fw);
509 }
510
511 if (fqindex) fclose(fqindex);
512 if (mate_fqindex) fclose(mate_fqindex);
513 }
514
515 void print_usage()
516 {
517 fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq> [<read_qual_files>,..] \\"
518 "[<mate_reads1.fa/fq,...,mate_readsN.fa/fq> [<mate_qual_files>,..]\n");
519 }
520
521 void open_qual_files(vector<FZPipe>& quals_files, string& quals_file_list) {
522 vector<string> quals_file_names;
523 tokenize(quals_file_list, ",", quals_file_names);
524 for (size_t i = 0; i < quals_file_names.size(); ++i)
525 {
526 FZPipe seg_file(quals_file_names[i], true);
527 if (seg_file.file == NULL)
528 {
529 fprintf(stderr, "Error: cannot open qual. file %s\n",
530 quals_file_names[i].c_str());
531 exit(1);
532 }
533 quals_files.push_back(seg_file);
534 }
535 }
536
537 int main(int argc, char *argv[])
538 {
539 fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
540 fprintf(stderr, "---------------------------\n");
541
542 int parse_ret = parse_options(argc, argv, print_usage);
543 if (parse_ret)
544 return parse_ret;
545 if(optind >= argc)
546 {
547 print_usage();
548 return 1;
549 }
550
551 string reads_file_list(argv[optind++]);
552 vector<string> reads_filenames;
553 tokenize(reads_file_list, ",",reads_filenames);
554 vector<FZPipe> quals_files;
555 if (quals)
556 {
557 if (optind>=argc) {
558 err_die("Error: quality value file(s) not provided !\n");
559 }
560 string quals_file_list = argv[optind++];
561 open_qual_files(quals_files, quals_file_list);
562 if (quals_files.size()!=reads_filenames.size())
563 err_die("Error: number of quality value files must much the number of read files!\n");
564 }
565 string mate_file_list;
566 vector<string> mate_filenames;
567 vector<FZPipe> mate_quals_files;
568 if (optind<argc)
569 mate_file_list = argv[optind++];
570
571 if (!mate_file_list.empty()) {
572 tokenize(mate_file_list, ",",mate_filenames);
573 if (reads_filenames.size()!=mate_filenames.size()) {
574 err_die("Error: same number of input files required for paired reads!\n");
575 }
576 if (quals)
577 {
578 if (optind>=argc) {
579 err_die("Error: mate quality value file(s) not provided !\n");
580 }
581 string mate_quals_file_list = argv[optind++];
582 open_qual_files(mate_quals_files, mate_quals_file_list);
583 if (mate_quals_files.size()!=mate_filenames.size())
584 err_die("Error: number of quality value files must much the number of read files!\n");
585 }
586 }
587 if (!flt_reads.empty()) {
588 //for multi-mapped prefiltering usage
589 readmap_loaded = false;
590 tokenize(flt_reads, ",", flt_reads_fnames);
591 load_readmap(flt_reads_fnames[0], readmap);
592 if (flt_reads_fnames.size()==2)
593 load_readmap(flt_reads_fnames[1], mate_readmap);
594 }
595 if (flt_mappings.empty())
596 process_reads(reads_filenames, quals_files, mate_filenames, mate_quals_files);
597 else //special use case: filter previous mappings (when prefiltering)
598 flt_reads_and_hits(reads_filenames);
599
600 return 0;
601 }