ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
Revision: 220
Committed: Thu Mar 22 19:33:00 2012 UTC (12 years, 6 months ago) by gpertea
File size: 12605 byte(s)
Log Message:
working, writing rejected.bam

Line User Rev File contents
1 gpertea 29 /*
2     * polyA_reads.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 9/2/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     * Derived from maq "catfilter", by Heng Li at Sanger
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <stdio.h>
15     #include <cassert>
16     #include <vector>
17     #include <cstring>
18     #include <cstdlib>
19    
20     #include "common.h"
21     #include "reads.h"
22     #include "tokenize.h"
23     #include "qual.h"
24    
25     //bool fastq_db = true;
26    
27     using namespace std;
28    
29     void format_qual_string(string& qual_str)
30     {
31     for (size_t i = 0; i < qual_str.size(); ++i)
32     {
33     qual_str[i] = charToPhred33(qual_str[i],
34     solexa_quals,
35     phred64_quals);
36     }
37     }
38    
39 gpertea 135 bool readmap_loaded=false;
40     vector<bool> readmap; //for filter_multihits
41 gpertea 29
42 gpertea 135 void load_readmap() {
43     readmap_loaded=false;
44     if (flt_reads.empty()) return;
45     //FZPipe(filter_multihits, false);
46 gpertea 218 ReadStream rdstream(flt_reads, NULL, true);
47     /*
48 gpertea 135 FILE* rfile=fopen(flt_reads.c_str(),"r");
49     if (rfile==NULL)
50     err_die("Error: cannot read file %s\n", flt_reads.c_str());
51     FLineReader fr(rfile);
52     skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this
53 gpertea 218 */
54 gpertea 135 Read read;
55 gpertea 218 //while (next_fastx_read(fr, read, reads_format)) {
56     while (rdstream.get_direct(read, reads_format)) {
57 gpertea 135 uint32_t rnum=(uint32_t)atol(read.name.c_str());
58     if (rnum>=(uint32_t) readmap.size())
59     readmap.resize(rnum+1, false);
60     readmap[rnum] = true;
61     }
62     readmap_loaded=true;
63 gpertea 218 //fclose(rfile);
64 gpertea 135 }
65    
66     bool check_readmap(uint32_t read_id) {
67     if (read_id>=readmap.size()) return false;
68     else return readmap[read_id];
69     }
70 gpertea 217
71     void flt_reads_and_hits(vector<string>& reads_files) {
72 gpertea 135 if (!readmap_loaded)
73     err_die("Error: filtering reads not enabled, aborting.");
74     if (aux_outfile.empty())
75     err_die("Error: auxiliary output file not provided.");
76 gpertea 217 //if (index_outfile.empty())
77     // err_die("Error: index output file not provided.");
78 gpertea 154
79 gpertea 135 // -- filter mappings:
80     string fext=getFext(flt_mappings);
81     if (fext=="bam") {
82 gpertea 218 samfile_t* fbam=samopen(flt_mappings.c_str(), "rb", 0);
83     if (fbam==NULL)
84     err_die("Error opening BAM file %s!\n", flt_mappings.c_str());
85     bam1_t *b = bam_init1();
86     string aux_index(aux_outfile);
87     aux_index+=".index";
88     GBamWriter wbam(aux_outfile.c_str(), fbam->header, aux_index);
89     while (samread(fbam, b) > 0) {
90     char* rname = bam1_qname(b);
91     uint32_t rid=(uint32_t)atol(rname);
92     if (check_readmap(rid)) {
93     //write this mapping into the output file
94     wbam.write(b, rid);
95     }
96     }
97     bam_destroy1(b);
98     samclose(fbam);
99 gpertea 135 }
100 gpertea 218 else {
101     bool is_sam=false;
102     string s(flt_mappings);
103     for(size_t i=0; i!=s.length(); ++i)
104     s[i] = std::tolower(s[i]);
105     if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
106     string unzcmd=getUnpackCmd(flt_mappings);
107     FZPipe hitsfile(flt_mappings, unzcmd);
108     FLineReader fh(hitsfile);
109     FZPipe outfile;
110     outfile.openWrite(aux_outfile.c_str(), zpacker);
111     if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
112     const char* line;
113     while ((line=fh.nextLine())) {
114     if (is_sam && line[0]=='@') {
115     //copy the header
116     fprintf(outfile.file, "%s\n", line);
117     continue;
118     }
119     char* tab=strchr((char*)line, '\t');
120     if (tab==NULL)
121     err_die("Error: cannot find tab character in %s mappings line:\n%s",
122     flt_mappings.c_str(),line);
123     *tab=0;
124     uint32_t rid = (uint32_t) atol(line);
125     if (rid==0)
126     err_die("Error: invalid read ID (%s) parsed from mapping file %s",
127     line, flt_mappings.c_str());
128     *tab='\t';
129     if (check_readmap(rid)) {
130     fprintf(outfile.file, "%s\n", line);
131     }
132     }
133     outfile.close();
134     hitsfile.close();
135     }
136 gpertea 135 // -- now filter reads
137     for (size_t fi = 0; fi < reads_files.size(); ++fi) {
138     //only one file expected here, this is not the initial prep_reads
139     Read read;
140 gpertea 217 ReadStream readstream(reads_files[fi], NULL, true);
141 gpertea 135 //skip_lines(fr);
142 gpertea 217 while (readstream.get_direct(read)) {
143 gpertea 135 uint32_t rnum=(uint32_t)atol(read.name.c_str());
144     if (check_readmap(rnum))
145     printf("@%s\n%s\n+%s\n%s\n",
146     read.name.c_str(),
147     read.seq.c_str(),
148     read.alt_name.c_str(),
149     read.qual.c_str());
150     }
151     }
152     }
153    
154    
155 gpertea 218 void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0) {
156     if (wbam==NULL) return;
157 gpertea 220 string rnum;
158     str_appendInt(rnum,rid);
159     string rname(read.name);
160     size_t pl=rname.length()-1;
161     GBamRecord bamrec(rnum.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str());
162     int matenum=0;
163     if (rname[pl-1]=='/') {
164     if (rname[pl]=='1') {
165     bamrec.set_flag(BAM_FREAD1);
166     matenum=1;
167     }
168     else if (rname[pl]=='2') {
169     bamrec.set_flag(BAM_FREAD2);
170     matenum=2;
171     }
172     if (matenum) rname.resize(pl-1);
173     }
174    
175     bamrec.add_aux("ZN", 'Z',read.name.length(),(uint8_t*)rname.c_str());
176 gpertea 218 if (trashcode) {
177 gpertea 219 bamrec.set_flag(BAM_FQCFAIL);
178 gpertea 218 bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&trashcode);
179     }
180     wbam->write(bamrec.get_b(), rid);
181     }
182    
183 gpertea 217 void process_reads(vector<string>& reads_files, vector<FZPipe>& quals_files)
184 gpertea 29 {
185 gpertea 135 //TODO: add the option to write the garbage reads into separate file(s)
186     int num_reads_chucked = 0;
187 gpertea 154 int multimap_chucked = 0;
188 gpertea 135 int min_read_len = 20000000;
189 gpertea 29 int max_read_len = 0;
190 gpertea 135 uint32_t next_id = 0;
191 gpertea 154 uint64_t file_offset = 0;
192 gpertea 29 FILE* fw=NULL;
193     if (!aux_outfile.empty()) {
194     fw=fopen(aux_outfile.c_str(), "w");
195     if (fw==NULL)
196     err_die("Error: cannot create file %s\n",aux_outfile.c_str());
197     }
198    
199 gpertea 154 FILE* findex = NULL;
200 gpertea 217 GBamWriter* wbam=NULL;
201     FILE* fout=NULL;
202     if (std_outfile.empty()) {
203     fout=stdout;
204     }
205     else { //output file name explicitely given
206     if (getFext(std_outfile)=="bam") {
207     if (sam_header.empty()) err_die("Error: sam header file not provided.\n");
208     wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile);
209 gpertea 218 //wbam = new GBamWriter(std_outfile, index_outfile);
210 gpertea 217 }
211     else {
212     fout = fopen(std_outfile.c_str(), "w");
213     if (fout==NULL)
214     err_die("Error: cannot create file %s\n", std_outfile.c_str());
215     }
216     }
217     if (wbam==NULL && !index_outfile.empty()) {
218 gpertea 154 findex = fopen(index_outfile.c_str(), "w");
219     if (findex == NULL)
220     err_die("Error: cannot create file %s\n", index_outfile.c_str());
221     }
222    
223 gpertea 29 for (size_t fi = 0; fi < reads_files.size(); ++fi)
224     {
225     Read read;
226 gpertea 207 FZPipe* fq=NULL;
227 gpertea 29 if (quals)
228 gpertea 207 fq = & quals_files[fi];
229 gpertea 217 ReadStream reads(reads_files[fi], fq, true);
230 gpertea 29
231 gpertea 207 while (reads.get_direct(read, reads_format)) {
232     // read.clear();
233 gpertea 29 // Get the next read from the file
234 gpertea 207 //if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
235     // break;
236 gpertea 135
237     ++next_id;
238     //IMPORTANT: to keep paired reads in sync, this must be
239     //incremented BEFORE any reads are chucked !
240 gpertea 29 if (read.seq.length()<12) {
241     ++num_reads_chucked;
242 gpertea 218 writePrepBam(wbam, read, next_id, 'S');
243 gpertea 29 continue;
244     }
245 gpertea 135 if ((int)read.seq.length()<min_read_len)
246     min_read_len=read.seq.length();
247     if ((int)read.seq.length()>max_read_len)
248     max_read_len=read.seq.length();
249    
250     // daehwan - check this later, it's due to bowtie
251     if (color && read.seq[1] == '4') {
252     ++num_reads_chucked;
253 gpertea 218 writePrepBam(wbam, read, next_id, 'c');
254 gpertea 135 continue;
255     }
256    
257     if (readmap_loaded && check_readmap(next_id)) {
258     ++num_reads_chucked;
259     ++multimap_chucked;
260 gpertea 218 writePrepBam(wbam, read, next_id, 'M');
261 gpertea 135 continue;
262     }
263 gpertea 207 format_qual_string(read.qual);
264 gpertea 29 std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
265     char counts[256];
266     memset(counts, 0, sizeof(counts));
267     // Count up the bad characters
268     for (unsigned int i = 0; i != read.seq.length(); ++i)
269     {
270     char c = (char)toupper(read.seq[i]);
271     counts[(size_t)c]++;
272     }
273    
274     double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
275     double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
276     double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
277     double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
278     double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
279     double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
280    
281     // Chuck the read if there are at least 5 'N's or if it's mostly
282     // (>90%) 'N's and 'A's
283 gpertea 218 char trash_code=0;
284 gpertea 220 if (percent_A > 0.8 ||
285     percent_C > 0.8 ||
286     percent_G > 0.8 ||
287     percent_T > 0.8)
288 gpertea 218 trash_code='L';
289     else if (percent_N >= 0.1 || percent_4 >=0.1)
290     trash_code='N';
291     if (trash_code) {
292     ++num_reads_chucked;
293     writePrepBam(wbam, read, next_id, trash_code);
294     continue;
295     }
296 gpertea 135
297 gpertea 218 if (wbam) {
298     writePrepBam(wbam, read, next_id);
299     }
300     else {
301     // daehwan - we should not use buf in printf function
302     // because it may contain some control characters such as "\" from quality values.
303     // Here, buf is only used for calculating the file offset
304     char buf[2048] = {0};
305     if (reads_format == FASTQ or (reads_format == FASTA && quals))
306     {
307     sprintf(buf, "@%u\n%s\n+%s\n%s\n",
308     next_id,
309     read.seq.c_str(),
310     read.name.c_str(),
311     read.qual.c_str());
312 gpertea 29
313 gpertea 218 fprintf(fout, "@%u\n%s\n+%s\n%s\n",
314     next_id,
315     read.seq.c_str(),
316     read.name.c_str(),
317     read.qual.c_str());
318     }
319     else if (reads_format == FASTA)
320     {
321     string qual;
322     if (color)
323     qual = string(read.seq.length()-1, 'I').c_str();
324     else
325     qual = string(read.seq.length(), 'I').c_str();
326 gpertea 154
327 gpertea 218 sprintf(buf, "@%u\n%s\n+%s\n%s\n",
328     next_id,
329     read.seq.c_str(),
330     read.name.c_str(),
331     qual.c_str());
332 gpertea 154
333 gpertea 218 fprintf(fout, "@%u\n%s\n+%s\n%s\n",
334     next_id,
335     read.seq.c_str(),
336     read.name.c_str(),
337     qual.c_str());
338     }
339     else
340     {
341     assert(0);
342     }
343 gpertea 154
344 gpertea 218 if (findex != NULL)
345     {
346 gpertea 219 if ((next_id - num_reads_chucked) % INDEX_REC_COUNT == 0)
347 gpertea 218 fprintf(findex, "%d\t%lu\n", next_id, file_offset);
348     }
349    
350     file_offset += strlen(buf);
351 gpertea 29 }
352     } //while !fr.isEof()
353 gpertea 207 //fr.close();
354     //frq.close();
355 gpertea 29 } //for each input file
356 gpertea 135 fprintf(stderr, "%u out of %u reads have been filtered out\n",
357     num_reads_chucked, next_id);
358     if (readmap_loaded)
359     fprintf(stderr, "\t(%u filtered out due to %s)\n",
360     multimap_chucked, flt_reads.c_str());
361 gpertea 217 if (wbam) { delete wbam; }
362 gpertea 218 if (fout && fout!=stdout) fclose(fout);
363 gpertea 29 if (fw!=NULL) {
364     fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
365     fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
366 gpertea 135 fprintf(fw, "reads_in =%d\n",next_id);
367     fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
368 gpertea 29 fclose(fw);
369     }
370 gpertea 154
371     if (findex != NULL)
372     fclose(findex);
373 gpertea 29 }
374    
375     void print_usage()
376     {
377 gpertea 135 fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n");
378     //alternate usage (--ctv_to_num) : doesn't filter out any reads,
379     //but simply convert read names to numeric ids
380 gpertea 29 }
381    
382    
383     int main(int argc, char *argv[])
384     {
385     fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
386     fprintf(stderr, "---------------------------\n");
387    
388     int parse_ret = parse_options(argc, argv, print_usage);
389     if (parse_ret)
390     return parse_ret;
391    
392     if(optind >= argc)
393     {
394     print_usage();
395     return 1;
396     }
397    
398     string reads_file_list = argv[optind++];
399    
400 gpertea 217 vector<string> reads_filenames;
401     tokenize(reads_file_list, ",",reads_filenames);
402 gpertea 29 vector<string> quals_file_names;
403     vector<FZPipe> quals_files;
404     if (quals)
405     {
406     string quals_file_list = argv[optind++];
407     tokenize(quals_file_list, ",", quals_file_names);
408     for (size_t i = 0; i < quals_file_names.size(); ++i)
409     {
410 gpertea 211 FZPipe seg_file(quals_file_names[i], true);
411 gpertea 29 if (seg_file.file == NULL)
412     {
413     fprintf(stderr, "Error: cannot open reads file %s for reading\n",
414     quals_file_names[i].c_str());
415     exit(1);
416     }
417     quals_files.push_back(seg_file);
418     }
419     }
420 gpertea 135 load_readmap();
421 gpertea 29 // Only print to standard out the good reads
422     //TODO: a better, more generic read filtering protocol
423 gpertea 135 if (flt_mappings.empty())
424 gpertea 217 process_reads(reads_filenames, quals_files);
425 gpertea 135 else //special use case: filter previous bowtie results
426 gpertea 217 flt_reads_and_hits(reads_filenames);
427 gpertea 29
428     return 0;
429     }