ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
Revision: 206
Committed: Mon Mar 12 18:02:35 2012 UTC (12 years, 5 months ago) by gpertea
File size: 10965 byte(s)
Log Message:
updated

Line User Rev File contents
1 gpertea 29 /*
2     * polyA_reads.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 9/2/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     * Derived from maq "catfilter", by Heng Li at Sanger
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <stdio.h>
15     #include <cassert>
16     #include <vector>
17     #include <cstring>
18     #include <cstdlib>
19    
20     #include "common.h"
21     #include "reads.h"
22     #include "tokenize.h"
23     #include "qual.h"
24    
25     //bool fastq_db = true;
26    
27     using namespace std;
28    
29     void format_qual_string(string& qual_str)
30     {
31     for (size_t i = 0; i < qual_str.size(); ++i)
32     {
33     qual_str[i] = charToPhred33(qual_str[i],
34     solexa_quals,
35     phred64_quals);
36     }
37     }
38    
39 gpertea 135 bool readmap_loaded=false;
40     vector<bool> readmap; //for filter_multihits
41 gpertea 29
42 gpertea 135 void load_readmap() {
43     readmap_loaded=false;
44     if (flt_reads.empty()) return;
45     //FZPipe(filter_multihits, false);
46     FILE* rfile=fopen(flt_reads.c_str(),"r");
47     if (rfile==NULL)
48     err_die("Error: cannot read file %s\n", flt_reads.c_str());
49     FLineReader fr(rfile);
50     skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this
51     Read read;
52     while (next_fastx_read(fr, read, reads_format)) {
53     uint32_t rnum=(uint32_t)atol(read.name.c_str());
54     if (rnum>=(uint32_t) readmap.size())
55     readmap.resize(rnum+1, false);
56     readmap[rnum] = true;
57     }
58     readmap_loaded=true;
59     fclose(rfile);
60     }
61    
62     bool check_readmap(uint32_t read_id) {
63     if (read_id>=readmap.size()) return false;
64     else return readmap[read_id];
65     }
66     void flt_reads_and_hits(vector<FZPipe>& reads_files) {
67     if (!readmap_loaded)
68     err_die("Error: filtering reads not enabled, aborting.");
69     if (aux_outfile.empty())
70     err_die("Error: auxiliary output file not provided.");
71 gpertea 154 if (index_outfile.empty())
72     err_die("Error: index output file not provided.");
73    
74 gpertea 135 // -- filter mappings:
75     string fext=getFext(flt_mappings);
76     if (fext=="bam") {
77     //TODO
78     //WRITEME: use samtools C API to write a quick bam filter
79     err_die("Error: cannot handle BAM files yet![WRITEME]");
80     return;
81     }
82     bool is_sam=false;
83     string s(flt_mappings);
84     for(size_t i=0; i!=s.length(); ++i)
85     s[i] = std::tolower(s[i]);
86     if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
87     string unzcmd=getUnpackCmd(flt_mappings);
88     FZPipe hitsfile(flt_mappings, unzcmd);
89     FLineReader fh(hitsfile);
90     FZPipe outfile;
91     outfile.openWrite(aux_outfile.c_str(), zpacker);
92     if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
93     const char* line;
94     while ((line=fh.nextLine())) {
95     if (is_sam && line[0]=='@') {
96     fprintf(outfile.file, "%s\n", line);
97     continue;
98     }
99     char* tab=strchr((char*)line, '\t');
100     if (tab==NULL)
101     err_die("Error: cannot find tab character in %s mappings line:\n%s",
102     flt_mappings.c_str(),line);
103     *tab=0;
104     uint32_t rid = (uint32_t) atol(line);
105     if (rid==0)
106     err_die("Error: invalid read ID (%s) parsed from mapping file %s",
107     line, flt_mappings.c_str());
108     *tab='\t';
109     if (check_readmap(rid)) {
110     fprintf(outfile.file, "%s\n", line);
111     }
112     }
113     outfile.close();
114     hitsfile.close();
115     // -- now filter reads
116     for (size_t fi = 0; fi < reads_files.size(); ++fi) {
117     //only one file expected here, this is not the initial prep_reads
118     Read read;
119     FLineReader fr(reads_files[fi]);
120     //skip_lines(fr);
121     while (next_fastx_read(fr, read)) {
122     uint32_t rnum=(uint32_t)atol(read.name.c_str());
123     if (check_readmap(rnum))
124     printf("@%s\n%s\n+%s\n%s\n",
125     read.name.c_str(),
126     read.seq.c_str(),
127     read.alt_name.c_str(),
128     read.qual.c_str());
129     }
130     }
131     }
132    
133    
134     void process_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files)
135 gpertea 29 {
136 gpertea 135 //TODO: add the option to write the garbage reads into separate file(s)
137     int num_reads_chucked = 0;
138 gpertea 154 int multimap_chucked = 0;
139 gpertea 135 int min_read_len = 20000000;
140 gpertea 29 int max_read_len = 0;
141 gpertea 135 uint32_t next_id = 0;
142 gpertea 154 uint64_t file_offset = 0;
143 gpertea 29 FILE* fw=NULL;
144     if (!aux_outfile.empty()) {
145     fw=fopen(aux_outfile.c_str(), "w");
146     if (fw==NULL)
147     err_die("Error: cannot create file %s\n",aux_outfile.c_str());
148     }
149    
150 gpertea 154 FILE* findex = NULL;
151     if (!index_outfile.empty()) {
152     findex = fopen(index_outfile.c_str(), "w");
153     if (findex == NULL)
154     err_die("Error: cannot create file %s\n", index_outfile.c_str());
155     }
156    
157 gpertea 29 for (size_t fi = 0; fi < reads_files.size(); ++fi)
158     {
159     Read read;
160     FLineReader fr(reads_files[fi]);
161     skip_lines(fr);
162     FZPipe fq;
163     if (quals)
164 gpertea 206 fq = quals_files[fi];
165 gpertea 154
166 gpertea 29 FLineReader frq(fq);
167     skip_lines(frq);
168    
169     while (!fr.isEof()) {
170     //read.clear();
171     // Get the next read from the file
172     if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
173     break;
174 gpertea 135
175     ++next_id;
176     //IMPORTANT: to keep paired reads in sync, this must be
177     //incremented BEFORE any reads are chucked !
178    
179 gpertea 29 if (read.seq.length()<12) {
180     ++num_reads_chucked;
181     continue;
182     }
183 gpertea 135 if ((int)read.seq.length()<min_read_len)
184     min_read_len=read.seq.length();
185     if ((int)read.seq.length()>max_read_len)
186     max_read_len=read.seq.length();
187    
188     // daehwan - check this later, it's due to bowtie
189     if (color && read.seq[1] == '4') {
190     ++num_reads_chucked;
191     continue;
192     }
193    
194     if (readmap_loaded && check_readmap(next_id)) {
195     ++num_reads_chucked;
196     ++multimap_chucked;
197     continue;
198     }
199 gpertea 154 format_qual_string(read.qual);
200 gpertea 29 std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
201     char counts[256];
202     memset(counts, 0, sizeof(counts));
203     // Count up the bad characters
204     for (unsigned int i = 0; i != read.seq.length(); ++i)
205     {
206     char c = (char)toupper(read.seq[i]);
207     counts[(size_t)c]++;
208     }
209    
210     double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
211     double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
212     double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
213     double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
214     double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
215     double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
216    
217     // Chuck the read if there are at least 5 'N's or if it's mostly
218     // (>90%) 'N's and 'A's
219    
220     if (percent_A > 0.9 ||
221     percent_C > 0.9 ||
222     percent_G > 0.9 ||
223     percent_T > 0.9 ||
224     percent_N >= 0.1 ||
225     percent_4 >= 0.1)
226     {
227     ++num_reads_chucked;
228     }
229     else
230     {
231 gpertea 154 // daehwan - we should not use buf in printf function
232     // because it may contain some control characters such as "\" from quality values.
233     // Here, buf is only used for calculating the file offset
234     char buf[2048] = {0};
235 gpertea 29 if (reads_format == FASTQ or (reads_format == FASTA && quals))
236     {
237 gpertea 154 sprintf(buf, "@%u\n%s\n+%s\n%s\n",
238     next_id,
239     read.seq.c_str(),
240     read.name.c_str(),
241     read.qual.c_str());
242 gpertea 135
243 gpertea 154 printf("@%u\n%s\n+%s\n%s\n",
244     next_id,
245     read.seq.c_str(),
246     read.name.c_str(),
247     read.qual.c_str());
248 gpertea 29 }
249     else if (reads_format == FASTA)
250     {
251     string qual;
252 gpertea 135 if (color)
253     qual = string(read.seq.length()-1, 'I').c_str();
254 gpertea 29 else
255 gpertea 135 qual = string(read.seq.length(), 'I').c_str();
256 gpertea 29
257 gpertea 154 sprintf(buf, "@%u\n%s\n+%s\n%s\n",
258     next_id,
259     read.seq.c_str(),
260     read.name.c_str(),
261     qual.c_str());
262    
263     printf("@%u\n%s\n+%s\n%s\n",
264     next_id,
265     read.seq.c_str(),
266     read.name.c_str(),
267     qual.c_str());
268 gpertea 29 }
269 gpertea 154 else
270     {
271     assert(0);
272     }
273    
274     if (findex != NULL)
275     {
276     if ((next_id - num_reads_chucked) % 1000 == 0)
277     fprintf(findex, "%d\t%lu\n", next_id, file_offset);
278     }
279    
280     file_offset += strlen(buf);
281 gpertea 29 }
282     } //while !fr.isEof()
283     fr.close();
284     frq.close();
285     } //for each input file
286 gpertea 135 fprintf(stderr, "%u out of %u reads have been filtered out\n",
287     num_reads_chucked, next_id);
288     if (readmap_loaded)
289     fprintf(stderr, "\t(%u filtered out due to %s)\n",
290     multimap_chucked, flt_reads.c_str());
291 gpertea 29 if (fw!=NULL) {
292     fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
293     fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
294 gpertea 135 fprintf(fw, "reads_in =%d\n",next_id);
295     fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
296 gpertea 29 fclose(fw);
297     }
298 gpertea 154
299     if (findex != NULL)
300     fclose(findex);
301 gpertea 29 }
302    
303     void print_usage()
304     {
305 gpertea 135 fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n");
306     //alternate usage (--ctv_to_num) : doesn't filter out any reads,
307     //but simply convert read names to numeric ids
308 gpertea 29 }
309    
310    
311     int main(int argc, char *argv[])
312     {
313     fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
314     fprintf(stderr, "---------------------------\n");
315    
316     int parse_ret = parse_options(argc, argv, print_usage);
317     if (parse_ret)
318     return parse_ret;
319    
320     if(optind >= argc)
321     {
322     print_usage();
323     return 1;
324     }
325    
326     string reads_file_list = argv[optind++];
327    
328     vector<string> reads_file_names;
329     vector<FZPipe> reads_files;
330     tokenize(reads_file_list, ",",reads_file_names);
331     for (size_t i = 0; i < reads_file_names.size(); ++i)
332     {
333     string fname=reads_file_names[i];
334     string pipecmd=guess_packer(fname, true);
335     if (!pipecmd.empty()) pipecmd.append(" -cd");
336     FZPipe seg_file(fname,pipecmd);
337     if (seg_file.file == NULL) {
338     fprintf(stderr, "Error: cannot open reads file %s for reading\n",
339     reads_file_names[i].c_str());
340     exit(1);
341     }
342     reads_files.push_back(seg_file);
343     }
344    
345     vector<string> quals_file_names;
346     vector<FZPipe> quals_files;
347     if (quals)
348     {
349     string quals_file_list = argv[optind++];
350     tokenize(quals_file_list, ",", quals_file_names);
351     for (size_t i = 0; i < quals_file_names.size(); ++i)
352     {
353     string fname(quals_file_names[i]);
354     string pipecmd=guess_packer(fname, true);
355     FZPipe seg_file(fname, pipecmd);
356     if (seg_file.file == NULL)
357     {
358     fprintf(stderr, "Error: cannot open reads file %s for reading\n",
359     quals_file_names[i].c_str());
360     exit(1);
361     }
362     quals_files.push_back(seg_file);
363     }
364     }
365 gpertea 135 load_readmap();
366 gpertea 29 // Only print to standard out the good reads
367     //TODO: a better, more generic read filtering protocol
368 gpertea 135 if (flt_mappings.empty())
369     process_reads(reads_files, quals_files);
370     else //special use case: filter previous bowtie results
371     flt_reads_and_hits(reads_files);
372 gpertea 29
373     return 0;
374     }