ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (12 years, 8 months ago) by gpertea
File size: 10015 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line User Rev File contents
1 gpertea 29 /*
2     * polyA_reads.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 9/2/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     * Derived from maq "catfilter", by Heng Li at Sanger
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <stdio.h>
15     #include <cassert>
16     #include <vector>
17     #include <cstring>
18     #include <cstdlib>
19    
20     #include "common.h"
21     #include "reads.h"
22     #include "tokenize.h"
23     #include "qual.h"
24    
25     //bool fastq_db = true;
26    
27     using namespace std;
28    
29     void format_qual_string(string& qual_str)
30     {
31     for (size_t i = 0; i < qual_str.size(); ++i)
32     {
33     qual_str[i] = charToPhred33(qual_str[i],
34     solexa_quals,
35     phred64_quals);
36     }
37     }
38    
39 gpertea 135 bool readmap_loaded=false;
40     vector<bool> readmap; //for filter_multihits
41 gpertea 29
42 gpertea 135
43     void load_readmap() {
44     readmap_loaded=false;
45     if (flt_reads.empty()) return;
46     //FZPipe(filter_multihits, false);
47     FILE* rfile=fopen(flt_reads.c_str(),"r");
48     if (rfile==NULL)
49     err_die("Error: cannot read file %s\n", flt_reads.c_str());
50     FLineReader fr(rfile);
51     skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this
52     Read read;
53     while (next_fastx_read(fr, read, reads_format)) {
54     uint32_t rnum=(uint32_t)atol(read.name.c_str());
55     if (rnum>=(uint32_t) readmap.size())
56     readmap.resize(rnum+1, false);
57     readmap[rnum] = true;
58     }
59     readmap_loaded=true;
60     fclose(rfile);
61     }
62    
63     bool check_readmap(uint32_t read_id) {
64     if (read_id>=readmap.size()) return false;
65     else return readmap[read_id];
66     }
67     void flt_reads_and_hits(vector<FZPipe>& reads_files) {
68     if (!readmap_loaded)
69     err_die("Error: filtering reads not enabled, aborting.");
70     if (aux_outfile.empty())
71     err_die("Error: auxiliary output file not provided.");
72     // -- filter mappings:
73     string fext=getFext(flt_mappings);
74     if (fext=="bam") {
75     //TODO
76     //WRITEME: use samtools C API to write a quick bam filter
77     err_die("Error: cannot handle BAM files yet![WRITEME]");
78     return;
79     }
80     bool is_sam=false;
81     string s(flt_mappings);
82     for(size_t i=0; i!=s.length(); ++i)
83     s[i] = std::tolower(s[i]);
84     if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
85     string unzcmd=getUnpackCmd(flt_mappings);
86     FZPipe hitsfile(flt_mappings, unzcmd);
87     FLineReader fh(hitsfile);
88     FZPipe outfile;
89     outfile.openWrite(aux_outfile.c_str(), zpacker);
90     if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
91     const char* line;
92     while ((line=fh.nextLine())) {
93     if (is_sam && line[0]=='@') {
94     fprintf(outfile.file, "%s\n", line);
95     continue;
96     }
97     char* tab=strchr((char*)line, '\t');
98     if (tab==NULL)
99     err_die("Error: cannot find tab character in %s mappings line:\n%s",
100     flt_mappings.c_str(),line);
101     *tab=0;
102     uint32_t rid = (uint32_t) atol(line);
103     if (rid==0)
104     err_die("Error: invalid read ID (%s) parsed from mapping file %s",
105     line, flt_mappings.c_str());
106     *tab='\t';
107     if (check_readmap(rid)) {
108     fprintf(outfile.file, "%s\n", line);
109     }
110     }
111     outfile.close();
112     hitsfile.close();
113     // -- now filter reads
114     for (size_t fi = 0; fi < reads_files.size(); ++fi) {
115     //only one file expected here, this is not the initial prep_reads
116     Read read;
117     FLineReader fr(reads_files[fi]);
118     //skip_lines(fr);
119     while (next_fastx_read(fr, read)) {
120     uint32_t rnum=(uint32_t)atol(read.name.c_str());
121     if (check_readmap(rnum))
122     printf("@%s\n%s\n+%s\n%s\n",
123     read.name.c_str(),
124     read.seq.c_str(),
125     read.alt_name.c_str(),
126     read.qual.c_str());
127     }
128     }
129     }
130    
131    
132     void process_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files)
133 gpertea 29 {
134 gpertea 135 //TODO: add the option to write the garbage reads into separate file(s)
135     int num_reads_chucked = 0;
136     int multimap_chucked=0;
137     int min_read_len = 20000000;
138 gpertea 29 int max_read_len = 0;
139 gpertea 135 uint32_t next_id = 0;
140 gpertea 29 FILE* fw=NULL;
141     if (!aux_outfile.empty()) {
142     fw=fopen(aux_outfile.c_str(), "w");
143     if (fw==NULL)
144     err_die("Error: cannot create file %s\n",aux_outfile.c_str());
145     }
146    
147     for (size_t fi = 0; fi < reads_files.size(); ++fi)
148     {
149     Read read;
150     FLineReader fr(reads_files[fi]);
151     skip_lines(fr);
152     FZPipe fq;
153     if (quals)
154 gpertea 135 fq = quals_files[fi];
155 gpertea 29 FLineReader frq(fq);
156     skip_lines(frq);
157    
158     while (!fr.isEof()) {
159     //read.clear();
160     // Get the next read from the file
161     if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
162     break;
163 gpertea 135
164     ++next_id;
165     //IMPORTANT: to keep paired reads in sync, this must be
166     //incremented BEFORE any reads are chucked !
167    
168 gpertea 29 if (read.seq.length()<12) {
169     ++num_reads_chucked;
170     continue;
171     }
172 gpertea 135 if ((int)read.seq.length()<min_read_len)
173     min_read_len=read.seq.length();
174     if ((int)read.seq.length()>max_read_len)
175     max_read_len=read.seq.length();
176    
177     // daehwan - check this later, it's due to bowtie
178     if (color && read.seq[1] == '4') {
179     ++num_reads_chucked;
180     continue;
181     }
182    
183     if (readmap_loaded && check_readmap(next_id)) {
184     ++num_reads_chucked;
185     ++multimap_chucked;
186     continue;
187     }
188     format_qual_string(read.qual);
189 gpertea 29 std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
190     char counts[256];
191     memset(counts, 0, sizeof(counts));
192     // Count up the bad characters
193     for (unsigned int i = 0; i != read.seq.length(); ++i)
194     {
195     char c = (char)toupper(read.seq[i]);
196     counts[(size_t)c]++;
197     }
198    
199     double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
200     double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
201     double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
202     double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
203     double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
204     double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
205    
206     // Chuck the read if there are at least 5 'N's or if it's mostly
207     // (>90%) 'N's and 'A's
208    
209     if (percent_A > 0.9 ||
210     percent_C > 0.9 ||
211     percent_G > 0.9 ||
212     percent_T > 0.9 ||
213     percent_N >= 0.1 ||
214     percent_4 >= 0.1)
215     {
216     ++num_reads_chucked;
217     }
218     else
219     {
220     if (reads_format == FASTQ or (reads_format == FASTA && quals))
221     {
222 gpertea 135
223     printf("@%u\n%s\n+%s\n%s\n",
224     next_id,
225     read.seq.c_str(),
226     read.name.c_str(),
227     read.qual.c_str());
228 gpertea 29 }
229     else if (reads_format == FASTA)
230     {
231     string qual;
232 gpertea 135 if (color)
233     qual = string(read.seq.length()-1, 'I').c_str();
234 gpertea 29 else
235 gpertea 135 qual = string(read.seq.length(), 'I').c_str();
236 gpertea 29
237 gpertea 135 printf("@%u\n%s\n+%s\n%s\n",
238 gpertea 29 next_id,
239     read.seq.c_str(),
240     read.name.c_str(),
241     qual.c_str());
242     }
243     }
244     } //while !fr.isEof()
245     fr.close();
246     frq.close();
247     } //for each input file
248 gpertea 135 fprintf(stderr, "%u out of %u reads have been filtered out\n",
249     num_reads_chucked, next_id);
250     if (readmap_loaded)
251     fprintf(stderr, "\t(%u filtered out due to %s)\n",
252     multimap_chucked, flt_reads.c_str());
253 gpertea 29 if (fw!=NULL) {
254     fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
255     fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
256 gpertea 135 fprintf(fw, "reads_in =%d\n",next_id);
257     fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
258 gpertea 29 fclose(fw);
259     }
260     }
261    
262     void print_usage()
263     {
264 gpertea 135 fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n");
265     //alternate usage (--ctv_to_num) : doesn't filter out any reads,
266     //but simply convert read names to numeric ids
267 gpertea 29 }
268    
269    
270     int main(int argc, char *argv[])
271     {
272     fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
273     fprintf(stderr, "---------------------------\n");
274    
275     int parse_ret = parse_options(argc, argv, print_usage);
276     if (parse_ret)
277     return parse_ret;
278    
279     if(optind >= argc)
280     {
281     print_usage();
282     return 1;
283     }
284    
285     string reads_file_list = argv[optind++];
286    
287     vector<string> reads_file_names;
288     vector<FZPipe> reads_files;
289     tokenize(reads_file_list, ",",reads_file_names);
290     for (size_t i = 0; i < reads_file_names.size(); ++i)
291     {
292     string fname=reads_file_names[i];
293     string pipecmd=guess_packer(fname, true);
294     if (!pipecmd.empty()) pipecmd.append(" -cd");
295     FZPipe seg_file(fname,pipecmd);
296     if (seg_file.file == NULL) {
297     fprintf(stderr, "Error: cannot open reads file %s for reading\n",
298     reads_file_names[i].c_str());
299     exit(1);
300     }
301     reads_files.push_back(seg_file);
302     }
303    
304     vector<string> quals_file_names;
305     vector<FZPipe> quals_files;
306     if (quals)
307     {
308     string quals_file_list = argv[optind++];
309     tokenize(quals_file_list, ",", quals_file_names);
310     for (size_t i = 0; i < quals_file_names.size(); ++i)
311     {
312     string fname(quals_file_names[i]);
313     string pipecmd=guess_packer(fname, true);
314     FZPipe seg_file(fname, pipecmd);
315     if (seg_file.file == NULL)
316     {
317     fprintf(stderr, "Error: cannot open reads file %s for reading\n",
318     quals_file_names[i].c_str());
319     exit(1);
320     }
321     quals_files.push_back(seg_file);
322     }
323     }
324 gpertea 135 load_readmap();
325 gpertea 29 // Only print to standard out the good reads
326     //TODO: a better, more generic read filtering protocol
327 gpertea 135 if (flt_mappings.empty())
328     process_reads(reads_files, quals_files);
329     else //special use case: filter previous bowtie results
330     flt_reads_and_hits(reads_files);
331 gpertea 29
332     return 0;
333     }