ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
Revision: 223
Committed: Fri Mar 23 22:39:52 2012 UTC (12 years, 6 months ago) by gpertea
File size: 12606 byte(s)
Log Message:
Line File contents
1 /*
2 * polyA_reads.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 9/2/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 * Derived from maq "catfilter", by Heng Li at Sanger
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <stdio.h>
15 #include <cassert>
16 #include <vector>
17 #include <cstring>
18 #include <cstdlib>
19
20 #include "common.h"
21 #include "reads.h"
22 #include "tokenize.h"
23 #include "qual.h"
24
25 //bool fastq_db = true;
26
27 using namespace std;
28
29 void format_qual_string(string& qual_str)
30 {
31 for (size_t i = 0; i < qual_str.size(); ++i)
32 {
33 qual_str[i] = charToPhred33(qual_str[i],
34 solexa_quals,
35 phred64_quals);
36 }
37 }
38
39 bool readmap_loaded=false;
40 vector<bool> readmap; //for filter_multihits
41
42 void load_readmap() {
43 readmap_loaded=false;
44 if (flt_reads.empty()) return;
45 //FZPipe(filter_multihits, false);
46 ReadStream rdstream(flt_reads, NULL, true);
47 /*
48 FILE* rfile=fopen(flt_reads.c_str(),"r");
49 if (rfile==NULL)
50 err_die("Error: cannot read file %s\n", flt_reads.c_str());
51 FLineReader fr(rfile);
52 skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this
53 */
54 Read read;
55 //while (next_fastx_read(fr, read, reads_format)) {
56 while (rdstream.get_direct(read, reads_format)) {
57 uint32_t rnum=(uint32_t)atol(read.name.c_str());
58 if (rnum>=(uint32_t) readmap.size())
59 readmap.resize(rnum+1, false);
60 readmap[rnum] = true;
61 }
62 readmap_loaded=true;
63 //fclose(rfile);
64 }
65
66 bool check_readmap(uint32_t read_id) {
67 if (read_id>=readmap.size()) return false;
68 else return readmap[read_id];
69 }
70
71 void flt_reads_and_hits(vector<string>& reads_files) {
72 if (!readmap_loaded)
73 err_die("Error: filtering reads not enabled, aborting.");
74 if (aux_outfile.empty())
75 err_die("Error: auxiliary output file not provided.");
76 //if (index_outfile.empty())
77 // err_die("Error: index output file not provided.");
78
79 // -- filter mappings:
80 string fext=getFext(flt_mappings);
81 if (fext=="bam") {
82 samfile_t* fbam=samopen(flt_mappings.c_str(), "rb", 0);
83 if (fbam==NULL)
84 err_die("Error opening BAM file %s!\n", flt_mappings.c_str());
85 bam1_t *b = bam_init1();
86 string aux_index(aux_outfile);
87 aux_index+=".index";
88 GBamWriter wbam(aux_outfile.c_str(), fbam->header, aux_index);
89 while (samread(fbam, b) > 0) {
90 char* rname = bam1_qname(b);
91 uint32_t rid=(uint32_t)atol(rname);
92 if (check_readmap(rid)) {
93 //write this mapping into the output file
94 wbam.write(b, rid);
95 }
96 }
97 bam_destroy1(b);
98 samclose(fbam);
99 }
100 else {
101 bool is_sam=false;
102 string s(flt_mappings);
103 for(size_t i=0; i!=s.length(); ++i)
104 s[i] = std::tolower(s[i]);
105 if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
106 string unzcmd=getUnpackCmd(flt_mappings);
107 FZPipe hitsfile(flt_mappings, unzcmd);
108 FLineReader fh(hitsfile);
109 FZPipe outfile;
110 outfile.openWrite(aux_outfile.c_str(), zpacker);
111 if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
112 const char* line;
113 while ((line=fh.nextLine())) {
114 if (is_sam && line[0]=='@') {
115 //copy the header
116 fprintf(outfile.file, "%s\n", line);
117 continue;
118 }
119 char* tab=strchr((char*)line, '\t');
120 if (tab==NULL)
121 err_die("Error: cannot find tab character in %s mappings line:\n%s",
122 flt_mappings.c_str(),line);
123 *tab=0;
124 uint32_t rid = (uint32_t) atol(line);
125 if (rid==0)
126 err_die("Error: invalid read ID (%s) parsed from mapping file %s",
127 line, flt_mappings.c_str());
128 *tab='\t';
129 if (check_readmap(rid)) {
130 fprintf(outfile.file, "%s\n", line);
131 }
132 }
133 outfile.close();
134 hitsfile.close();
135 }
136 // -- now filter reads
137 for (size_t fi = 0; fi < reads_files.size(); ++fi) {
138 //only one file expected here, this is not the initial prep_reads
139 Read read;
140 ReadStream readstream(reads_files[fi], NULL, true);
141 //skip_lines(fr);
142 while (readstream.get_direct(read)) {
143 uint32_t rnum=(uint32_t)atol(read.name.c_str());
144 if (check_readmap(rnum))
145 printf("@%s\n%s\n+%s\n%s\n",
146 read.name.c_str(),
147 read.seq.c_str(),
148 read.alt_name.c_str(),
149 read.qual.c_str());
150 }
151 }
152 }
153
154
155 void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0) {
156 if (wbam==NULL) return;
157 string rnum;
158 str_appendUInt(rnum,rid);
159 string rname(read.name);
160 size_t pl=rname.length()-1;
161 GBamRecord bamrec(rnum.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str());
162 int matenum=0;
163 if (rname[pl-1]=='/') {
164 if (rname[pl]=='1') {
165 bamrec.set_flag(BAM_FREAD1);
166 matenum=1;
167 }
168 else if (rname[pl]=='2') {
169 bamrec.set_flag(BAM_FREAD2);
170 matenum=2;
171 }
172 if (matenum) rname.resize(pl-1);
173 }
174
175 bamrec.add_aux("ZN", 'Z',read.name.length(),(uint8_t*)rname.c_str());
176 if (trashcode) {
177 bamrec.set_flag(BAM_FQCFAIL);
178 bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&trashcode);
179 }
180 wbam->write(bamrec.get_b(), rid);
181 }
182
183 void process_reads(vector<string>& reads_files, vector<FZPipe>& quals_files)
184 {
185 //TODO: add the option to write the garbage reads into separate file(s)
186 int num_reads_chucked = 0;
187 int multimap_chucked = 0;
188 int min_read_len = 20000000;
189 int max_read_len = 0;
190 uint32_t next_id = 0;
191 uint64_t file_offset = 0;
192 FILE* fw=NULL;
193 if (!aux_outfile.empty()) {
194 fw=fopen(aux_outfile.c_str(), "w");
195 if (fw==NULL)
196 err_die("Error: cannot create file %s\n",aux_outfile.c_str());
197 }
198
199 FILE* findex = NULL;
200 GBamWriter* wbam=NULL;
201 FILE* fout=NULL;
202 if (std_outfile.empty()) {
203 fout=stdout;
204 }
205 else { //output file name explicitely given
206 if (getFext(std_outfile)=="bam") {
207 if (sam_header.empty()) err_die("Error: sam header file not provided.\n");
208 wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile);
209 //wbam = new GBamWriter(std_outfile, index_outfile);
210 }
211 else {
212 fout = fopen(std_outfile.c_str(), "w");
213 if (fout==NULL)
214 err_die("Error: cannot create file %s\n", std_outfile.c_str());
215 }
216 }
217 if (wbam==NULL && !index_outfile.empty()) {
218 findex = fopen(index_outfile.c_str(), "w");
219 if (findex == NULL)
220 err_die("Error: cannot create file %s\n", index_outfile.c_str());
221 }
222
223 for (size_t fi = 0; fi < reads_files.size(); ++fi)
224 {
225 Read read;
226 FZPipe* fq=NULL;
227 if (quals)
228 fq = & quals_files[fi];
229 ReadStream reads(reads_files[fi], fq, true);
230
231 while (reads.get_direct(read, reads_format)) {
232 // read.clear();
233 // Get the next read from the file
234 //if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
235 // break;
236
237 ++next_id;
238 //IMPORTANT: to keep paired reads in sync, this must be
239 //incremented BEFORE any reads are chucked !
240 if (read.seq.length()<12) {
241 ++num_reads_chucked;
242 writePrepBam(wbam, read, next_id, 'S');
243 continue;
244 }
245 if ((int)read.seq.length()<min_read_len)
246 min_read_len=read.seq.length();
247 if ((int)read.seq.length()>max_read_len)
248 max_read_len=read.seq.length();
249
250 // daehwan - check this later, it's due to bowtie
251 if (color && read.seq[1] == '4') {
252 ++num_reads_chucked;
253 writePrepBam(wbam, read, next_id, 'c');
254 continue;
255 }
256
257 if (readmap_loaded && check_readmap(next_id)) {
258 ++num_reads_chucked;
259 ++multimap_chucked;
260 writePrepBam(wbam, read, next_id, 'M');
261 continue;
262 }
263 format_qual_string(read.qual);
264 std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
265 char counts[256];
266 memset(counts, 0, sizeof(counts));
267 // Count up the bad characters
268 for (unsigned int i = 0; i != read.seq.length(); ++i)
269 {
270 char c = (char)toupper(read.seq[i]);
271 counts[(size_t)c]++;
272 }
273
274 double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
275 double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
276 double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
277 double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
278 double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
279 double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
280
281 // Chuck the read if there are at least 5 'N's or if it's mostly
282 // (>90%) 'N's and 'A's
283 char trash_code=0;
284 if (percent_A > 0.9 ||
285 percent_C > 0.9 ||
286 percent_G > 0.9 ||
287 percent_T > 0.9)
288 trash_code='L';
289 else if (percent_N >= 0.1 || percent_4 >=0.1)
290 trash_code='N';
291 if (trash_code) {
292 ++num_reads_chucked;
293 writePrepBam(wbam, read, next_id, trash_code);
294 continue;
295 }
296
297 if (wbam) {
298 writePrepBam(wbam, read, next_id);
299 }
300 else {
301 // daehwan - we should not use buf in printf function
302 // because it may contain some control characters such as "\" from quality values.
303 // Here, buf is only used for calculating the file offset
304 char buf[2048] = {0};
305 if (reads_format == FASTQ or (reads_format == FASTA && quals))
306 {
307 sprintf(buf, "@%u\n%s\n+%s\n%s\n",
308 next_id,
309 read.seq.c_str(),
310 read.name.c_str(),
311 read.qual.c_str());
312
313 fprintf(fout, "@%u\n%s\n+%s\n%s\n",
314 next_id,
315 read.seq.c_str(),
316 read.name.c_str(),
317 read.qual.c_str());
318 }
319 else if (reads_format == FASTA)
320 {
321 string qual;
322 if (color)
323 qual = string(read.seq.length()-1, 'I').c_str();
324 else
325 qual = string(read.seq.length(), 'I').c_str();
326
327 sprintf(buf, "@%u\n%s\n+%s\n%s\n",
328 next_id,
329 read.seq.c_str(),
330 read.name.c_str(),
331 qual.c_str());
332
333 fprintf(fout, "@%u\n%s\n+%s\n%s\n",
334 next_id,
335 read.seq.c_str(),
336 read.name.c_str(),
337 qual.c_str());
338 }
339 else
340 {
341 assert(0);
342 }
343
344 if (findex != NULL)
345 {
346 if ((next_id - num_reads_chucked) % INDEX_REC_COUNT == 0)
347 fprintf(findex, "%d\t%lu\n", next_id, file_offset);
348 }
349
350 file_offset += strlen(buf);
351 }
352 } //while !fr.isEof()
353 //fr.close();
354 //frq.close();
355 } //for each input file
356 fprintf(stderr, "%u out of %u reads have been filtered out\n",
357 num_reads_chucked, next_id);
358 if (readmap_loaded)
359 fprintf(stderr, "\t(%u filtered out due to %s)\n",
360 multimap_chucked, flt_reads.c_str());
361 if (wbam) { delete wbam; }
362 if (fout && fout!=stdout) fclose(fout);
363 if (fw!=NULL) {
364 fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
365 fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
366 fprintf(fw, "reads_in =%d\n",next_id);
367 fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
368 fclose(fw);
369 }
370
371 if (findex != NULL)
372 fclose(findex);
373 }
374
375 void print_usage()
376 {
377 fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n");
378 //alternate usage (--ctv_to_num) : doesn't filter out any reads,
379 //but simply convert read names to numeric ids
380 }
381
382
383 int main(int argc, char *argv[])
384 {
385 fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
386 fprintf(stderr, "---------------------------\n");
387
388 int parse_ret = parse_options(argc, argv, print_usage);
389 if (parse_ret)
390 return parse_ret;
391
392 if(optind >= argc)
393 {
394 print_usage();
395 return 1;
396 }
397
398 string reads_file_list = argv[optind++];
399
400 vector<string> reads_filenames;
401 tokenize(reads_file_list, ",",reads_filenames);
402 vector<string> quals_file_names;
403 vector<FZPipe> quals_files;
404 if (quals)
405 {
406 string quals_file_list = argv[optind++];
407 tokenize(quals_file_list, ",", quals_file_names);
408 for (size_t i = 0; i < quals_file_names.size(); ++i)
409 {
410 FZPipe seg_file(quals_file_names[i], true);
411 if (seg_file.file == NULL)
412 {
413 fprintf(stderr, "Error: cannot open reads file %s for reading\n",
414 quals_file_names[i].c_str());
415 exit(1);
416 }
417 quals_files.push_back(seg_file);
418 }
419 }
420 load_readmap();
421 // Only print to standard out the good reads
422 //TODO: a better, more generic read filtering protocol
423 if (flt_mappings.empty())
424 process_reads(reads_filenames, quals_files);
425 else //special use case: filter previous bowtie results
426 flt_reads_and_hits(reads_filenames);
427
428 return 0;
429 }