1 |
/* |
2 |
* polyA_reads.cpp |
3 |
* TopHat |
4 |
* |
5 |
* Created by Cole Trapnell on 9/2/08. |
6 |
* Copyright 2008 Cole Trapnell. All rights reserved. |
7 |
* Derived from maq "catfilter", by Heng Li at Sanger |
8 |
*/ |
9 |
|
10 |
#ifdef HAVE_CONFIG_H |
11 |
#include <config.h> |
12 |
#endif |
13 |
|
14 |
#include <stdio.h> |
15 |
#include <cassert> |
16 |
#include <vector> |
17 |
#include <cstring> |
18 |
#include <cstdlib> |
19 |
|
20 |
#include "common.h" |
21 |
#include "reads.h" |
22 |
#include "tokenize.h" |
23 |
#include "qual.h" |
24 |
|
25 |
//bool fastq_db = true; |
26 |
|
27 |
using namespace std; |
28 |
|
29 |
void format_qual_string(string& qual_str) |
30 |
{ |
31 |
for (size_t i = 0; i < qual_str.size(); ++i) |
32 |
{ |
33 |
qual_str[i] = charToPhred33(qual_str[i], |
34 |
solexa_quals, |
35 |
phred64_quals); |
36 |
} |
37 |
} |
38 |
|
39 |
bool readmap_loaded=false; |
40 |
vector<bool> readmap; //for filter_multihits |
41 |
|
42 |
void load_readmap() { |
43 |
readmap_loaded=false; |
44 |
if (flt_reads.empty()) return; |
45 |
//FZPipe(filter_multihits, false); |
46 |
ReadStream rdstream(flt_reads, NULL, true); |
47 |
/* |
48 |
FILE* rfile=fopen(flt_reads.c_str(),"r"); |
49 |
if (rfile==NULL) |
50 |
err_die("Error: cannot read file %s\n", flt_reads.c_str()); |
51 |
FLineReader fr(rfile); |
52 |
skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this |
53 |
*/ |
54 |
Read read; |
55 |
//while (next_fastx_read(fr, read, reads_format)) { |
56 |
while (rdstream.get_direct(read, reads_format)) { |
57 |
uint32_t rnum=(uint32_t)atol(read.name.c_str()); |
58 |
if (rnum>=(uint32_t) readmap.size()) |
59 |
readmap.resize(rnum+1, false); |
60 |
readmap[rnum] = true; |
61 |
} |
62 |
readmap_loaded=true; |
63 |
//fclose(rfile); |
64 |
} |
65 |
|
66 |
bool check_readmap(uint32_t read_id) { |
67 |
if (read_id>=readmap.size()) return false; |
68 |
else return readmap[read_id]; |
69 |
} |
70 |
|
71 |
void flt_reads_and_hits(vector<string>& reads_files) { |
72 |
if (!readmap_loaded) |
73 |
err_die("Error: filtering reads not enabled, aborting."); |
74 |
if (aux_outfile.empty()) |
75 |
err_die("Error: auxiliary output file not provided."); |
76 |
//if (index_outfile.empty()) |
77 |
// err_die("Error: index output file not provided."); |
78 |
|
79 |
// -- filter mappings: |
80 |
string fext=getFext(flt_mappings); |
81 |
if (fext=="bam") { |
82 |
samfile_t* fbam=samopen(flt_mappings.c_str(), "rb", 0); |
83 |
if (fbam==NULL) |
84 |
err_die("Error opening BAM file %s!\n", flt_mappings.c_str()); |
85 |
bam1_t *b = bam_init1(); |
86 |
string aux_index(aux_outfile); |
87 |
aux_index+=".index"; |
88 |
GBamWriter wbam(aux_outfile.c_str(), fbam->header, aux_index); |
89 |
while (samread(fbam, b) > 0) { |
90 |
char* rname = bam1_qname(b); |
91 |
uint32_t rid=(uint32_t)atol(rname); |
92 |
if (check_readmap(rid)) { |
93 |
//write this mapping into the output file |
94 |
wbam.write(b, rid); |
95 |
} |
96 |
} |
97 |
bam_destroy1(b); |
98 |
samclose(fbam); |
99 |
} |
100 |
else { |
101 |
bool is_sam=false; |
102 |
string s(flt_mappings); |
103 |
for(size_t i=0; i!=s.length(); ++i) |
104 |
s[i] = std::tolower(s[i]); |
105 |
if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true; |
106 |
string unzcmd=getUnpackCmd(flt_mappings); |
107 |
FZPipe hitsfile(flt_mappings, unzcmd); |
108 |
FLineReader fh(hitsfile); |
109 |
FZPipe outfile; |
110 |
outfile.openWrite(aux_outfile.c_str(), zpacker); |
111 |
if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str()); |
112 |
const char* line; |
113 |
while ((line=fh.nextLine())) { |
114 |
if (is_sam && line[0]=='@') { |
115 |
//copy the header |
116 |
fprintf(outfile.file, "%s\n", line); |
117 |
continue; |
118 |
} |
119 |
char* tab=strchr((char*)line, '\t'); |
120 |
if (tab==NULL) |
121 |
err_die("Error: cannot find tab character in %s mappings line:\n%s", |
122 |
flt_mappings.c_str(),line); |
123 |
*tab=0; |
124 |
uint32_t rid = (uint32_t) atol(line); |
125 |
if (rid==0) |
126 |
err_die("Error: invalid read ID (%s) parsed from mapping file %s", |
127 |
line, flt_mappings.c_str()); |
128 |
*tab='\t'; |
129 |
if (check_readmap(rid)) { |
130 |
fprintf(outfile.file, "%s\n", line); |
131 |
} |
132 |
} |
133 |
outfile.close(); |
134 |
hitsfile.close(); |
135 |
} |
136 |
// -- now filter reads |
137 |
for (size_t fi = 0; fi < reads_files.size(); ++fi) { |
138 |
//only one file expected here, this is not the initial prep_reads |
139 |
Read read; |
140 |
ReadStream readstream(reads_files[fi], NULL, true); |
141 |
//skip_lines(fr); |
142 |
while (readstream.get_direct(read)) { |
143 |
uint32_t rnum=(uint32_t)atol(read.name.c_str()); |
144 |
if (check_readmap(rnum)) |
145 |
printf("@%s\n%s\n+%s\n%s\n", |
146 |
read.name.c_str(), |
147 |
read.seq.c_str(), |
148 |
read.alt_name.c_str(), |
149 |
read.qual.c_str()); |
150 |
} |
151 |
} |
152 |
} |
153 |
|
154 |
|
155 |
void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0) { |
156 |
if (wbam==NULL) return; |
157 |
string rnum; |
158 |
str_appendUInt(rnum,rid); |
159 |
string rname(read.name); |
160 |
size_t pl=rname.length()-1; |
161 |
GBamRecord bamrec(rnum.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str()); |
162 |
int matenum=0; |
163 |
if (rname[pl-1]=='/') { |
164 |
if (rname[pl]=='1') { |
165 |
bamrec.set_flag(BAM_FREAD1); |
166 |
matenum=1; |
167 |
} |
168 |
else if (rname[pl]=='2') { |
169 |
bamrec.set_flag(BAM_FREAD2); |
170 |
matenum=2; |
171 |
} |
172 |
if (matenum) rname.resize(pl-1); |
173 |
} |
174 |
|
175 |
bamrec.add_aux("ZN", 'Z',read.name.length(),(uint8_t*)rname.c_str()); |
176 |
if (trashcode) { |
177 |
bamrec.set_flag(BAM_FQCFAIL); |
178 |
bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&trashcode); |
179 |
} |
180 |
wbam->write(bamrec.get_b(), rid); |
181 |
} |
182 |
|
183 |
void process_reads(vector<string>& reads_files, vector<FZPipe>& quals_files) |
184 |
{ |
185 |
//TODO: add the option to write the garbage reads into separate file(s) |
186 |
int num_reads_chucked = 0; |
187 |
int multimap_chucked = 0; |
188 |
int min_read_len = 20000000; |
189 |
int max_read_len = 0; |
190 |
uint32_t next_id = 0; |
191 |
uint64_t file_offset = 0; |
192 |
FILE* fw=NULL; |
193 |
if (!aux_outfile.empty()) { |
194 |
fw=fopen(aux_outfile.c_str(), "w"); |
195 |
if (fw==NULL) |
196 |
err_die("Error: cannot create file %s\n",aux_outfile.c_str()); |
197 |
} |
198 |
|
199 |
FILE* findex = NULL; |
200 |
GBamWriter* wbam=NULL; |
201 |
FILE* fout=NULL; |
202 |
if (std_outfile.empty()) { |
203 |
fout=stdout; |
204 |
} |
205 |
else { //output file name explicitely given |
206 |
if (getFext(std_outfile)=="bam") { |
207 |
if (sam_header.empty()) err_die("Error: sam header file not provided.\n"); |
208 |
wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile); |
209 |
//wbam = new GBamWriter(std_outfile, index_outfile); |
210 |
} |
211 |
else { |
212 |
fout = fopen(std_outfile.c_str(), "w"); |
213 |
if (fout==NULL) |
214 |
err_die("Error: cannot create file %s\n", std_outfile.c_str()); |
215 |
} |
216 |
} |
217 |
if (wbam==NULL && !index_outfile.empty()) { |
218 |
findex = fopen(index_outfile.c_str(), "w"); |
219 |
if (findex == NULL) |
220 |
err_die("Error: cannot create file %s\n", index_outfile.c_str()); |
221 |
} |
222 |
|
223 |
for (size_t fi = 0; fi < reads_files.size(); ++fi) |
224 |
{ |
225 |
Read read; |
226 |
FZPipe* fq=NULL; |
227 |
if (quals) |
228 |
fq = & quals_files[fi]; |
229 |
ReadStream reads(reads_files[fi], fq, true); |
230 |
|
231 |
while (reads.get_direct(read, reads_format)) { |
232 |
// read.clear(); |
233 |
// Get the next read from the file |
234 |
//if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL))) |
235 |
// break; |
236 |
|
237 |
++next_id; |
238 |
//IMPORTANT: to keep paired reads in sync, this must be |
239 |
//incremented BEFORE any reads are chucked ! |
240 |
if (read.seq.length()<12) { |
241 |
++num_reads_chucked; |
242 |
writePrepBam(wbam, read, next_id, 'S'); |
243 |
continue; |
244 |
} |
245 |
if ((int)read.seq.length()<min_read_len) |
246 |
min_read_len=read.seq.length(); |
247 |
if ((int)read.seq.length()>max_read_len) |
248 |
max_read_len=read.seq.length(); |
249 |
|
250 |
// daehwan - check this later, it's due to bowtie |
251 |
if (color && read.seq[1] == '4') { |
252 |
++num_reads_chucked; |
253 |
writePrepBam(wbam, read, next_id, 'c'); |
254 |
continue; |
255 |
} |
256 |
|
257 |
if (readmap_loaded && check_readmap(next_id)) { |
258 |
++num_reads_chucked; |
259 |
++multimap_chucked; |
260 |
writePrepBam(wbam, read, next_id, 'M'); |
261 |
continue; |
262 |
} |
263 |
format_qual_string(read.qual); |
264 |
std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper); |
265 |
char counts[256]; |
266 |
memset(counts, 0, sizeof(counts)); |
267 |
// Count up the bad characters |
268 |
for (unsigned int i = 0; i != read.seq.length(); ++i) |
269 |
{ |
270 |
char c = (char)toupper(read.seq[i]); |
271 |
counts[(size_t)c]++; |
272 |
} |
273 |
|
274 |
double percent_A = (double)(counts[(size_t)'A']) / read.seq.length(); |
275 |
double percent_C = (double)(counts[(size_t)'C']) / read.seq.length(); |
276 |
double percent_G = (double)(counts[(size_t)'G']) / read.seq.length(); |
277 |
double percent_T = (double)(counts[(size_t)'T']) / read.seq.length(); |
278 |
double percent_N = (double)(counts[(size_t)'N']) / read.seq.length(); |
279 |
double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length(); |
280 |
|
281 |
// Chuck the read if there are at least 5 'N's or if it's mostly |
282 |
// (>90%) 'N's and 'A's |
283 |
char trash_code=0; |
284 |
if (percent_A > 0.9 || |
285 |
percent_C > 0.9 || |
286 |
percent_G > 0.9 || |
287 |
percent_T > 0.9) |
288 |
trash_code='L'; |
289 |
else if (percent_N >= 0.1 || percent_4 >=0.1) |
290 |
trash_code='N'; |
291 |
if (trash_code) { |
292 |
++num_reads_chucked; |
293 |
writePrepBam(wbam, read, next_id, trash_code); |
294 |
continue; |
295 |
} |
296 |
|
297 |
if (wbam) { |
298 |
writePrepBam(wbam, read, next_id); |
299 |
} |
300 |
else { |
301 |
// daehwan - we should not use buf in printf function |
302 |
// because it may contain some control characters such as "\" from quality values. |
303 |
// Here, buf is only used for calculating the file offset |
304 |
char buf[2048] = {0}; |
305 |
if (reads_format == FASTQ or (reads_format == FASTA && quals)) |
306 |
{ |
307 |
sprintf(buf, "@%u\n%s\n+%s\n%s\n", |
308 |
next_id, |
309 |
read.seq.c_str(), |
310 |
read.name.c_str(), |
311 |
read.qual.c_str()); |
312 |
|
313 |
fprintf(fout, "@%u\n%s\n+%s\n%s\n", |
314 |
next_id, |
315 |
read.seq.c_str(), |
316 |
read.name.c_str(), |
317 |
read.qual.c_str()); |
318 |
} |
319 |
else if (reads_format == FASTA) |
320 |
{ |
321 |
string qual; |
322 |
if (color) |
323 |
qual = string(read.seq.length()-1, 'I').c_str(); |
324 |
else |
325 |
qual = string(read.seq.length(), 'I').c_str(); |
326 |
|
327 |
sprintf(buf, "@%u\n%s\n+%s\n%s\n", |
328 |
next_id, |
329 |
read.seq.c_str(), |
330 |
read.name.c_str(), |
331 |
qual.c_str()); |
332 |
|
333 |
fprintf(fout, "@%u\n%s\n+%s\n%s\n", |
334 |
next_id, |
335 |
read.seq.c_str(), |
336 |
read.name.c_str(), |
337 |
qual.c_str()); |
338 |
} |
339 |
else |
340 |
{ |
341 |
assert(0); |
342 |
} |
343 |
|
344 |
if (findex != NULL) |
345 |
{ |
346 |
if ((next_id - num_reads_chucked) % INDEX_REC_COUNT == 0) |
347 |
fprintf(findex, "%d\t%lu\n", next_id, file_offset); |
348 |
} |
349 |
|
350 |
file_offset += strlen(buf); |
351 |
} |
352 |
} //while !fr.isEof() |
353 |
//fr.close(); |
354 |
//frq.close(); |
355 |
} //for each input file |
356 |
fprintf(stderr, "%u out of %u reads have been filtered out\n", |
357 |
num_reads_chucked, next_id); |
358 |
if (readmap_loaded) |
359 |
fprintf(stderr, "\t(%u filtered out due to %s)\n", |
360 |
multimap_chucked, flt_reads.c_str()); |
361 |
if (wbam) { delete wbam; } |
362 |
if (fout && fout!=stdout) fclose(fout); |
363 |
if (fw!=NULL) { |
364 |
fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0)); |
365 |
fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0)); |
366 |
fprintf(fw, "reads_in =%d\n",next_id); |
367 |
fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked); |
368 |
fclose(fw); |
369 |
} |
370 |
|
371 |
if (findex != NULL) |
372 |
fclose(findex); |
373 |
} |
374 |
|
375 |
void print_usage() |
376 |
{ |
377 |
fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n"); |
378 |
//alternate usage (--ctv_to_num) : doesn't filter out any reads, |
379 |
//but simply convert read names to numeric ids |
380 |
} |
381 |
|
382 |
|
383 |
int main(int argc, char *argv[]) |
384 |
{ |
385 |
fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); |
386 |
fprintf(stderr, "---------------------------\n"); |
387 |
|
388 |
int parse_ret = parse_options(argc, argv, print_usage); |
389 |
if (parse_ret) |
390 |
return parse_ret; |
391 |
|
392 |
if(optind >= argc) |
393 |
{ |
394 |
print_usage(); |
395 |
return 1; |
396 |
} |
397 |
|
398 |
string reads_file_list = argv[optind++]; |
399 |
|
400 |
vector<string> reads_filenames; |
401 |
tokenize(reads_file_list, ",",reads_filenames); |
402 |
vector<string> quals_file_names; |
403 |
vector<FZPipe> quals_files; |
404 |
if (quals) |
405 |
{ |
406 |
string quals_file_list = argv[optind++]; |
407 |
tokenize(quals_file_list, ",", quals_file_names); |
408 |
for (size_t i = 0; i < quals_file_names.size(); ++i) |
409 |
{ |
410 |
FZPipe seg_file(quals_file_names[i], true); |
411 |
if (seg_file.file == NULL) |
412 |
{ |
413 |
fprintf(stderr, "Error: cannot open reads file %s for reading\n", |
414 |
quals_file_names[i].c_str()); |
415 |
exit(1); |
416 |
} |
417 |
quals_files.push_back(seg_file); |
418 |
} |
419 |
} |
420 |
load_readmap(); |
421 |
// Only print to standard out the good reads |
422 |
//TODO: a better, more generic read filtering protocol |
423 |
if (flt_mappings.empty()) |
424 |
process_reads(reads_filenames, quals_files); |
425 |
else //special use case: filter previous bowtie results |
426 |
flt_reads_and_hits(reads_filenames); |
427 |
|
428 |
return 0; |
429 |
} |