1 |
gpertea |
29 |
/* |
2 |
|
|
* polyA_reads.cpp |
3 |
|
|
* TopHat |
4 |
|
|
* |
5 |
|
|
* Created by Cole Trapnell on 9/2/08. |
6 |
|
|
* Copyright 2008 Cole Trapnell. All rights reserved. |
7 |
|
|
* Derived from maq "catfilter", by Heng Li at Sanger |
8 |
|
|
*/ |
9 |
|
|
|
10 |
|
|
#ifdef HAVE_CONFIG_H |
11 |
|
|
#include <config.h> |
12 |
|
|
#endif |
13 |
|
|
|
14 |
|
|
#include <stdio.h> |
15 |
|
|
#include <cassert> |
16 |
|
|
#include <vector> |
17 |
|
|
#include <cstring> |
18 |
|
|
#include <cstdlib> |
19 |
|
|
|
20 |
|
|
#include "common.h" |
21 |
|
|
#include "reads.h" |
22 |
|
|
#include "tokenize.h" |
23 |
|
|
#include "qual.h" |
24 |
|
|
|
25 |
|
|
//bool fastq_db = true; |
26 |
|
|
|
27 |
|
|
using namespace std; |
28 |
|
|
|
29 |
|
|
void format_qual_string(string& qual_str) |
30 |
|
|
{ |
31 |
|
|
for (size_t i = 0; i < qual_str.size(); ++i) |
32 |
|
|
{ |
33 |
|
|
qual_str[i] = charToPhred33(qual_str[i], |
34 |
|
|
solexa_quals, |
35 |
|
|
phred64_quals); |
36 |
|
|
} |
37 |
|
|
} |
38 |
|
|
|
39 |
gpertea |
135 |
bool readmap_loaded=false; |
40 |
|
|
vector<bool> readmap; //for filter_multihits |
41 |
gpertea |
29 |
|
42 |
gpertea |
135 |
void load_readmap() { |
43 |
|
|
readmap_loaded=false; |
44 |
|
|
if (flt_reads.empty()) return; |
45 |
|
|
//FZPipe(filter_multihits, false); |
46 |
gpertea |
218 |
ReadStream rdstream(flt_reads, NULL, true); |
47 |
|
|
/* |
48 |
gpertea |
135 |
FILE* rfile=fopen(flt_reads.c_str(),"r"); |
49 |
|
|
if (rfile==NULL) |
50 |
|
|
err_die("Error: cannot read file %s\n", flt_reads.c_str()); |
51 |
|
|
FLineReader fr(rfile); |
52 |
|
|
skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this |
53 |
gpertea |
218 |
*/ |
54 |
gpertea |
135 |
Read read; |
55 |
gpertea |
218 |
//while (next_fastx_read(fr, read, reads_format)) { |
56 |
|
|
while (rdstream.get_direct(read, reads_format)) { |
57 |
gpertea |
135 |
uint32_t rnum=(uint32_t)atol(read.name.c_str()); |
58 |
|
|
if (rnum>=(uint32_t) readmap.size()) |
59 |
|
|
readmap.resize(rnum+1, false); |
60 |
|
|
readmap[rnum] = true; |
61 |
|
|
} |
62 |
|
|
readmap_loaded=true; |
63 |
gpertea |
218 |
//fclose(rfile); |
64 |
gpertea |
135 |
} |
65 |
|
|
|
66 |
|
|
bool check_readmap(uint32_t read_id) { |
67 |
|
|
if (read_id>=readmap.size()) return false; |
68 |
|
|
else return readmap[read_id]; |
69 |
|
|
} |
70 |
gpertea |
217 |
|
71 |
|
|
void flt_reads_and_hits(vector<string>& reads_files) { |
72 |
gpertea |
135 |
if (!readmap_loaded) |
73 |
|
|
err_die("Error: filtering reads not enabled, aborting."); |
74 |
|
|
if (aux_outfile.empty()) |
75 |
|
|
err_die("Error: auxiliary output file not provided."); |
76 |
gpertea |
217 |
//if (index_outfile.empty()) |
77 |
|
|
// err_die("Error: index output file not provided."); |
78 |
gpertea |
154 |
|
79 |
gpertea |
135 |
// -- filter mappings: |
80 |
|
|
string fext=getFext(flt_mappings); |
81 |
|
|
if (fext=="bam") { |
82 |
gpertea |
218 |
samfile_t* fbam=samopen(flt_mappings.c_str(), "rb", 0); |
83 |
|
|
if (fbam==NULL) |
84 |
|
|
err_die("Error opening BAM file %s!\n", flt_mappings.c_str()); |
85 |
|
|
bam1_t *b = bam_init1(); |
86 |
|
|
string aux_index(aux_outfile); |
87 |
|
|
aux_index+=".index"; |
88 |
|
|
GBamWriter wbam(aux_outfile.c_str(), fbam->header, aux_index); |
89 |
|
|
while (samread(fbam, b) > 0) { |
90 |
|
|
char* rname = bam1_qname(b); |
91 |
|
|
uint32_t rid=(uint32_t)atol(rname); |
92 |
|
|
if (check_readmap(rid)) { |
93 |
|
|
//write this mapping into the output file |
94 |
|
|
wbam.write(b, rid); |
95 |
|
|
} |
96 |
|
|
} |
97 |
|
|
bam_destroy1(b); |
98 |
|
|
samclose(fbam); |
99 |
gpertea |
135 |
} |
100 |
gpertea |
218 |
else { |
101 |
|
|
bool is_sam=false; |
102 |
|
|
string s(flt_mappings); |
103 |
|
|
for(size_t i=0; i!=s.length(); ++i) |
104 |
|
|
s[i] = std::tolower(s[i]); |
105 |
|
|
if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true; |
106 |
|
|
string unzcmd=getUnpackCmd(flt_mappings); |
107 |
|
|
FZPipe hitsfile(flt_mappings, unzcmd); |
108 |
|
|
FLineReader fh(hitsfile); |
109 |
|
|
FZPipe outfile; |
110 |
|
|
outfile.openWrite(aux_outfile.c_str(), zpacker); |
111 |
|
|
if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str()); |
112 |
|
|
const char* line; |
113 |
|
|
while ((line=fh.nextLine())) { |
114 |
|
|
if (is_sam && line[0]=='@') { |
115 |
|
|
//copy the header |
116 |
|
|
fprintf(outfile.file, "%s\n", line); |
117 |
|
|
continue; |
118 |
|
|
} |
119 |
|
|
char* tab=strchr((char*)line, '\t'); |
120 |
|
|
if (tab==NULL) |
121 |
|
|
err_die("Error: cannot find tab character in %s mappings line:\n%s", |
122 |
|
|
flt_mappings.c_str(),line); |
123 |
|
|
*tab=0; |
124 |
|
|
uint32_t rid = (uint32_t) atol(line); |
125 |
|
|
if (rid==0) |
126 |
|
|
err_die("Error: invalid read ID (%s) parsed from mapping file %s", |
127 |
|
|
line, flt_mappings.c_str()); |
128 |
|
|
*tab='\t'; |
129 |
|
|
if (check_readmap(rid)) { |
130 |
|
|
fprintf(outfile.file, "%s\n", line); |
131 |
|
|
} |
132 |
|
|
} |
133 |
|
|
outfile.close(); |
134 |
|
|
hitsfile.close(); |
135 |
|
|
} |
136 |
gpertea |
135 |
// -- now filter reads |
137 |
|
|
for (size_t fi = 0; fi < reads_files.size(); ++fi) { |
138 |
|
|
//only one file expected here, this is not the initial prep_reads |
139 |
|
|
Read read; |
140 |
gpertea |
217 |
ReadStream readstream(reads_files[fi], NULL, true); |
141 |
gpertea |
135 |
//skip_lines(fr); |
142 |
gpertea |
217 |
while (readstream.get_direct(read)) { |
143 |
gpertea |
135 |
uint32_t rnum=(uint32_t)atol(read.name.c_str()); |
144 |
|
|
if (check_readmap(rnum)) |
145 |
|
|
printf("@%s\n%s\n+%s\n%s\n", |
146 |
|
|
read.name.c_str(), |
147 |
|
|
read.seq.c_str(), |
148 |
|
|
read.alt_name.c_str(), |
149 |
|
|
read.qual.c_str()); |
150 |
|
|
} |
151 |
|
|
} |
152 |
|
|
} |
153 |
|
|
|
154 |
|
|
|
155 |
gpertea |
218 |
void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0) { |
156 |
|
|
if (wbam==NULL) return; |
157 |
gpertea |
220 |
string rnum; |
158 |
|
|
str_appendInt(rnum,rid); |
159 |
|
|
string rname(read.name); |
160 |
|
|
size_t pl=rname.length()-1; |
161 |
|
|
GBamRecord bamrec(rnum.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str()); |
162 |
|
|
int matenum=0; |
163 |
|
|
if (rname[pl-1]=='/') { |
164 |
|
|
if (rname[pl]=='1') { |
165 |
|
|
bamrec.set_flag(BAM_FREAD1); |
166 |
|
|
matenum=1; |
167 |
|
|
} |
168 |
|
|
else if (rname[pl]=='2') { |
169 |
|
|
bamrec.set_flag(BAM_FREAD2); |
170 |
|
|
matenum=2; |
171 |
|
|
} |
172 |
|
|
if (matenum) rname.resize(pl-1); |
173 |
|
|
} |
174 |
|
|
|
175 |
|
|
bamrec.add_aux("ZN", 'Z',read.name.length(),(uint8_t*)rname.c_str()); |
176 |
gpertea |
218 |
if (trashcode) { |
177 |
gpertea |
219 |
bamrec.set_flag(BAM_FQCFAIL); |
178 |
gpertea |
218 |
bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&trashcode); |
179 |
|
|
} |
180 |
|
|
wbam->write(bamrec.get_b(), rid); |
181 |
|
|
} |
182 |
|
|
|
183 |
gpertea |
217 |
void process_reads(vector<string>& reads_files, vector<FZPipe>& quals_files) |
184 |
gpertea |
29 |
{ |
185 |
gpertea |
135 |
//TODO: add the option to write the garbage reads into separate file(s) |
186 |
|
|
int num_reads_chucked = 0; |
187 |
gpertea |
154 |
int multimap_chucked = 0; |
188 |
gpertea |
135 |
int min_read_len = 20000000; |
189 |
gpertea |
29 |
int max_read_len = 0; |
190 |
gpertea |
135 |
uint32_t next_id = 0; |
191 |
gpertea |
154 |
uint64_t file_offset = 0; |
192 |
gpertea |
29 |
FILE* fw=NULL; |
193 |
|
|
if (!aux_outfile.empty()) { |
194 |
|
|
fw=fopen(aux_outfile.c_str(), "w"); |
195 |
|
|
if (fw==NULL) |
196 |
|
|
err_die("Error: cannot create file %s\n",aux_outfile.c_str()); |
197 |
|
|
} |
198 |
|
|
|
199 |
gpertea |
154 |
FILE* findex = NULL; |
200 |
gpertea |
217 |
GBamWriter* wbam=NULL; |
201 |
|
|
FILE* fout=NULL; |
202 |
|
|
if (std_outfile.empty()) { |
203 |
|
|
fout=stdout; |
204 |
|
|
} |
205 |
|
|
else { //output file name explicitely given |
206 |
|
|
if (getFext(std_outfile)=="bam") { |
207 |
|
|
if (sam_header.empty()) err_die("Error: sam header file not provided.\n"); |
208 |
|
|
wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile); |
209 |
gpertea |
218 |
//wbam = new GBamWriter(std_outfile, index_outfile); |
210 |
gpertea |
217 |
} |
211 |
|
|
else { |
212 |
|
|
fout = fopen(std_outfile.c_str(), "w"); |
213 |
|
|
if (fout==NULL) |
214 |
|
|
err_die("Error: cannot create file %s\n", std_outfile.c_str()); |
215 |
|
|
} |
216 |
|
|
} |
217 |
|
|
if (wbam==NULL && !index_outfile.empty()) { |
218 |
gpertea |
154 |
findex = fopen(index_outfile.c_str(), "w"); |
219 |
|
|
if (findex == NULL) |
220 |
|
|
err_die("Error: cannot create file %s\n", index_outfile.c_str()); |
221 |
|
|
} |
222 |
|
|
|
223 |
gpertea |
29 |
for (size_t fi = 0; fi < reads_files.size(); ++fi) |
224 |
|
|
{ |
225 |
|
|
Read read; |
226 |
gpertea |
207 |
FZPipe* fq=NULL; |
227 |
gpertea |
29 |
if (quals) |
228 |
gpertea |
207 |
fq = & quals_files[fi]; |
229 |
gpertea |
217 |
ReadStream reads(reads_files[fi], fq, true); |
230 |
gpertea |
29 |
|
231 |
gpertea |
207 |
while (reads.get_direct(read, reads_format)) { |
232 |
|
|
// read.clear(); |
233 |
gpertea |
29 |
// Get the next read from the file |
234 |
gpertea |
207 |
//if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL))) |
235 |
|
|
// break; |
236 |
gpertea |
135 |
|
237 |
|
|
++next_id; |
238 |
|
|
//IMPORTANT: to keep paired reads in sync, this must be |
239 |
|
|
//incremented BEFORE any reads are chucked ! |
240 |
gpertea |
29 |
if (read.seq.length()<12) { |
241 |
|
|
++num_reads_chucked; |
242 |
gpertea |
218 |
writePrepBam(wbam, read, next_id, 'S'); |
243 |
gpertea |
29 |
continue; |
244 |
|
|
} |
245 |
gpertea |
135 |
if ((int)read.seq.length()<min_read_len) |
246 |
|
|
min_read_len=read.seq.length(); |
247 |
|
|
if ((int)read.seq.length()>max_read_len) |
248 |
|
|
max_read_len=read.seq.length(); |
249 |
|
|
|
250 |
|
|
// daehwan - check this later, it's due to bowtie |
251 |
|
|
if (color && read.seq[1] == '4') { |
252 |
|
|
++num_reads_chucked; |
253 |
gpertea |
218 |
writePrepBam(wbam, read, next_id, 'c'); |
254 |
gpertea |
135 |
continue; |
255 |
|
|
} |
256 |
|
|
|
257 |
|
|
if (readmap_loaded && check_readmap(next_id)) { |
258 |
|
|
++num_reads_chucked; |
259 |
|
|
++multimap_chucked; |
260 |
gpertea |
218 |
writePrepBam(wbam, read, next_id, 'M'); |
261 |
gpertea |
135 |
continue; |
262 |
|
|
} |
263 |
gpertea |
207 |
format_qual_string(read.qual); |
264 |
gpertea |
29 |
std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper); |
265 |
|
|
char counts[256]; |
266 |
|
|
memset(counts, 0, sizeof(counts)); |
267 |
|
|
// Count up the bad characters |
268 |
|
|
for (unsigned int i = 0; i != read.seq.length(); ++i) |
269 |
|
|
{ |
270 |
|
|
char c = (char)toupper(read.seq[i]); |
271 |
|
|
counts[(size_t)c]++; |
272 |
|
|
} |
273 |
|
|
|
274 |
|
|
double percent_A = (double)(counts[(size_t)'A']) / read.seq.length(); |
275 |
|
|
double percent_C = (double)(counts[(size_t)'C']) / read.seq.length(); |
276 |
|
|
double percent_G = (double)(counts[(size_t)'G']) / read.seq.length(); |
277 |
|
|
double percent_T = (double)(counts[(size_t)'T']) / read.seq.length(); |
278 |
|
|
double percent_N = (double)(counts[(size_t)'N']) / read.seq.length(); |
279 |
|
|
double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length(); |
280 |
|
|
|
281 |
|
|
// Chuck the read if there are at least 5 'N's or if it's mostly |
282 |
|
|
// (>90%) 'N's and 'A's |
283 |
gpertea |
218 |
char trash_code=0; |
284 |
gpertea |
220 |
if (percent_A > 0.8 || |
285 |
|
|
percent_C > 0.8 || |
286 |
|
|
percent_G > 0.8 || |
287 |
|
|
percent_T > 0.8) |
288 |
gpertea |
218 |
trash_code='L'; |
289 |
|
|
else if (percent_N >= 0.1 || percent_4 >=0.1) |
290 |
|
|
trash_code='N'; |
291 |
|
|
if (trash_code) { |
292 |
|
|
++num_reads_chucked; |
293 |
|
|
writePrepBam(wbam, read, next_id, trash_code); |
294 |
|
|
continue; |
295 |
|
|
} |
296 |
gpertea |
135 |
|
297 |
gpertea |
218 |
if (wbam) { |
298 |
|
|
writePrepBam(wbam, read, next_id); |
299 |
|
|
} |
300 |
|
|
else { |
301 |
|
|
// daehwan - we should not use buf in printf function |
302 |
|
|
// because it may contain some control characters such as "\" from quality values. |
303 |
|
|
// Here, buf is only used for calculating the file offset |
304 |
|
|
char buf[2048] = {0}; |
305 |
|
|
if (reads_format == FASTQ or (reads_format == FASTA && quals)) |
306 |
|
|
{ |
307 |
|
|
sprintf(buf, "@%u\n%s\n+%s\n%s\n", |
308 |
|
|
next_id, |
309 |
|
|
read.seq.c_str(), |
310 |
|
|
read.name.c_str(), |
311 |
|
|
read.qual.c_str()); |
312 |
gpertea |
29 |
|
313 |
gpertea |
218 |
fprintf(fout, "@%u\n%s\n+%s\n%s\n", |
314 |
|
|
next_id, |
315 |
|
|
read.seq.c_str(), |
316 |
|
|
read.name.c_str(), |
317 |
|
|
read.qual.c_str()); |
318 |
|
|
} |
319 |
|
|
else if (reads_format == FASTA) |
320 |
|
|
{ |
321 |
|
|
string qual; |
322 |
|
|
if (color) |
323 |
|
|
qual = string(read.seq.length()-1, 'I').c_str(); |
324 |
|
|
else |
325 |
|
|
qual = string(read.seq.length(), 'I').c_str(); |
326 |
gpertea |
154 |
|
327 |
gpertea |
218 |
sprintf(buf, "@%u\n%s\n+%s\n%s\n", |
328 |
|
|
next_id, |
329 |
|
|
read.seq.c_str(), |
330 |
|
|
read.name.c_str(), |
331 |
|
|
qual.c_str()); |
332 |
gpertea |
154 |
|
333 |
gpertea |
218 |
fprintf(fout, "@%u\n%s\n+%s\n%s\n", |
334 |
|
|
next_id, |
335 |
|
|
read.seq.c_str(), |
336 |
|
|
read.name.c_str(), |
337 |
|
|
qual.c_str()); |
338 |
|
|
} |
339 |
|
|
else |
340 |
|
|
{ |
341 |
|
|
assert(0); |
342 |
|
|
} |
343 |
gpertea |
154 |
|
344 |
gpertea |
218 |
if (findex != NULL) |
345 |
|
|
{ |
346 |
gpertea |
219 |
if ((next_id - num_reads_chucked) % INDEX_REC_COUNT == 0) |
347 |
gpertea |
218 |
fprintf(findex, "%d\t%lu\n", next_id, file_offset); |
348 |
|
|
} |
349 |
|
|
|
350 |
|
|
file_offset += strlen(buf); |
351 |
gpertea |
29 |
} |
352 |
|
|
} //while !fr.isEof() |
353 |
gpertea |
207 |
//fr.close(); |
354 |
|
|
//frq.close(); |
355 |
gpertea |
29 |
} //for each input file |
356 |
gpertea |
135 |
fprintf(stderr, "%u out of %u reads have been filtered out\n", |
357 |
|
|
num_reads_chucked, next_id); |
358 |
|
|
if (readmap_loaded) |
359 |
|
|
fprintf(stderr, "\t(%u filtered out due to %s)\n", |
360 |
|
|
multimap_chucked, flt_reads.c_str()); |
361 |
gpertea |
217 |
if (wbam) { delete wbam; } |
362 |
gpertea |
218 |
if (fout && fout!=stdout) fclose(fout); |
363 |
gpertea |
29 |
if (fw!=NULL) { |
364 |
|
|
fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0)); |
365 |
|
|
fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0)); |
366 |
gpertea |
135 |
fprintf(fw, "reads_in =%d\n",next_id); |
367 |
|
|
fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked); |
368 |
gpertea |
29 |
fclose(fw); |
369 |
|
|
} |
370 |
gpertea |
154 |
|
371 |
|
|
if (findex != NULL) |
372 |
|
|
fclose(findex); |
373 |
gpertea |
29 |
} |
374 |
|
|
|
375 |
|
|
void print_usage() |
376 |
|
|
{ |
377 |
gpertea |
135 |
fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n"); |
378 |
|
|
//alternate usage (--ctv_to_num) : doesn't filter out any reads, |
379 |
|
|
//but simply convert read names to numeric ids |
380 |
gpertea |
29 |
} |
381 |
|
|
|
382 |
|
|
|
383 |
|
|
int main(int argc, char *argv[]) |
384 |
|
|
{ |
385 |
|
|
fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); |
386 |
|
|
fprintf(stderr, "---------------------------\n"); |
387 |
|
|
|
388 |
|
|
int parse_ret = parse_options(argc, argv, print_usage); |
389 |
|
|
if (parse_ret) |
390 |
|
|
return parse_ret; |
391 |
|
|
|
392 |
|
|
if(optind >= argc) |
393 |
|
|
{ |
394 |
|
|
print_usage(); |
395 |
|
|
return 1; |
396 |
|
|
} |
397 |
|
|
|
398 |
|
|
string reads_file_list = argv[optind++]; |
399 |
|
|
|
400 |
gpertea |
217 |
vector<string> reads_filenames; |
401 |
|
|
tokenize(reads_file_list, ",",reads_filenames); |
402 |
gpertea |
29 |
vector<string> quals_file_names; |
403 |
|
|
vector<FZPipe> quals_files; |
404 |
|
|
if (quals) |
405 |
|
|
{ |
406 |
|
|
string quals_file_list = argv[optind++]; |
407 |
|
|
tokenize(quals_file_list, ",", quals_file_names); |
408 |
|
|
for (size_t i = 0; i < quals_file_names.size(); ++i) |
409 |
|
|
{ |
410 |
gpertea |
211 |
FZPipe seg_file(quals_file_names[i], true); |
411 |
gpertea |
29 |
if (seg_file.file == NULL) |
412 |
|
|
{ |
413 |
|
|
fprintf(stderr, "Error: cannot open reads file %s for reading\n", |
414 |
|
|
quals_file_names[i].c_str()); |
415 |
|
|
exit(1); |
416 |
|
|
} |
417 |
|
|
quals_files.push_back(seg_file); |
418 |
|
|
} |
419 |
|
|
} |
420 |
gpertea |
135 |
load_readmap(); |
421 |
gpertea |
29 |
// Only print to standard out the good reads |
422 |
|
|
//TODO: a better, more generic read filtering protocol |
423 |
gpertea |
135 |
if (flt_mappings.empty()) |
424 |
gpertea |
217 |
process_reads(reads_filenames, quals_files); |
425 |
gpertea |
135 |
else //special use case: filter previous bowtie results |
426 |
gpertea |
217 |
flt_reads_and_hits(reads_filenames); |
427 |
gpertea |
29 |
|
428 |
|
|
return 0; |
429 |
|
|
} |