1 |
/* |
2 |
* polyA_reads.cpp |
3 |
* TopHat |
4 |
* |
5 |
* Created by Cole Trapnell on 9/2/08. |
6 |
* Copyright 2008 Cole Trapnell. All rights reserved. |
7 |
* Derived from maq "catfilter", by Heng Li at Sanger |
8 |
*/ |
9 |
|
10 |
#ifdef HAVE_CONFIG_H |
11 |
#include <config.h> |
12 |
#endif |
13 |
|
14 |
#include <stdio.h> |
15 |
#include <cassert> |
16 |
#include <vector> |
17 |
#include <cstring> |
18 |
#include <cstdlib> |
19 |
|
20 |
#include "common.h" |
21 |
#include "reads.h" |
22 |
#include "tokenize.h" |
23 |
#include "qual.h" |
24 |
|
25 |
//bool fastq_db = true; |
26 |
|
27 |
using namespace std; |
28 |
|
29 |
void format_qual_string(string& qual_str) |
30 |
{ |
31 |
for (size_t i = 0; i < qual_str.size(); ++i) |
32 |
{ |
33 |
qual_str[i] = charToPhred33(qual_str[i], |
34 |
solexa_quals, |
35 |
phred64_quals); |
36 |
} |
37 |
} |
38 |
|
39 |
|
40 |
void filter_garbage_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files) |
41 |
{ |
42 |
|
43 |
int num_reads_chucked = 0, num_reads = 0; |
44 |
int min_read_len = 2000000; |
45 |
int max_read_len = 0; |
46 |
int next_id = 0; |
47 |
FILE* fw=NULL; |
48 |
if (!aux_outfile.empty()) { |
49 |
fw=fopen(aux_outfile.c_str(), "w"); |
50 |
if (fw==NULL) |
51 |
err_die("Error: cannot create file %s\n",aux_outfile.c_str()); |
52 |
} |
53 |
|
54 |
for (size_t fi = 0; fi < reads_files.size(); ++fi) |
55 |
{ |
56 |
Read read; |
57 |
FLineReader fr(reads_files[fi]); |
58 |
skip_lines(fr); |
59 |
|
60 |
FZPipe fq; |
61 |
if (quals) |
62 |
fq = quals_files[fi]; |
63 |
FLineReader frq(fq); |
64 |
skip_lines(frq); |
65 |
|
66 |
while (!fr.isEof()) { |
67 |
//read.clear(); |
68 |
// Get the next read from the file |
69 |
if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL))) |
70 |
break; |
71 |
if (read.seq.length()<12) { |
72 |
++num_reads_chucked; |
73 |
continue; |
74 |
} |
75 |
if ((int)read.seq.length()<min_read_len) min_read_len=read.seq.length(); |
76 |
if ((int)read.seq.length()>max_read_len) max_read_len=read.seq.length(); |
77 |
format_qual_string(read.qual); |
78 |
std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper); |
79 |
++num_reads; |
80 |
++next_id; |
81 |
char counts[256]; |
82 |
memset(counts, 0, sizeof(counts)); |
83 |
// Count up the bad characters |
84 |
for (unsigned int i = 0; i != read.seq.length(); ++i) |
85 |
{ |
86 |
char c = (char)toupper(read.seq[i]); |
87 |
counts[(size_t)c]++; |
88 |
} |
89 |
|
90 |
double percent_A = (double)(counts[(size_t)'A']) / read.seq.length(); |
91 |
double percent_C = (double)(counts[(size_t)'C']) / read.seq.length(); |
92 |
double percent_G = (double)(counts[(size_t)'G']) / read.seq.length(); |
93 |
double percent_T = (double)(counts[(size_t)'T']) / read.seq.length(); |
94 |
double percent_N = (double)(counts[(size_t)'N']) / read.seq.length(); |
95 |
double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length(); |
96 |
|
97 |
if (reads_format == FASTQ && |
98 |
read.seq.length() != read.qual.length()) |
99 |
{ |
100 |
++num_reads_chucked; |
101 |
continue; |
102 |
} |
103 |
|
104 |
// daehwan - check this later, it's due to bowtie |
105 |
if (color && read.seq[1] == '4') { |
106 |
continue; |
107 |
} |
108 |
// Chuck the read if there are at least 5 'N's or if it's mostly |
109 |
// (>90%) 'N's and 'A's |
110 |
|
111 |
if (percent_A > 0.9 || |
112 |
percent_C > 0.9 || |
113 |
percent_G > 0.9 || |
114 |
percent_T > 0.9 || |
115 |
percent_N >= 0.1 || |
116 |
percent_4 >= 0.1) |
117 |
{ |
118 |
++num_reads_chucked; |
119 |
} |
120 |
else |
121 |
{ |
122 |
/* if (!fastq_db) |
123 |
{ |
124 |
if (reads_format == FASTQ or (reads_format == FASTA && quals)) |
125 |
printf("@%s\n%s\n+\n%s\n", |
126 |
read.name.c_str(), read.seq.c_str(),read.qual.c_str()); |
127 |
else if (reads_format == FASTA) |
128 |
printf(">%s\n%s\n", read.name.c_str(), read.seq.c_str()); |
129 |
} |
130 |
else |
131 |
{ */ |
132 |
if (reads_format == FASTQ or (reads_format == FASTA && quals)) |
133 |
{ |
134 |
printf("@%d\n%s\n+%s\n%s\n", |
135 |
next_id, |
136 |
read.seq.c_str(), |
137 |
read.name.c_str(), |
138 |
read.qual.c_str()); |
139 |
} |
140 |
else if (reads_format == FASTA) |
141 |
{ |
142 |
string qual; |
143 |
if (color) { |
144 |
qual = "!"; |
145 |
qual += string(read.seq.length()-1, 'I').c_str(); |
146 |
} |
147 |
else |
148 |
qual = string(read.seq.length(), 'I').c_str(); |
149 |
|
150 |
printf("@%d\n%s\n+%s\n%s\n", |
151 |
next_id, |
152 |
read.seq.c_str(), |
153 |
read.name.c_str(), |
154 |
qual.c_str()); |
155 |
} |
156 |
//} only used if fastq_db is false (?) |
157 |
} |
158 |
} //while !fr.isEof() |
159 |
fr.close(); |
160 |
frq.close(); |
161 |
} //for each input file |
162 |
|
163 |
fprintf(stderr, "%d out of %d reads have been filtered out\n", |
164 |
num_reads_chucked, num_reads); |
165 |
if (fw!=NULL) { |
166 |
fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0)); |
167 |
fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0)); |
168 |
fprintf(fw, "reads_in =%d\n",num_reads); |
169 |
fprintf(fw, "reads_out=%d\n",num_reads-num_reads_chucked); |
170 |
fclose(fw); |
171 |
} |
172 |
} |
173 |
|
174 |
void print_usage() |
175 |
{ |
176 |
fprintf(stderr, "Usage: prep_reads <reads1.fa/fq,...,readsN.fa/fq>\n"); |
177 |
} |
178 |
|
179 |
|
180 |
int main(int argc, char *argv[]) |
181 |
{ |
182 |
fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); |
183 |
fprintf(stderr, "---------------------------\n"); |
184 |
|
185 |
int parse_ret = parse_options(argc, argv, print_usage); |
186 |
if (parse_ret) |
187 |
return parse_ret; |
188 |
|
189 |
if(optind >= argc) |
190 |
{ |
191 |
print_usage(); |
192 |
return 1; |
193 |
} |
194 |
|
195 |
string reads_file_list = argv[optind++]; |
196 |
|
197 |
vector<string> reads_file_names; |
198 |
vector<FZPipe> reads_files; |
199 |
tokenize(reads_file_list, ",",reads_file_names); |
200 |
for (size_t i = 0; i < reads_file_names.size(); ++i) |
201 |
{ |
202 |
string fname=reads_file_names[i]; |
203 |
string pipecmd=guess_packer(fname, true); |
204 |
if (!pipecmd.empty()) pipecmd.append(" -cd"); |
205 |
FZPipe seg_file(fname,pipecmd); |
206 |
if (seg_file.file == NULL) { |
207 |
fprintf(stderr, "Error: cannot open reads file %s for reading\n", |
208 |
reads_file_names[i].c_str()); |
209 |
exit(1); |
210 |
} |
211 |
reads_files.push_back(seg_file); |
212 |
} |
213 |
|
214 |
vector<string> quals_file_names; |
215 |
vector<FZPipe> quals_files; |
216 |
if (quals) |
217 |
{ |
218 |
string quals_file_list = argv[optind++]; |
219 |
tokenize(quals_file_list, ",", quals_file_names); |
220 |
for (size_t i = 0; i < quals_file_names.size(); ++i) |
221 |
{ |
222 |
string fname(quals_file_names[i]); |
223 |
string pipecmd=guess_packer(fname, true); |
224 |
FZPipe seg_file(fname, pipecmd); |
225 |
if (seg_file.file == NULL) |
226 |
{ |
227 |
fprintf(stderr, "Error: cannot open reads file %s for reading\n", |
228 |
quals_file_names[i].c_str()); |
229 |
exit(1); |
230 |
} |
231 |
quals_files.push_back(seg_file); |
232 |
} |
233 |
} |
234 |
|
235 |
// Only print to standard out the good reads |
236 |
//TODO: a better, more generic read filtering protocol |
237 |
filter_garbage_reads(reads_files, quals_files); |
238 |
|
239 |
return 0; |
240 |
} |