ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
Revision: 68
Committed: Mon Sep 19 20:09:26 2011 UTC (13 years ago) by gpertea
File size: 6934 byte(s)
Log Message:
first attempt at accepting different Color qvlen in prep_reads

Line File contents
1 /*
2 * polyA_reads.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 9/2/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 * Derived from maq "catfilter", by Heng Li at Sanger
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <stdio.h>
15 #include <cassert>
16 #include <vector>
17 #include <cstring>
18 #include <cstdlib>
19
20 #include "common.h"
21 #include "reads.h"
22 #include "tokenize.h"
23 #include "qual.h"
24
25 //bool fastq_db = true;
26
27 using namespace std;
28
29 void format_qual_string(string& qual_str)
30 {
31 for (size_t i = 0; i < qual_str.size(); ++i)
32 {
33 qual_str[i] = charToPhred33(qual_str[i],
34 solexa_quals,
35 phred64_quals);
36 }
37 }
38
39
40 void filter_garbage_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files)
41 {
42
43 int num_reads_chucked = 0, num_reads = 0;
44 int min_read_len = 2000000;
45 int max_read_len = 0;
46 int next_id = 0;
47 FILE* fw=NULL;
48 if (!aux_outfile.empty()) {
49 fw=fopen(aux_outfile.c_str(), "w");
50 if (fw==NULL)
51 err_die("Error: cannot create file %s\n",aux_outfile.c_str());
52 }
53
54 for (size_t fi = 0; fi < reads_files.size(); ++fi)
55 {
56 Read read;
57 FLineReader fr(reads_files[fi]);
58 skip_lines(fr);
59
60 FZPipe fq;
61 if (quals)
62 fq = quals_files[fi];
63 FLineReader frq(fq);
64 skip_lines(frq);
65
66 while (!fr.isEof()) {
67 //read.clear();
68 // Get the next read from the file
69 if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
70 break;
71 if (read.seq.length()<12) {
72 ++num_reads_chucked;
73 continue;
74 }
75 if ((int)read.seq.length()<min_read_len) min_read_len=read.seq.length();
76 if ((int)read.seq.length()>max_read_len) max_read_len=read.seq.length();
77 format_qual_string(read.qual);
78 std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
79 ++num_reads;
80 ++next_id;
81 char counts[256];
82 memset(counts, 0, sizeof(counts));
83 // Count up the bad characters
84 for (unsigned int i = 0; i != read.seq.length(); ++i)
85 {
86 char c = (char)toupper(read.seq[i]);
87 counts[(size_t)c]++;
88 }
89
90 double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
91 double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
92 double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
93 double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
94 double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
95 double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
96
97 if (reads_format == FASTQ &&
98 read.seq.length() != read.qual.length())
99 {
100 ++num_reads_chucked;
101 continue;
102 }
103
104 // daehwan - check this later, it's due to bowtie
105 if (color && read.seq[1] == '4') {
106 continue;
107 }
108 // Chuck the read if there are at least 5 'N's or if it's mostly
109 // (>90%) 'N's and 'A's
110
111 if (percent_A > 0.9 ||
112 percent_C > 0.9 ||
113 percent_G > 0.9 ||
114 percent_T > 0.9 ||
115 percent_N >= 0.1 ||
116 percent_4 >= 0.1)
117 {
118 ++num_reads_chucked;
119 }
120 else
121 {
122 /* if (!fastq_db)
123 {
124 if (reads_format == FASTQ or (reads_format == FASTA && quals))
125 printf("@%s\n%s\n+\n%s\n",
126 read.name.c_str(), read.seq.c_str(),read.qual.c_str());
127 else if (reads_format == FASTA)
128 printf(">%s\n%s\n", read.name.c_str(), read.seq.c_str());
129 }
130 else
131 { */
132 if (reads_format == FASTQ or (reads_format == FASTA && quals))
133 {
134 printf("@%d\n%s\n+%s\n%s\n",
135 next_id,
136 read.seq.c_str(),
137 read.name.c_str(),
138 read.qual.c_str());
139 }
140 else if (reads_format == FASTA)
141 {
142 string qual;
143 if (color) {
144 qual = "!";
145 qual += string(read.seq.length()-1, 'I').c_str();
146 }
147 else
148 qual = string(read.seq.length(), 'I').c_str();
149
150 printf("@%d\n%s\n+%s\n%s\n",
151 next_id,
152 read.seq.c_str(),
153 read.name.c_str(),
154 qual.c_str());
155 }
156 //} only used if fastq_db is false (?)
157 }
158 } //while !fr.isEof()
159 fr.close();
160 frq.close();
161 } //for each input file
162
163 fprintf(stderr, "%d out of %d reads have been filtered out\n",
164 num_reads_chucked, num_reads);
165 if (fw!=NULL) {
166 fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
167 fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
168 fprintf(fw, "reads_in =%d\n",num_reads);
169 fprintf(fw, "reads_out=%d\n",num_reads-num_reads_chucked);
170 fclose(fw);
171 }
172 }
173
174 void print_usage()
175 {
176 fprintf(stderr, "Usage: prep_reads <reads1.fa/fq,...,readsN.fa/fq>\n");
177 }
178
179
180 int main(int argc, char *argv[])
181 {
182 fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
183 fprintf(stderr, "---------------------------\n");
184
185 int parse_ret = parse_options(argc, argv, print_usage);
186 if (parse_ret)
187 return parse_ret;
188
189 if(optind >= argc)
190 {
191 print_usage();
192 return 1;
193 }
194
195 string reads_file_list = argv[optind++];
196
197 vector<string> reads_file_names;
198 vector<FZPipe> reads_files;
199 tokenize(reads_file_list, ",",reads_file_names);
200 for (size_t i = 0; i < reads_file_names.size(); ++i)
201 {
202 string fname=reads_file_names[i];
203 string pipecmd=guess_packer(fname, true);
204 if (!pipecmd.empty()) pipecmd.append(" -cd");
205 FZPipe seg_file(fname,pipecmd);
206 if (seg_file.file == NULL) {
207 fprintf(stderr, "Error: cannot open reads file %s for reading\n",
208 reads_file_names[i].c_str());
209 exit(1);
210 }
211 reads_files.push_back(seg_file);
212 }
213
214 vector<string> quals_file_names;
215 vector<FZPipe> quals_files;
216 if (quals)
217 {
218 string quals_file_list = argv[optind++];
219 tokenize(quals_file_list, ",", quals_file_names);
220 for (size_t i = 0; i < quals_file_names.size(); ++i)
221 {
222 string fname(quals_file_names[i]);
223 string pipecmd=guess_packer(fname, true);
224 FZPipe seg_file(fname, pipecmd);
225 if (seg_file.file == NULL)
226 {
227 fprintf(stderr, "Error: cannot open reads file %s for reading\n",
228 quals_file_names[i].c_str());
229 exit(1);
230 }
231 quals_files.push_back(seg_file);
232 }
233 }
234
235 // Only print to standard out the good reads
236 //TODO: a better, more generic read filtering protocol
237 filter_garbage_reads(reads_files, quals_files);
238
239 return 0;
240 }