ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
(Generate patch)
# Line 36 | Line 36
36      }
37   }
38  
39 + bool readmap_loaded=false;
40 + vector<bool> readmap; //for filter_multihits
41  
40 void filter_garbage_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files)
41 {      
42  
43 <  int num_reads_chucked = 0, num_reads = 0;
44 <  int min_read_len = 2000000;
43 > void load_readmap() {
44 >  readmap_loaded=false;
45 >  if (flt_reads.empty()) return;
46 >  //FZPipe(filter_multihits, false);
47 >  FILE* rfile=fopen(flt_reads.c_str(),"r");
48 >  if (rfile==NULL)
49 >     err_die("Error: cannot read file %s\n", flt_reads.c_str());
50 >  FLineReader fr(rfile);
51 >  skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this
52 >  Read read;
53 >  while (next_fastx_read(fr, read, reads_format)) {
54 >    uint32_t rnum=(uint32_t)atol(read.name.c_str());
55 >    if (rnum>=(uint32_t) readmap.size())
56 >         readmap.resize(rnum+1, false);
57 >    readmap[rnum] = true;
58 >    }
59 >  readmap_loaded=true;
60 >  fclose(rfile);
61 > }
62 >
63 > bool check_readmap(uint32_t read_id) {
64 > if (read_id>=readmap.size()) return false;
65 >    else return readmap[read_id];
66 > }
67 > void flt_reads_and_hits(vector<FZPipe>& reads_files) {
68 >  if (!readmap_loaded)
69 >      err_die("Error: filtering reads not enabled, aborting.");
70 >  if (aux_outfile.empty())
71 >       err_die("Error: auxiliary output file not provided.");
72 >  // -- filter mappings:
73 >  string fext=getFext(flt_mappings);
74 >  if (fext=="bam") {
75 >    //TODO
76 >    //WRITEME: use samtools C API to write a quick bam filter
77 >    err_die("Error: cannot handle BAM files yet![WRITEME]");
78 >    return;
79 >    }
80 >  bool is_sam=false;
81 >  string s(flt_mappings);
82 >  for(size_t i=0; i!=s.length(); ++i)
83 >          s[i] = std::tolower(s[i]);
84 >  if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
85 >  string unzcmd=getUnpackCmd(flt_mappings);
86 >  FZPipe hitsfile(flt_mappings, unzcmd);
87 >  FLineReader fh(hitsfile);
88 >  FZPipe outfile;
89 >  outfile.openWrite(aux_outfile.c_str(), zpacker);
90 >  if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
91 >  const char* line;
92 >  while ((line=fh.nextLine())) {
93 >    if (is_sam && line[0]=='@') {
94 >        fprintf(outfile.file, "%s\n", line);
95 >        continue;
96 >        }
97 >    char* tab=strchr((char*)line, '\t');
98 >    if (tab==NULL)
99 >        err_die("Error: cannot find tab character in %s mappings line:\n%s",
100 >            flt_mappings.c_str(),line);
101 >    *tab=0;
102 >    uint32_t rid = (uint32_t) atol(line);
103 >    if (rid==0)
104 >      err_die("Error: invalid read ID (%s) parsed from mapping file %s",
105 >            line, flt_mappings.c_str());
106 >    *tab='\t';
107 >    if (check_readmap(rid)) {
108 >       fprintf(outfile.file, "%s\n", line);
109 >       }
110 >    }
111 >  outfile.close();
112 >  hitsfile.close();
113 >  // -- now filter reads
114 >  for (size_t fi = 0; fi < reads_files.size(); ++fi) {
115 >      //only one file expected here, this is not the initial prep_reads
116 >      Read read;
117 >      FLineReader fr(reads_files[fi]);
118 >      //skip_lines(fr);
119 >      while (next_fastx_read(fr, read)) {
120 >        uint32_t rnum=(uint32_t)atol(read.name.c_str());
121 >        if (check_readmap(rnum))
122 >          printf("@%s\n%s\n+%s\n%s\n",
123 >             read.name.c_str(),
124 >             read.seq.c_str(),
125 >             read.alt_name.c_str(),
126 >             read.qual.c_str());
127 >        }
128 >    }
129 > }
130 >
131 >
132 > void process_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files)
133 > {      
134 >   //TODO: add the option to write the garbage reads into separate file(s)
135 >  int num_reads_chucked = 0;
136 >  int multimap_chucked=0;
137 >  int min_read_len = 20000000;
138    int max_read_len = 0;
139 <  int next_id = 0;
139 >  uint32_t next_id = 0;
140    FILE* fw=NULL;
141    if (!aux_outfile.empty()) {
142      fw=fopen(aux_outfile.c_str(), "w");
# Line 56 | Line 149
149        Read read;
150        FLineReader fr(reads_files[fi]);
151        skip_lines(fr);
59      
152        FZPipe fq;
153        if (quals)
154 <            fq = quals_files[fi];
154 >              fq = quals_files[fi];
155        FLineReader frq(fq);
156        skip_lines(frq);
157        
# Line 68 | Line 160
160              // Get the next read from the file
161          if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
162              break;
163 +
164 +        ++next_id;
165 +        //IMPORTANT: to keep paired reads in sync, this must be
166 +        //incremented BEFORE any reads are chucked !
167 +
168          if (read.seq.length()<12) {
169              ++num_reads_chucked;
170              continue;
171              }
172 <        if ((int)read.seq.length()<min_read_len) min_read_len=read.seq.length();
173 <        if ((int)read.seq.length()>max_read_len) max_read_len=read.seq.length();
174 <            format_qual_string(read.qual);
172 >        if ((int)read.seq.length()<min_read_len)
173 >             min_read_len=read.seq.length();
174 >        if ((int)read.seq.length()>max_read_len)
175 >             max_read_len=read.seq.length();
176 >
177 >        // daehwan - check this later, it's due to bowtie
178 >        if (color && read.seq[1] == '4') {
179 >          ++num_reads_chucked;
180 >          continue;
181 >          }
182 >
183 >        if (readmap_loaded && check_readmap(next_id)) {
184 >          ++num_reads_chucked;
185 >          ++multimap_chucked;
186 >          continue;
187 >          }
188 >              format_qual_string(read.qual);
189          std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
79        ++num_reads;
80        ++next_id;
190          char counts[256];
191          memset(counts, 0, sizeof(counts));
192          // Count up the bad characters
# Line 94 | Line 203
203          double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
204          double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
205  
97        if (reads_format == FASTQ &&
98               read.seq.length() != read.qual.length())
99          {
100            ++num_reads_chucked;
101            continue;
102          }
103
104        // daehwan - check this later, it's due to bowtie
105        if (color && read.seq[1] == '4') {
106          continue;
107          }
206          // Chuck the read if there are at least 5 'N's or if it's mostly
207          // (>90%) 'N's and 'A's
208  
# Line 119 | Line 217
217            }
218          else
219            {
122         /*   if (!fastq_db)
123          {
124            if (reads_format == FASTQ  or (reads_format == FASTA && quals))
125              printf("@%s\n%s\n+\n%s\n",
126                 read.name.c_str(), read.seq.c_str(),read.qual.c_str());
127            else if (reads_format == FASTA)
128              printf(">%s\n%s\n", read.name.c_str(), read.seq.c_str());
129          }
130            else
131          { */
220              if (reads_format == FASTQ or (reads_format == FASTA && quals))
221                {
222 <                printf("@%d\n%s\n+%s\n%s\n",
223 <                   next_id,
224 <                   read.seq.c_str(),
225 <                   read.name.c_str(),
226 <                   read.qual.c_str());
222 >
223 >              printf("@%u\n%s\n+%s\n%s\n",
224 >               next_id,
225 >               read.seq.c_str(),
226 >               read.name.c_str(),
227 >               read.qual.c_str());
228                }
229              else if (reads_format == FASTA)
230                {
231                  string qual;
232 <                if (color) {
233 <                  qual = "!";
145 <                  qual += string(read.seq.length()-1, 'I').c_str();
146 <                  }
232 >                if (color)
233 >                  qual = string(read.seq.length()-1, 'I').c_str();
234                  else
235 <              qual = string(read.seq.length(), 'I').c_str();
235 >                  qual = string(read.seq.length(), 'I').c_str();
236  
237 <                printf("@%d\n%s\n+%s\n%s\n",
237 >                printf("@%u\n%s\n+%s\n%s\n",
238                     next_id,
239                     read.seq.c_str(),
240                     read.name.c_str(),
241                     qual.c_str());
242                }
156          //} only used if fastq_db is false (?)
243            }
244        } //while !fr.isEof()
245      fr.close();
246      frq.close();
247      } //for each input file
248 <  
249 <  fprintf(stderr, "%d out of %d reads have been filtered out\n",
250 <          num_reads_chucked, num_reads);
248 >  fprintf(stderr, "%u out of %u reads have been filtered out\n",
249 >          num_reads_chucked, next_id);
250 >  if (readmap_loaded)
251 >    fprintf(stderr, "\t(%u filtered out due to %s)\n",
252 >        multimap_chucked, flt_reads.c_str());
253    if (fw!=NULL) {
254      fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
255      fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
256 <    fprintf(fw, "reads_in =%d\n",num_reads);
257 <    fprintf(fw, "reads_out=%d\n",num_reads-num_reads_chucked);
256 >    fprintf(fw, "reads_in =%d\n",next_id);
257 >    fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
258      fclose(fw);
259      }
260   }
261  
262   void print_usage()
263   {
264 <  fprintf(stderr, "Usage:   prep_reads <reads1.fa/fq,...,readsN.fa/fq>\n");
264 >  fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n");
265 >  //alternate usage (--ctv_to_num) : doesn't filter out any reads,
266 >  //but simply convert read names to numeric ids
267   }
268  
269  
# Line 231 | Line 321
321            quals_files.push_back(seg_file);
322          }
323      }
324 <  
324 >  load_readmap();
325    // Only print to standard out the good reads
326    //TODO: a better, more generic read filtering protocol
327 <  filter_garbage_reads(reads_files, quals_files);
327 >  if (flt_mappings.empty())
328 >     process_reads(reads_files, quals_files);
329 >  else //special use case: filter previous bowtie results
330 >     flt_reads_and_hits(reads_files);
331      
332    return 0;
333   }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines