ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
(Generate patch)
# Line 39 | Line 39
39   bool readmap_loaded=false;
40   vector<bool> readmap; //for filter_multihits
41  
42
42   void load_readmap() {
43    readmap_loaded=false;
44    if (flt_reads.empty()) return;
45    //FZPipe(filter_multihits, false);
46 +  ReadStream rdstream(flt_reads, NULL, true);
47 +  /*
48    FILE* rfile=fopen(flt_reads.c_str(),"r");
49    if (rfile==NULL)
50       err_die("Error: cannot read file %s\n", flt_reads.c_str());
51    FLineReader fr(rfile);
52    skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this
53 +  */
54    Read read;
55 <  while (next_fastx_read(fr, read, reads_format)) {
55 >  //while (next_fastx_read(fr, read, reads_format)) {
56 >  while (rdstream.get_direct(read, reads_format)) {
57      uint32_t rnum=(uint32_t)atol(read.name.c_str());
58      if (rnum>=(uint32_t) readmap.size())
59           readmap.resize(rnum+1, false);
60      readmap[rnum] = true;
61      }
62    readmap_loaded=true;
63 <  fclose(rfile);
63 >  //fclose(rfile);
64   }
65  
66   bool check_readmap(uint32_t read_id) {
67   if (read_id>=readmap.size()) return false;
68      else return readmap[read_id];
69   }
70 < void flt_reads_and_hits(vector<FZPipe>& reads_files) {
70 >
71 > void flt_reads_and_hits(vector<string>& reads_files) {
72    if (!readmap_loaded)
73        err_die("Error: filtering reads not enabled, aborting.");
74    if (aux_outfile.empty())
75         err_die("Error: auxiliary output file not provided.");
76 +  //if (index_outfile.empty())
77 +  //  err_die("Error: index output file not provided.");
78 +  
79    // -- filter mappings:
80    string fext=getFext(flt_mappings);
81 +  fprintf(stderr, "[[[ DEBUG: ]]] fext='%s'\n",fext.c_str());
82    if (fext=="bam") {
83 <    //TODO
84 <    //WRITEME: use samtools C API to write a quick bam filter
85 <    err_die("Error: cannot handle BAM files yet![WRITEME]");
86 <    return;
87 <    }
88 <  bool is_sam=false;
89 <  string s(flt_mappings);
90 <  for(size_t i=0; i!=s.length(); ++i)
91 <          s[i] = std::tolower(s[i]);
92 <  if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
93 <  string unzcmd=getUnpackCmd(flt_mappings);
94 <  FZPipe hitsfile(flt_mappings, unzcmd);
95 <  FLineReader fh(hitsfile);
96 <  FZPipe outfile;
97 <  outfile.openWrite(aux_outfile.c_str(), zpacker);
98 <  if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
99 <  const char* line;
92 <  while ((line=fh.nextLine())) {
93 <    if (is_sam && line[0]=='@') {
94 <        fprintf(outfile.file, "%s\n", line);
95 <        continue;
96 <        }
97 <    char* tab=strchr((char*)line, '\t');
98 <    if (tab==NULL)
99 <        err_die("Error: cannot find tab character in %s mappings line:\n%s",
100 <            flt_mappings.c_str(),line);
101 <    *tab=0;
102 <    uint32_t rid = (uint32_t) atol(line);
103 <    if (rid==0)
104 <      err_die("Error: invalid read ID (%s) parsed from mapping file %s",
105 <            line, flt_mappings.c_str());
106 <    *tab='\t';
107 <    if (check_readmap(rid)) {
108 <       fprintf(outfile.file, "%s\n", line);
109 <       }
83 >        samfile_t* fbam=samopen(flt_mappings.c_str(), "rb", 0);
84 >    if (fbam==NULL)
85 >        err_die("Error opening BAM file %s!\n", flt_mappings.c_str());
86 >    bam1_t *b = bam_init1();
87 >    string aux_index(aux_outfile);
88 >    aux_index+=".index";
89 >    GBamWriter wbam(aux_outfile.c_str(), fbam->header, aux_index);
90 >    while (samread(fbam, b) > 0) {
91 >      char* rname  = bam1_qname(b);
92 >      uint32_t rid=(uint32_t)atol(rname);
93 >      if (check_readmap(rid)) {
94 >         //write this mapping into the output file
95 >         wbam.write(b, rid);
96 >         }
97 >      }
98 >    bam_destroy1(b);
99 >    samclose(fbam);
100      }
101 <  outfile.close();
102 <  hitsfile.close();
101 >  else {
102 >        bool is_sam=false;
103 >        string s(flt_mappings);
104 >        for(size_t i=0; i!=s.length(); ++i)
105 >                        s[i] = std::tolower(s[i]);
106 >        if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
107 >        string unzcmd=getUnpackCmd(flt_mappings);
108 >        FZPipe hitsfile(flt_mappings, unzcmd);
109 >        FLineReader fh(hitsfile);
110 >        FZPipe outfile;
111 >        outfile.openWrite(aux_outfile.c_str(), zpacker);
112 >        if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
113 >        const char* line;
114 >        while ((line=fh.nextLine())) {
115 >          if (is_sam && line[0]=='@') {
116 >                  //copy the header
117 >                  fprintf(outfile.file, "%s\n", line);
118 >                  continue;
119 >                  }
120 >          char* tab=strchr((char*)line, '\t');
121 >          if (tab==NULL)
122 >                  err_die("Error: cannot find tab character in %s mappings line:\n%s",
123 >                          flt_mappings.c_str(),line);
124 >          *tab=0;
125 >          uint32_t rid = (uint32_t) atol(line);
126 >          if (rid==0)
127 >                err_die("Error: invalid read ID (%s) parsed from mapping file %s",
128 >                          line, flt_mappings.c_str());
129 >          *tab='\t';
130 >          if (check_readmap(rid)) {
131 >                 fprintf(outfile.file, "%s\n", line);
132 >                 }
133 >          }
134 >        outfile.close();
135 >        hitsfile.close();
136 >        }
137    // -- now filter reads
138 +  //FILE* findex = NULL;
139 +  GBamWriter* wbam=NULL;
140 +  FILE* fout=NULL;
141 +  if (std_outfile.empty()) {
142 +    fout=stdout;
143 +  }
144 +  else { //output file name explicitely given
145 +        if (getFext(std_outfile)=="bam") {
146 +          if (sam_header.empty()) err_die("Error: sam header file not provided.\n");
147 +          wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile);
148 +          //wbam = new GBamWriter(std_outfile, index_outfile);
149 +        }
150 +        else {
151 +          fout = fopen(std_outfile.c_str(), "w");
152 +          if (fout==NULL)
153 +               err_die("Error: cannot create file %s\n", std_outfile.c_str());
154 +        }
155 +  }
156 +  /*
157 +  if (wbam==NULL && !index_outfile.empty()) {
158 +    findex = fopen(index_outfile.c_str(), "w");
159 +    if (findex == NULL)
160 +       err_die("Error: cannot create file %s\n", index_outfile.c_str());
161 +    }
162 +    */
163    for (size_t fi = 0; fi < reads_files.size(); ++fi) {
164        //only one file expected here, this is not the initial prep_reads
165        Read read;
166 <      FLineReader fr(reads_files[fi]);
166 >      ReadStream readstream(reads_files[fi], NULL, true);
167        //skip_lines(fr);
168 <      while (next_fastx_read(fr, read)) {
168 >      while (readstream.get_direct(read)) {
169          uint32_t rnum=(uint32_t)atol(read.name.c_str());
170 <        if (check_readmap(rnum))
171 <          printf("@%s\n%s\n+%s\n%s\n",
172 <             read.name.c_str(),
173 <             read.seq.c_str(),
174 <             read.alt_name.c_str(),
175 <             read.qual.c_str());
170 >        if (check_readmap(rnum)) {
171 >          if (wbam) {
172 >                GBamRecord bamrec(read.name.c_str(), -1, 0, false,
173 >                        read.seq.c_str(), NULL, read.qual.c_str());
174 >                wbam->write(bamrec.get_b(), rnum);
175 >          }
176 >          else {
177 >                fprintf(fout, "@%s\n%s\n+%s\n%s\n",
178 >                read.name.c_str(),
179 >                read.seq.c_str(),
180 >                read.alt_name.c_str(),
181 >                read.qual.c_str());
182 >          }
183          }
184 <    }
184 >      }
185 >  }
186 > if (wbam) delete wbam;
187 > if (fout && fout!=stdout) fclose(fout);
188   }
189  
190  
191 < void process_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files)
191 > void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0) {
192 >  if (wbam==NULL) return;
193 >  string rnum;
194 >  str_appendUInt(rnum,rid);
195 >  string rname(read.name);
196 >  size_t pl=rname.length()-1;
197 >  GBamRecord bamrec(rnum.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str());
198 >  int matenum=0;
199 >  if (rname[pl-1]=='/') {
200 >                if (rname[pl]=='1') {
201 >                    bamrec.set_flag(BAM_FREAD1);
202 >                        matenum=1;
203 >                }
204 >                else if (rname[pl]=='2') {
205 >                    bamrec.set_flag(BAM_FREAD2);
206 >                        matenum=2;
207 >                }
208 >                if (matenum) rname.resize(pl-1);
209 >        }
210 >
211 >  bamrec.add_aux("ZN", 'Z',read.name.length(),(uint8_t*)rname.c_str());
212 >  if (trashcode) {
213 >         bamrec.set_flag(BAM_FQCFAIL);
214 >     bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&trashcode);
215 >     }
216 >  wbam->write(bamrec.get_b(), rid);
217 > }
218 >
219 > void process_reads(vector<string>& reads_files, vector<FZPipe>& quals_files)
220   {      
221     //TODO: add the option to write the garbage reads into separate file(s)
222    int num_reads_chucked = 0;
223 <  int multimap_chucked=0;
223 >  int multimap_chucked = 0;
224    int min_read_len = 20000000;
225    int max_read_len = 0;
226    uint32_t next_id = 0;
227 +  uint64_t file_offset = 0;
228    FILE* fw=NULL;
229    if (!aux_outfile.empty()) {
230      fw=fopen(aux_outfile.c_str(), "w");
# Line 144 | Line 232
232         err_die("Error: cannot create file %s\n",aux_outfile.c_str());
233      }
234  
235 +  FILE* findex = NULL;
236 +  GBamWriter* wbam=NULL;
237 +  FILE* fout=NULL;
238 +  if (std_outfile.empty()) {
239 +    fout=stdout;
240 +  }
241 +  else { //output file name explicitely given
242 +        if (getFext(std_outfile)=="bam") {
243 +          if (sam_header.empty()) err_die("Error: sam header file not provided.\n");
244 +          wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile);
245 +          //wbam = new GBamWriter(std_outfile, index_outfile);
246 +        }
247 +        else {
248 +          fout = fopen(std_outfile.c_str(), "w");
249 +          if (fout==NULL)
250 +               err_die("Error: cannot create file %s\n", std_outfile.c_str());
251 +        }
252 +  }
253 +  if (wbam==NULL && !index_outfile.empty()) {
254 +    findex = fopen(index_outfile.c_str(), "w");
255 +    if (findex == NULL)
256 +       err_die("Error: cannot create file %s\n", index_outfile.c_str());
257 +    }
258 +
259    for (size_t fi = 0; fi < reads_files.size(); ++fi)
260      {
261        Read read;
262 <      FLineReader fr(reads_files[fi]);
151 <      skip_lines(fr);
152 <      FZPipe fq;
262 >      FZPipe* fq=NULL;
263        if (quals)
264 <              fq = quals_files[fi];
265 <      FLineReader frq(fq);
156 <      skip_lines(frq);
264 >         fq = & quals_files[fi];
265 >      ReadStream reads(reads_files[fi], fq, true);
266        
267 <      while (!fr.isEof()) {
268 <            //read.clear();
267 >      while (reads.get_direct(read, reads_format)) {
268 >            // read.clear();
269              // Get the next read from the file
270 <        if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
271 <            break;
270 >        //if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
271 >        //    break;
272  
273          ++next_id;
274          //IMPORTANT: to keep paired reads in sync, this must be
275          //incremented BEFORE any reads are chucked !
167
276          if (read.seq.length()<12) {
277              ++num_reads_chucked;
278 +            writePrepBam(wbam, read, next_id, 'S');
279              continue;
280              }
281          if ((int)read.seq.length()<min_read_len)
# Line 177 | Line 286
286          // daehwan - check this later, it's due to bowtie
287          if (color && read.seq[1] == '4') {
288            ++num_reads_chucked;
289 +          writePrepBam(wbam, read, next_id, 'c');
290            continue;
291            }
292  
293          if (readmap_loaded && check_readmap(next_id)) {
294            ++num_reads_chucked;
295            ++multimap_chucked;
296 +          writePrepBam(wbam, read, next_id, 'M');
297            continue;
298            }
299 <              format_qual_string(read.qual);
299 >        format_qual_string(read.qual);
300          std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
301          char counts[256];
302          memset(counts, 0, sizeof(counts));
# Line 205 | Line 316
316  
317          // Chuck the read if there are at least 5 'N's or if it's mostly
318          // (>90%) 'N's and 'A's
319 <
319 >        char trash_code=0;
320          if (percent_A > 0.9 ||
321              percent_C > 0.9 ||
322              percent_G > 0.9 ||
323 <            percent_T > 0.9 ||
324 <            percent_N >= 0.1 ||
325 <            percent_4 >= 0.1)
326 <          {
327 <            ++num_reads_chucked;
328 <          }
329 <        else
330 <          {
331 <            if (reads_format == FASTQ or (reads_format == FASTA && quals))
332 <              {
323 >            percent_T > 0.9)
324 >              trash_code='L';
325 >        else if (percent_N >= 0.1 || percent_4 >=0.1)
326 >              trash_code='N';
327 >        if (trash_code) {
328 >          ++num_reads_chucked;
329 >          writePrepBam(wbam, read, next_id, trash_code);
330 >          continue;
331 >        }
332 >
333 >        if (wbam) {
334 >          writePrepBam(wbam, read, next_id);
335 >        }
336 >        else {
337 >                  // daehwan - we should not use buf in printf function
338 >                  // because it may contain some control characters such as "\" from quality values.
339 >                  // Here, buf is only used for calculating the file offset
340 >                  char buf[2048] = {0};
341 >                  if (reads_format == FASTQ or (reads_format == FASTA && quals))
342 >                                {
343 >                  sprintf(buf, "@%u\n%s\n+%s\n%s\n",
344 >                          next_id,
345 >                          read.seq.c_str(),
346 >                          read.name.c_str(),
347 >                          read.qual.c_str());
348 >
349 >                  fprintf(fout, "@%u\n%s\n+%s\n%s\n",
350 >                                 next_id,
351 >                                 read.seq.c_str(),
352 >                                 read.name.c_str(),
353 >                                 read.qual.c_str());
354 >                                }
355 >                          else if (reads_format == FASTA)
356 >                                {
357 >                                  string qual;
358 >                                  if (color)
359 >                                        qual = string(read.seq.length()-1, 'I').c_str();
360 >                                  else
361 >                                        qual = string(read.seq.length(), 'I').c_str();
362 >
363 >                                  sprintf(buf, "@%u\n%s\n+%s\n%s\n",
364 >                          next_id,
365 >                          read.seq.c_str(),
366 >                          read.name.c_str(),
367 >                          qual.c_str());
368 >
369 >                  fprintf(fout, "@%u\n%s\n+%s\n%s\n",
370 >                          next_id,
371 >                          read.seq.c_str(),
372 >                          read.name.c_str(),
373 >                          qual.c_str());
374 >                                }
375 >                  else
376 >                        {
377 >                  assert(0);
378 >                        }
379 >
380 >                  if (findex != NULL)
381 >                        {
382 >                  if ((next_id - num_reads_chucked) % INDEX_REC_COUNT == 0)
383 >                        fprintf(findex, "%d\t%lu\n", next_id, file_offset);
384 >                        }
385  
386 <              printf("@%u\n%s\n+%s\n%s\n",
224 <               next_id,
225 <               read.seq.c_str(),
226 <               read.name.c_str(),
227 <               read.qual.c_str());
228 <              }
229 <            else if (reads_format == FASTA)
230 <              {
231 <                string qual;
232 <                if (color)
233 <                  qual = string(read.seq.length()-1, 'I').c_str();
234 <                else
235 <                  qual = string(read.seq.length(), 'I').c_str();
236 <
237 <                printf("@%u\n%s\n+%s\n%s\n",
238 <                   next_id,
239 <                   read.seq.c_str(),
240 <                   read.name.c_str(),
241 <                   qual.c_str());
242 <              }
386 >                  file_offset += strlen(buf);
387            }
388        } //while !fr.isEof()
389 <    fr.close();
390 <    frq.close();
389 >    //fr.close();
390 >    //frq.close();
391      } //for each input file
392    fprintf(stderr, "%u out of %u reads have been filtered out\n",
393            num_reads_chucked, next_id);
394    if (readmap_loaded)
395      fprintf(stderr, "\t(%u filtered out due to %s)\n",
396          multimap_chucked, flt_reads.c_str());
397 +  if (wbam) { delete wbam; }
398 +  if (fout && fout!=stdout) fclose(fout);
399    if (fw!=NULL) {
400      fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
401      fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
# Line 257 | Line 403
403      fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
404      fclose(fw);
405      }
406 +
407 +  if (findex != NULL)
408 +    fclose(findex);
409   }
410  
411   void print_usage()
# Line 284 | Line 433
433    
434    string reads_file_list = argv[optind++];
435    
436 <  vector<string> reads_file_names;
437 <  vector<FZPipe> reads_files;
289 <  tokenize(reads_file_list, ",",reads_file_names);
290 <  for (size_t i = 0; i < reads_file_names.size(); ++i)
291 <    {
292 <      string fname=reads_file_names[i];
293 <      string pipecmd=guess_packer(fname, true);
294 <      if (!pipecmd.empty()) pipecmd.append(" -cd");
295 <      FZPipe seg_file(fname,pipecmd);
296 <      if (seg_file.file == NULL) {
297 <          fprintf(stderr, "Error: cannot open reads file %s for reading\n",
298 <              reads_file_names[i].c_str());
299 <              exit(1);
300 <          }
301 <      reads_files.push_back(seg_file);
302 <    }
303 <  
436 >  vector<string> reads_filenames;
437 >  tokenize(reads_file_list, ",",reads_filenames);
438    vector<string> quals_file_names;
439    vector<FZPipe> quals_files;
440    if (quals)
# Line 309 | Line 443
443        tokenize(quals_file_list, ",", quals_file_names);
444        for (size_t i = 0; i < quals_file_names.size(); ++i)
445          {
446 <      string fname(quals_file_names[i]);
313 <      string pipecmd=guess_packer(fname, true);
314 <      FZPipe seg_file(fname, pipecmd);
446 >      FZPipe seg_file(quals_file_names[i], true);
447            if (seg_file.file == NULL)
448              {
449                fprintf(stderr, "Error: cannot open reads file %s for reading\n",
# Line 325 | Line 457
457    // Only print to standard out the good reads
458    //TODO: a better, more generic read filtering protocol
459    if (flt_mappings.empty())
460 <     process_reads(reads_files, quals_files);
460 >     process_reads(reads_filenames, quals_files);
461    else //special use case: filter previous bowtie results
462 <     flt_reads_and_hits(reads_files);
462 >     flt_reads_and_hits(reads_filenames);
463      
464    return 0;
465   }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines