ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
(Generate patch)
# Line 39 | Line 39
39   bool readmap_loaded=false;
40   vector<bool> readmap; //for filter_multihits
41  
42
42   void load_readmap() {
43    readmap_loaded=false;
44    if (flt_reads.empty()) return;
# Line 69 | Line 68
68        err_die("Error: filtering reads not enabled, aborting.");
69    if (aux_outfile.empty())
70         err_die("Error: auxiliary output file not provided.");
71 +  if (index_outfile.empty())
72 +    err_die("Error: index output file not provided.");
73 +  
74    // -- filter mappings:
75    string fext=getFext(flt_mappings);
76    if (fext=="bam") {
# Line 133 | Line 135
135   {      
136     //TODO: add the option to write the garbage reads into separate file(s)
137    int num_reads_chucked = 0;
138 <  int multimap_chucked=0;
138 >  int multimap_chucked = 0;
139    int min_read_len = 20000000;
140    int max_read_len = 0;
141    uint32_t next_id = 0;
142 +  uint64_t file_offset = 0;
143    FILE* fw=NULL;
144    if (!aux_outfile.empty()) {
145      fw=fopen(aux_outfile.c_str(), "w");
# Line 144 | Line 147
147         err_die("Error: cannot create file %s\n",aux_outfile.c_str());
148      }
149  
150 +  FILE* findex = NULL;
151 +  if (!index_outfile.empty()) {
152 +    findex = fopen(index_outfile.c_str(), "w");
153 +    if (findex == NULL)
154 +       err_die("Error: cannot create file %s\n", index_outfile.c_str());
155 +    }
156 +
157    for (size_t fi = 0; fi < reads_files.size(); ++fi)
158      {
159        Read read;
160 <      FLineReader fr(reads_files[fi]);
161 <      skip_lines(fr);
162 <      FZPipe fq;
160 >      //FLineReader fr(reads_files[fi]);
161 >      //skip_lines(fr);
162 >      FZPipe* fq=NULL;
163        if (quals)
164 <              fq = quals_files[fi];
165 <      FLineReader frq(fq);
166 <      skip_lines(frq);
164 >         fq = & quals_files[fi];
165 >      ReadStream reads;
166 >      reads.init(reads_files[fi], fq);
167        
168 <      while (!fr.isEof()) {
169 <            //read.clear();
168 >      while (reads.get_direct(read, reads_format)) {
169 >            // read.clear();
170              // Get the next read from the file
171 <        if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
172 <            break;
171 >        //if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
172 >        //    break;
173  
174          ++next_id;
175          //IMPORTANT: to keep paired reads in sync, this must be
# Line 185 | Line 195
195            ++multimap_chucked;
196            continue;
197            }
198 <              format_qual_string(read.qual);
198 >        format_qual_string(read.qual);
199          std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
200          char counts[256];
201          memset(counts, 0, sizeof(counts));
# Line 217 | Line 227
227            }
228          else
229            {
230 +            // daehwan - we should not use buf in printf function
231 +            // because it may contain some control characters such as "\" from quality values.
232 +            // Here, buf is only used for calculating the file offset
233 +            char buf[2048] = {0};
234              if (reads_format == FASTQ or (reads_format == FASTA && quals))
235                {
236 <
237 <              printf("@%u\n%s\n+%s\n%s\n",
238 <               next_id,
239 <               read.seq.c_str(),
240 <               read.name.c_str(),
241 <               read.qual.c_str());
236 >                sprintf(buf, "@%u\n%s\n+%s\n%s\n",
237 >                        next_id,
238 >                        read.seq.c_str(),
239 >                        read.name.c_str(),
240 >                        read.qual.c_str());
241 >
242 >                printf("@%u\n%s\n+%s\n%s\n",
243 >                       next_id,
244 >                       read.seq.c_str(),
245 >                       read.name.c_str(),
246 >                       read.qual.c_str());
247                }
248              else if (reads_format == FASTA)
249                {
# Line 234 | Line 253
253                  else
254                    qual = string(read.seq.length(), 'I').c_str();
255  
256 <                printf("@%u\n%s\n+%s\n%s\n",
257 <                   next_id,
258 <                   read.seq.c_str(),
259 <                   read.name.c_str(),
260 <                   qual.c_str());
256 >                sprintf(buf, "@%u\n%s\n+%s\n%s\n",
257 >                        next_id,
258 >                        read.seq.c_str(),
259 >                        read.name.c_str(),
260 >                        qual.c_str());
261 >
262 >                printf("@%u\n%s\n+%s\n%s\n",
263 >                        next_id,
264 >                        read.seq.c_str(),
265 >                        read.name.c_str(),
266 >                        qual.c_str());
267                }
268 +            else
269 +              {
270 +                assert(0);
271 +              }
272 +
273 +            if (findex != NULL)
274 +              {
275 +                if ((next_id - num_reads_chucked) % 1000 == 0)
276 +                  fprintf(findex, "%d\t%lu\n", next_id, file_offset);
277 +              }
278 +
279 +            file_offset += strlen(buf);
280            }
281        } //while !fr.isEof()
282 <    fr.close();
283 <    frq.close();
282 >    //fr.close();
283 >    //frq.close();
284      } //for each input file
285    fprintf(stderr, "%u out of %u reads have been filtered out\n",
286            num_reads_chucked, next_id);
# Line 257 | Line 294
294      fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
295      fclose(fw);
296      }
297 +
298 +  if (findex != NULL)
299 +    fclose(findex);
300   }
301  
302   void print_usage()
# Line 290 | Line 330
330    for (size_t i = 0; i < reads_file_names.size(); ++i)
331      {
332        string fname=reads_file_names[i];
333 <      string pipecmd=guess_packer(fname, true);
294 <      if (!pipecmd.empty()) pipecmd.append(" -cd");
295 <      FZPipe seg_file(fname,pipecmd);
333 >      FZPipe seg_file(fname, true); //guess unpacker
334        if (seg_file.file == NULL) {
335            fprintf(stderr, "Error: cannot open reads file %s for reading\n",
336                reads_file_names[i].c_str());
# Line 309 | Line 347
347        tokenize(quals_file_list, ",", quals_file_names);
348        for (size_t i = 0; i < quals_file_names.size(); ++i)
349          {
350 <      string fname(quals_file_names[i]);
313 <      string pipecmd=guess_packer(fname, true);
314 <      FZPipe seg_file(fname, pipecmd);
350 >      FZPipe seg_file(quals_file_names[i], true);
351            if (seg_file.file == NULL)
352              {
353                fprintf(stderr, "Error: cannot open reads file %s for reading\n",

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines