gclib/tophat_cpp/prep_reads.cpp

/*
 *  polyA_reads.cpp
 *  TopHat
 *
 *  Created by Cole Trapnell on 9/2/08.
 *  Copyright 2008 Cole Trapnell. All rights reserved.
 *      Derived from maq "catfilter", by Heng Li at Sanger
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <cassert>
#include <vector>
#include <cstring>
#include <cstdlib>

#include "common.h"
#include "reads.h"
#include "tokenize.h"
#include "qual.h"

//bool fastq_db = true;

using namespace std;

void format_qual_string(string& qual_str)
{
  for (size_t i = 0; i < qual_str.size(); ++i)
    {
      qual_str[i] = charToPhred33(qual_str[i], 
                                      solexa_quals, 
                                      phred64_quals);
    }
}

bool readmap_loaded=false;
vector<bool> readmap; //for filter_multihits

void load_readmap() {
  readmap_loaded=false;
  if (flt_reads.empty()) return;
  //FZPipe(filter_multihits, false);
  ReadStream rdstream(flt_reads, NULL, true);
  /*
  FILE* rfile=fopen(flt_reads.c_str(),"r");
  if (rfile==NULL)
     err_die("Error: cannot read file %s\n", flt_reads.c_str());
  FLineReader fr(rfile);
  skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this
  */
  Read read;
  //while (next_fastx_read(fr, read, reads_format)) {
  while (rdstream.get_direct(read, reads_format)) {
    uint32_t rnum=(uint32_t)atol(read.name.c_str());
    if (rnum>=(uint32_t) readmap.size())
         readmap.resize(rnum+1, false);
    readmap[rnum] = true;
    }
  readmap_loaded=true;
  //fclose(rfile);
}

bool check_readmap(uint32_t read_id) {
 if (read_id>=readmap.size()) return false;
    else return readmap[read_id];
}

void flt_reads_and_hits(vector<string>& reads_files) {
  if (!readmap_loaded)
      err_die("Error: filtering reads not enabled, aborting.");
  if (aux_outfile.empty())
       err_die("Error: auxiliary output file not provided.");
  //if (index_outfile.empty())
  //  err_die("Error: index output file not provided.");
  
  // -- filter mappings:
  string fext=getFext(flt_mappings);
  if (fext=="bam") {
        samfile_t* fbam=samopen(flt_mappings.c_str(), "rb", 0);
    if (fbam==NULL)
        err_die("Error opening BAM file %s!\n", flt_mappings.c_str());
    bam1_t *b = bam_init1();
    string aux_index(aux_outfile);
    aux_index+=".index";
    GBamWriter wbam(aux_outfile.c_str(), fbam->header, aux_index);
    while (samread(fbam, b) > 0) {
      char* rname  = bam1_qname(b);
      uint32_t rid=(uint32_t)atol(rname);
      if (check_readmap(rid)) {
         //write this mapping into the output file
         wbam.write(b, rid);
         }
      }
    bam_destroy1(b);
    samclose(fbam);
    }
  else {
        bool is_sam=false;
        string s(flt_mappings);
        for(size_t i=0; i!=s.length(); ++i)
                        s[i] = std::tolower(s[i]);
        if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
        string unzcmd=getUnpackCmd(flt_mappings);
        FZPipe hitsfile(flt_mappings, unzcmd);
        FLineReader fh(hitsfile);
        FZPipe outfile;
        outfile.openWrite(aux_outfile.c_str(), zpacker);
        if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
        const char* line;
        while ((line=fh.nextLine())) {
          if (is_sam && line[0]=='@') {
                  //copy the header
                  fprintf(outfile.file, "%s\n", line);
                  continue;
                  }
          char* tab=strchr((char*)line, '\t');
          if (tab==NULL)
                  err_die("Error: cannot find tab character in %s mappings line:\n%s",
                          flt_mappings.c_str(),line);
          *tab=0;
          uint32_t rid = (uint32_t) atol(line);
          if (rid==0)
                err_die("Error: invalid read ID (%s) parsed from mapping file %s",
                          line, flt_mappings.c_str());
          *tab='\t';
          if (check_readmap(rid)) {
                 fprintf(outfile.file, "%s\n", line);
                 }
          }
        outfile.close();
        hitsfile.close();
        }
  // -- now filter reads
  for (size_t fi = 0; fi < reads_files.size(); ++fi) {
      //only one file expected here, this is not the initial prep_reads
      Read read;
      ReadStream readstream(reads_files[fi], NULL, true);
      //skip_lines(fr);
      while (readstream.get_direct(read)) {
        uint32_t rnum=(uint32_t)atol(read.name.c_str());
        if (check_readmap(rnum))
          printf("@%s\n%s\n+%s\n%s\n",
             read.name.c_str(),
             read.seq.c_str(),
             read.alt_name.c_str(),
             read.qual.c_str());
        }
    }
}


void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0) {
  if (wbam==NULL) return;
  string rnum;
  str_appendUInt(rnum,rid);
  string rname(read.name);
  size_t pl=rname.length()-1;
  GBamRecord bamrec(rnum.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str());
  int matenum=0;
  if (rname[pl-1]=='/') {
                if (rname[pl]=='1') {
                    bamrec.set_flag(BAM_FREAD1);
                        matenum=1;
                }
                else if (rname[pl]=='2') {
                    bamrec.set_flag(BAM_FREAD2);
                        matenum=2;
                }
                if (matenum) rname.resize(pl-1);
        }

  bamrec.add_aux("ZN", 'Z',read.name.length(),(uint8_t*)rname.c_str());
  if (trashcode) {
         bamrec.set_flag(BAM_FQCFAIL);
     bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&trashcode);
     }
  wbam->write(bamrec.get_b(), rid);
}

void process_reads(vector<string>& reads_files, vector<FZPipe>& quals_files)
{       
   //TODO: add the option to write the garbage reads into separate file(s)
  int num_reads_chucked = 0;
  int multimap_chucked = 0;
  int min_read_len = 20000000;
  int max_read_len = 0;
  uint32_t next_id = 0;
  uint64_t file_offset = 0;
  FILE* fw=NULL;
  if (!aux_outfile.empty()) {
    fw=fopen(aux_outfile.c_str(), "w");
    if (fw==NULL)
       err_die("Error: cannot create file %s\n",aux_outfile.c_str());
    }

  FILE* findex = NULL;
  GBamWriter* wbam=NULL;
  FILE* fout=NULL;
  if (std_outfile.empty()) {
    fout=stdout;
  }
  else { //output file name explicitely given
        if (getFext(std_outfile)=="bam") {
          if (sam_header.empty()) err_die("Error: sam header file not provided.\n");
          wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile);
          //wbam = new GBamWriter(std_outfile, index_outfile);
        }
        else {
          fout = fopen(std_outfile.c_str(), "w");
          if (fout==NULL)
               err_die("Error: cannot create file %s\n", std_outfile.c_str());
        }
  }
  if (wbam==NULL && !index_outfile.empty()) {
    findex = fopen(index_outfile.c_str(), "w");
    if (findex == NULL)
       err_die("Error: cannot create file %s\n", index_outfile.c_str());
    }

  for (size_t fi = 0; fi < reads_files.size(); ++fi)
    {
      Read read;
      FZPipe* fq=NULL;
      if (quals)
         fq = & quals_files[fi];
      ReadStream reads(reads_files[fi], fq, true);
      
      while (reads.get_direct(read, reads_format)) {
            // read.clear();
            // Get the next read from the file
        //if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
        //    break;

        ++next_id;
        //IMPORTANT: to keep paired reads in sync, this must be
        //incremented BEFORE any reads are chucked !
        if (read.seq.length()<12) {
            ++num_reads_chucked;
            writePrepBam(wbam, read, next_id, 'S');
            continue;
            }
        if ((int)read.seq.length()<min_read_len)
             min_read_len=read.seq.length();
        if ((int)read.seq.length()>max_read_len)
             max_read_len=read.seq.length();

        // daehwan - check this later, it's due to bowtie
        if (color && read.seq[1] == '4') {
          ++num_reads_chucked;
          writePrepBam(wbam, read, next_id, 'c');
          continue;
          }

        if (readmap_loaded && check_readmap(next_id)) {
          ++num_reads_chucked;
          ++multimap_chucked;
          writePrepBam(wbam, read, next_id, 'M');
          continue;
          }
        format_qual_string(read.qual);
        std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
        char counts[256];
        memset(counts, 0, sizeof(counts));
        // Count up the bad characters
        for (unsigned int i = 0; i != read.seq.length(); ++i)
          {
            char c = (char)toupper(read.seq[i]);
            counts[(size_t)c]++;
          }

        double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
        double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
        double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
        double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
        double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
        double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();

        // Chuck the read if there are at least 5 'N's or if it's mostly
        // (>90%) 'N's and 'A's
        char trash_code=0;
        if (percent_A > 0.9 ||
            percent_C > 0.9 ||
            percent_G > 0.9 ||
            percent_T > 0.9)
              trash_code='L';
        else if (percent_N >= 0.1 || percent_4 >=0.1)
              trash_code='N';
        if (trash_code) {
          ++num_reads_chucked;
          writePrepBam(wbam, read, next_id, trash_code);
          continue;
        }

        if (wbam) {
          writePrepBam(wbam, read, next_id);
        }
        else {
                  // daehwan - we should not use buf in printf function
                  // because it may contain some control characters such as "\" from quality values.
                  // Here, buf is only used for calculating the file offset
                  char buf[2048] = {0};
                  if (reads_format == FASTQ or (reads_format == FASTA && quals))
                                {
                  sprintf(buf, "@%u\n%s\n+%s\n%s\n",
                          next_id,
                          read.seq.c_str(),
                          read.name.c_str(),
                          read.qual.c_str());

                  fprintf(fout, "@%u\n%s\n+%s\n%s\n",
                                 next_id,
                                 read.seq.c_str(),
                                 read.name.c_str(),
                                 read.qual.c_str());
                                }
                          else if (reads_format == FASTA)
                                {
                                  string qual;
                                  if (color)
                                        qual = string(read.seq.length()-1, 'I').c_str();
                                  else
                                        qual = string(read.seq.length(), 'I').c_str();

                                  sprintf(buf, "@%u\n%s\n+%s\n%s\n",
                          next_id,
                          read.seq.c_str(),
                          read.name.c_str(),
                          qual.c_str());

                  fprintf(fout, "@%u\n%s\n+%s\n%s\n",
                          next_id,
                          read.seq.c_str(),
                          read.name.c_str(),
                          qual.c_str());
                                }
                  else
                        {
                  assert(0);
                        }

                  if (findex != NULL)
                        {
                  if ((next_id - num_reads_chucked) % INDEX_REC_COUNT == 0)
                        fprintf(findex, "%d\t%lu\n", next_id, file_offset);
                        }

                  file_offset += strlen(buf);
          }
      } //while !fr.isEof()
    //fr.close();
    //frq.close();
    } //for each input file
  fprintf(stderr, "%u out of %u reads have been filtered out\n",
          num_reads_chucked, next_id);
  if (readmap_loaded)
    fprintf(stderr, "\t(%u filtered out due to %s)\n",
        multimap_chucked, flt_reads.c_str());
  if (wbam) { delete wbam; }
  if (fout && fout!=stdout) fclose(fout);
  if (fw!=NULL) {
    fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
    fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
    fprintf(fw, "reads_in =%d\n",next_id);
    fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
    fclose(fw);
    }

  if (findex != NULL)
    fclose(findex);
}

void print_usage()
{
  fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n");
  //alternate usage (--ctv_to_num) : doesn't filter out any reads,
  //but simply convert read names to numeric ids
}


int main(int argc, char *argv[])
{
  fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); 
  fprintf(stderr, "---------------------------\n");
  
  int parse_ret = parse_options(argc, argv, print_usage);
  if (parse_ret)
    return parse_ret;
  
  if(optind >= argc)
    {
      print_usage();
      return 1;
    }
  
  string reads_file_list = argv[optind++];
  
  vector<string> reads_filenames;
  tokenize(reads_file_list, ",",reads_filenames);
  vector<string> quals_file_names;
  vector<FZPipe> quals_files;
  if (quals)
    {
      string quals_file_list = argv[optind++];
      tokenize(quals_file_list, ",", quals_file_names);
      for (size_t i = 0; i < quals_file_names.size(); ++i)
        {
      FZPipe seg_file(quals_file_names[i], true);
          if (seg_file.file == NULL)
            {
              fprintf(stderr, "Error: cannot open reads file %s for reading\n",
                      quals_file_names[i].c_str());
              exit(1);
              }
          quals_files.push_back(seg_file);
        }
    }
  load_readmap();
  // Only print to standard out the good reads
  //TODO: a better, more generic read filtering protocol
  if (flt_mappings.empty())
     process_reads(reads_filenames, quals_files);
  else //special use case: filter previous bowtie results
     flt_reads_and_hits(reads_filenames);
    
  return 0;
}
Revision:	223
Committed:	Fri Mar 23 22:39:52 2012 UTC (12 years, 6 months ago) by gpertea
File size:	12606 byte(s)
Log Message:
Line	File contents
1	/*
2	* polyA_reads.cpp
3	* TopHat
4	*
5	* Created by Cole Trapnell on 9/2/08.
6	* Copyright 2008 Cole Trapnell. All rights reserved.
7	* Derived from maq "catfilter", by Heng Li at Sanger
8	*/
9
10	#ifdef HAVE_CONFIG_H
11	#include <config.h>
12	#endif
13
14	#include <stdio.h>
15	#include <cassert>
16	#include <vector>
17	#include <cstring>
18	#include <cstdlib>
19
20	#include "common.h"
21	#include "reads.h"
22	#include "tokenize.h"
23	#include "qual.h"
24
25	//bool fastq_db = true;
26
27	using namespace std;
28
29	void format_qual_string(string& qual_str)
30	{
31	for (size_t i = 0; i < qual_str.size(); ++i)
32	{
33	qual_str[i] = charToPhred33(qual_str[i],
34	solexa_quals,
35	phred64_quals);
36	}
37	}
38
39	bool readmap_loaded=false;
40	vector<bool> readmap; //for filter_multihits
41
42	void load_readmap() {
43	readmap_loaded=false;
44	if (flt_reads.empty()) return;
45	//FZPipe(filter_multihits, false);
46	ReadStream rdstream(flt_reads, NULL, true);
47	/*
48	FILE* rfile=fopen(flt_reads.c_str(),"r");
49	if (rfile==NULL)
50	err_die("Error: cannot read file %s\n", flt_reads.c_str());
51	FLineReader fr(rfile);
52	skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this
53	*/
54	Read read;
55	//while (next_fastx_read(fr, read, reads_format)) {
56	while (rdstream.get_direct(read, reads_format)) {
57	uint32_t rnum=(uint32_t)atol(read.name.c_str());
58	if (rnum>=(uint32_t) readmap.size())
59	readmap.resize(rnum+1, false);
60	readmap[rnum] = true;
61	}
62	readmap_loaded=true;
63	//fclose(rfile);
64	}
65
66	bool check_readmap(uint32_t read_id) {
67	if (read_id>=readmap.size()) return false;
68	else return readmap[read_id];
69	}
70
71	void flt_reads_and_hits(vector<string>& reads_files) {
72	if (!readmap_loaded)
73	err_die("Error: filtering reads not enabled, aborting.");
74	if (aux_outfile.empty())
75	err_die("Error: auxiliary output file not provided.");
76	//if (index_outfile.empty())
77	// err_die("Error: index output file not provided.");
78
79	// -- filter mappings:
80	string fext=getFext(flt_mappings);
81	if (fext=="bam") {
82	samfile_t* fbam=samopen(flt_mappings.c_str(), "rb", 0);
83	if (fbam==NULL)
84	err_die("Error opening BAM file %s!\n", flt_mappings.c_str());
85	bam1_t *b = bam_init1();
86	string aux_index(aux_outfile);
87	aux_index+=".index";
88	GBamWriter wbam(aux_outfile.c_str(), fbam->header, aux_index);
89	while (samread(fbam, b) > 0) {
90	char* rname = bam1_qname(b);
91	uint32_t rid=(uint32_t)atol(rname);
92	if (check_readmap(rid)) {
93	//write this mapping into the output file
94	wbam.write(b, rid);
95	}
96	}
97	bam_destroy1(b);
98	samclose(fbam);
99	}
100	else {
101	bool is_sam=false;
102	string s(flt_mappings);
103	for(size_t i=0; i!=s.length(); ++i)
104	s[i] = std::tolower(s[i]);
105	if (fext=="sam" \|\| s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
106	string unzcmd=getUnpackCmd(flt_mappings);
107	FZPipe hitsfile(flt_mappings, unzcmd);
108	FLineReader fh(hitsfile);
109	FZPipe outfile;
110	outfile.openWrite(aux_outfile.c_str(), zpacker);
111	if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
112	const char* line;
113	while ((line=fh.nextLine())) {
114	if (is_sam && line[0]=='@') {
115	//copy the header
116	fprintf(outfile.file, "%s\n", line);
117	continue;
118	}
119	char* tab=strchr((char*)line, '\t');
120	if (tab==NULL)
121	err_die("Error: cannot find tab character in %s mappings line:\n%s",
122	flt_mappings.c_str(),line);
123	*tab=0;
124	uint32_t rid = (uint32_t) atol(line);
125	if (rid==0)
126	err_die("Error: invalid read ID (%s) parsed from mapping file %s",
127	line, flt_mappings.c_str());
128	*tab='\t';
129	if (check_readmap(rid)) {
130	fprintf(outfile.file, "%s\n", line);
131	}
132	}
133	outfile.close();
134	hitsfile.close();
135	}
136	// -- now filter reads
137	for (size_t fi = 0; fi < reads_files.size(); ++fi) {
138	//only one file expected here, this is not the initial prep_reads
139	Read read;
140	ReadStream readstream(reads_files[fi], NULL, true);
141	//skip_lines(fr);
142	while (readstream.get_direct(read)) {
143	uint32_t rnum=(uint32_t)atol(read.name.c_str());
144	if (check_readmap(rnum))
145	printf("@%s\n%s\n+%s\n%s\n",
146	read.name.c_str(),
147	read.seq.c_str(),
148	read.alt_name.c_str(),
149	read.qual.c_str());
150	}
151	}
152	}
153
154
155	void writePrepBam(GBamWriter* wbam, Read& read, uint32_t rid, char trashcode=0) {
156	if (wbam==NULL) return;
157	string rnum;
158	str_appendUInt(rnum,rid);
159	string rname(read.name);
160	size_t pl=rname.length()-1;
161	GBamRecord bamrec(rnum.c_str(), -1, 0, false, read.seq.c_str(), NULL, read.qual.c_str());
162	int matenum=0;
163	if (rname[pl-1]=='/') {
164	if (rname[pl]=='1') {
165	bamrec.set_flag(BAM_FREAD1);
166	matenum=1;
167	}
168	else if (rname[pl]=='2') {
169	bamrec.set_flag(BAM_FREAD2);
170	matenum=2;
171	}
172	if (matenum) rname.resize(pl-1);
173	}
174
175	bamrec.add_aux("ZN", 'Z',read.name.length(),(uint8_t*)rname.c_str());
176	if (trashcode) {
177	bamrec.set_flag(BAM_FQCFAIL);
178	bamrec.add_aux("ZT", 'A', 1, (uint8_t*)&trashcode);
179	}
180	wbam->write(bamrec.get_b(), rid);
181	}
182
183	void process_reads(vector<string>& reads_files, vector<FZPipe>& quals_files)
184	{
185	//TODO: add the option to write the garbage reads into separate file(s)
186	int num_reads_chucked = 0;
187	int multimap_chucked = 0;
188	int min_read_len = 20000000;
189	int max_read_len = 0;
190	uint32_t next_id = 0;
191	uint64_t file_offset = 0;
192	FILE* fw=NULL;
193	if (!aux_outfile.empty()) {
194	fw=fopen(aux_outfile.c_str(), "w");
195	if (fw==NULL)
196	err_die("Error: cannot create file %s\n",aux_outfile.c_str());
197	}
198
199	FILE* findex = NULL;
200	GBamWriter* wbam=NULL;
201	FILE* fout=NULL;
202	if (std_outfile.empty()) {
203	fout=stdout;
204	}
205	else { //output file name explicitely given
206	if (getFext(std_outfile)=="bam") {
207	if (sam_header.empty()) err_die("Error: sam header file not provided.\n");
208	wbam = new GBamWriter(std_outfile.c_str(), sam_header.c_str(), index_outfile);
209	//wbam = new GBamWriter(std_outfile, index_outfile);
210	}
211	else {
212	fout = fopen(std_outfile.c_str(), "w");
213	if (fout==NULL)
214	err_die("Error: cannot create file %s\n", std_outfile.c_str());
215	}
216	}
217	if (wbam==NULL && !index_outfile.empty()) {
218	findex = fopen(index_outfile.c_str(), "w");
219	if (findex == NULL)
220	err_die("Error: cannot create file %s\n", index_outfile.c_str());
221	}
222
223	for (size_t fi = 0; fi < reads_files.size(); ++fi)
224	{
225	Read read;
226	FZPipe* fq=NULL;
227	if (quals)
228	fq = & quals_files[fi];
229	ReadStream reads(reads_files[fi], fq, true);
230
231	while (reads.get_direct(read, reads_format)) {
232	// read.clear();
233	// Get the next read from the file
234	//if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
235	// break;
236
237	++next_id;
238	//IMPORTANT: to keep paired reads in sync, this must be
239	//incremented BEFORE any reads are chucked !
240	if (read.seq.length()<12) {
241	++num_reads_chucked;
242	writePrepBam(wbam, read, next_id, 'S');
243	continue;
244	}
245	if ((int)read.seq.length()<min_read_len)
246	min_read_len=read.seq.length();
247	if ((int)read.seq.length()>max_read_len)
248	max_read_len=read.seq.length();
249
250	// daehwan - check this later, it's due to bowtie
251	if (color && read.seq[1] == '4') {
252	++num_reads_chucked;
253	writePrepBam(wbam, read, next_id, 'c');
254	continue;
255	}
256
257	if (readmap_loaded && check_readmap(next_id)) {
258	++num_reads_chucked;
259	++multimap_chucked;
260	writePrepBam(wbam, read, next_id, 'M');
261	continue;
262	}
263	format_qual_string(read.qual);
264	std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
265	char counts[256];
266	memset(counts, 0, sizeof(counts));
267	// Count up the bad characters
268	for (unsigned int i = 0; i != read.seq.length(); ++i)
269	{
270	char c = (char)toupper(read.seq[i]);
271	counts[(size_t)c]++;
272	}
273
274	double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
275	double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
276	double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
277	double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
278	double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
279	double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
280
281	// Chuck the read if there are at least 5 'N's or if it's mostly
282	// (>90%) 'N's and 'A's
283	char trash_code=0;
284	if (percent_A > 0.9 \|\|
285	percent_C > 0.9 \|\|
286	percent_G > 0.9 \|\|
287	percent_T > 0.9)
288	trash_code='L';
289	else if (percent_N >= 0.1 \|\| percent_4 >=0.1)
290	trash_code='N';
291	if (trash_code) {
292	++num_reads_chucked;
293	writePrepBam(wbam, read, next_id, trash_code);
294	continue;
295	}
296
297	if (wbam) {
298	writePrepBam(wbam, read, next_id);
299	}
300	else {
301	// daehwan - we should not use buf in printf function
302	// because it may contain some control characters such as "\" from quality values.
303	// Here, buf is only used for calculating the file offset
304	char buf[2048] = {0};
305	if (reads_format == FASTQ or (reads_format == FASTA && quals))
306	{
307	sprintf(buf, "@%u\n%s\n+%s\n%s\n",
308	next_id,
309	read.seq.c_str(),
310	read.name.c_str(),
311	read.qual.c_str());
312
313	fprintf(fout, "@%u\n%s\n+%s\n%s\n",
314	next_id,
315	read.seq.c_str(),
316	read.name.c_str(),
317	read.qual.c_str());
318	}
319	else if (reads_format == FASTA)
320	{
321	string qual;
322	if (color)
323	qual = string(read.seq.length()-1, 'I').c_str();
324	else
325	qual = string(read.seq.length(), 'I').c_str();
326
327	sprintf(buf, "@%u\n%s\n+%s\n%s\n",
328	next_id,
329	read.seq.c_str(),
330	read.name.c_str(),
331	qual.c_str());
332
333	fprintf(fout, "@%u\n%s\n+%s\n%s\n",
334	next_id,
335	read.seq.c_str(),
336	read.name.c_str(),
337	qual.c_str());
338	}
339	else
340	{
341	assert(0);
342	}
343
344	if (findex != NULL)
345	{
346	if ((next_id - num_reads_chucked) % INDEX_REC_COUNT == 0)
347	fprintf(findex, "%d\t%lu\n", next_id, file_offset);
348	}
349
350	file_offset += strlen(buf);
351	}
352	} //while !fr.isEof()
353	//fr.close();
354	//frq.close();
355	} //for each input file
356	fprintf(stderr, "%u out of %u reads have been filtered out\n",
357	num_reads_chucked, next_id);
358	if (readmap_loaded)
359	fprintf(stderr, "\t(%u filtered out due to %s)\n",
360	multimap_chucked, flt_reads.c_str());
361	if (wbam) { delete wbam; }
362	if (fout && fout!=stdout) fclose(fout);
363	if (fw!=NULL) {
364	fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
365	fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
366	fprintf(fw, "reads_in =%d\n",next_id);
367	fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
368	fclose(fw);
369	}
370
371	if (findex != NULL)
372	fclose(findex);
373	}
374
375	void print_usage()
376	{
377	fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n");
378	//alternate usage (--ctv_to_num) : doesn't filter out any reads,
379	//but simply convert read names to numeric ids
380	}
381
382
383	int main(int argc, char *argv[])
384	{
385	fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
386	fprintf(stderr, "---------------------------\n");
387
388	int parse_ret = parse_options(argc, argv, print_usage);
389	if (parse_ret)
390	return parse_ret;
391
392	if(optind >= argc)
393	{
394	print_usage();
395	return 1;
396	}
397
398	string reads_file_list = argv[optind++];
399
400	vector<string> reads_filenames;
401	tokenize(reads_file_list, ",",reads_filenames);
402	vector<string> quals_file_names;
403	vector<FZPipe> quals_files;
404	if (quals)
405	{
406	string quals_file_list = argv[optind++];
407	tokenize(quals_file_list, ",", quals_file_names);
408	for (size_t i = 0; i < quals_file_names.size(); ++i)
409	{
410	FZPipe seg_file(quals_file_names[i], true);
411	if (seg_file.file == NULL)
412	{
413	fprintf(stderr, "Error: cannot open reads file %s for reading\n",
414	quals_file_names[i].c_str());
415	exit(1);
416	}
417	quals_files.push_back(seg_file);
418	}
419	}
420	load_readmap();
421	// Only print to standard out the good reads
422	//TODO: a better, more generic read filtering protocol
423	if (flt_mappings.empty())
424	process_reads(reads_filenames, quals_files);
425	else //special use case: filter previous bowtie results
426	flt_reads_and_hits(reads_filenames);
427
428	return 0;
429	}