gclib/tophat_cpp/prep_reads.cpp

/*
 *  polyA_reads.cpp
 *  TopHat
 *
 *  Created by Cole Trapnell on 9/2/08.
 *  Copyright 2008 Cole Trapnell. All rights reserved.
 *      Derived from maq "catfilter", by Heng Li at Sanger
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <cassert>
#include <vector>
#include <cstring>
#include <cstdlib>

#include "common.h"
#include "reads.h"
#include "tokenize.h"
#include "qual.h"

//bool fastq_db = true;

using namespace std;

void format_qual_string(string& qual_str)
{
  for (size_t i = 0; i < qual_str.size(); ++i)
    {
      qual_str[i] = charToPhred33(qual_str[i], 
                                      solexa_quals, 
                                      phred64_quals);
    }
}


void filter_garbage_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files)
{       

  int num_reads_chucked = 0, num_reads = 0;
  int min_read_len = 2000000;
  int max_read_len = 0;
  int next_id = 0;
  FILE* fw=NULL;
  if (!aux_outfile.empty()) {
    fw=fopen(aux_outfile.c_str(), "w");
    if (fw==NULL)
       err_die("Error: cannot create file %s\n",aux_outfile.c_str());
    }

  for (size_t fi = 0; fi < reads_files.size(); ++fi)
    {
      Read read;
      FLineReader fr(reads_files[fi]);
      skip_lines(fr);
      
      FZPipe fq;
      if (quals)
            fq = quals_files[fi];
      FLineReader frq(fq);
      skip_lines(frq);
      
      while (!fr.isEof()) {
            //read.clear();
            // Get the next read from the file
        if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
            break;
        if (read.seq.length()<12) {
            ++num_reads_chucked;
            continue;
            }
        if ((int)read.seq.length()<min_read_len) min_read_len=read.seq.length();
        if ((int)read.seq.length()>max_read_len) max_read_len=read.seq.length();
            format_qual_string(read.qual);
        std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
        ++num_reads;
        ++next_id;
        char counts[256];
        memset(counts, 0, sizeof(counts));
        // Count up the bad characters
        for (unsigned int i = 0; i != read.seq.length(); ++i)
          {
            char c = (char)toupper(read.seq[i]);
            counts[(size_t)c]++;
          }

        double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
        double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
        double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
        double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
        double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
        double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();

        if (reads_format == FASTQ &&
               read.seq.length() != read.qual.length())
          {
            ++num_reads_chucked;
            continue;
          }

        // daehwan - check this later, it's due to bowtie
        if (color && read.seq[1] == '4') {
          continue;
          }
        // Chuck the read if there are at least 5 'N's or if it's mostly
        // (>90%) 'N's and 'A's

        if (percent_A > 0.9 ||
            percent_C > 0.9 ||
            percent_G > 0.9 ||
            percent_T > 0.9 ||
            percent_N >= 0.1 ||
            percent_4 >= 0.1)
          {
            ++num_reads_chucked;
          }
        else
          {
         /*   if (!fastq_db)
          {
            if (reads_format == FASTQ  or (reads_format == FASTA && quals))
              printf("@%s\n%s\n+\n%s\n",
                 read.name.c_str(), read.seq.c_str(),read.qual.c_str());
            else if (reads_format == FASTA)
              printf(">%s\n%s\n", read.name.c_str(), read.seq.c_str());
          }
            else
          { */
            if (reads_format == FASTQ or (reads_format == FASTA && quals))
              {
                printf("@%d\n%s\n+%s\n%s\n",
                   next_id,
                   read.seq.c_str(),
                   read.name.c_str(),
                   read.qual.c_str());
              }
            else if (reads_format == FASTA)
              {
                string qual;
                if (color) {
                  qual = "!";
                  qual += string(read.seq.length()-1, 'I').c_str();
                  }
                else
              qual = string(read.seq.length(), 'I').c_str();

                printf("@%d\n%s\n+%s\n%s\n",
                   next_id,
                   read.seq.c_str(),
                   read.name.c_str(),
                   qual.c_str());
              }
          //} only used if fastq_db is false (?)
          }
      } //while !fr.isEof()
    fr.close();
    frq.close();
    } //for each input file
  
  fprintf(stderr, "%d out of %d reads have been filtered out\n", 
          num_reads_chucked, num_reads);
  if (fw!=NULL) {
    fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
    fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
    fprintf(fw, "reads_in =%d\n",num_reads);
    fprintf(fw, "reads_out=%d\n",num_reads-num_reads_chucked);
    fclose(fw);
    }
}

void print_usage()
{
  fprintf(stderr, "Usage:   prep_reads <reads1.fa/fq,...,readsN.fa/fq>\n");
}


int main(int argc, char *argv[])
{
  fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION); 
  fprintf(stderr, "---------------------------\n");
  
  int parse_ret = parse_options(argc, argv, print_usage);
  if (parse_ret)
    return parse_ret;
  
  if(optind >= argc)
    {
      print_usage();
      return 1;
    }
  
  string reads_file_list = argv[optind++];
  
  vector<string> reads_file_names;
  vector<FZPipe> reads_files;
  tokenize(reads_file_list, ",",reads_file_names);
  for (size_t i = 0; i < reads_file_names.size(); ++i)
    {
      string fname=reads_file_names[i];
      string pipecmd=guess_packer(fname, true);
      if (!pipecmd.empty()) pipecmd.append(" -cd");
      FZPipe seg_file(fname,pipecmd);
      if (seg_file.file == NULL) {
          fprintf(stderr, "Error: cannot open reads file %s for reading\n",
              reads_file_names[i].c_str());
              exit(1);
          }
      reads_files.push_back(seg_file);
    }
  
  vector<string> quals_file_names;
  vector<FZPipe> quals_files;
  if (quals)
    {
      string quals_file_list = argv[optind++];
      tokenize(quals_file_list, ",", quals_file_names);
      for (size_t i = 0; i < quals_file_names.size(); ++i)
        {
      string fname(quals_file_names[i]);
      string pipecmd=guess_packer(fname, true);
      FZPipe seg_file(fname, pipecmd);
          if (seg_file.file == NULL)
            {
              fprintf(stderr, "Error: cannot open reads file %s for reading\n",
                      quals_file_names[i].c_str());
              exit(1);
              }
          quals_files.push_back(seg_file);
        }
    }
  
  // Only print to standard out the good reads
  //TODO: a better, more generic read filtering protocol
  filter_garbage_reads(reads_files, quals_files);
    
  return 0;
}
Revision:	68
Committed:	Mon Sep 19 20:09:26 2011 UTC (13 years ago) by gpertea
File size:	6934 byte(s)
Log Message:	first attempt at accepting different Color qvlen in prep_reads
Line	File contents
1	/*
2	* polyA_reads.cpp
3	* TopHat
4	*
5	* Created by Cole Trapnell on 9/2/08.
6	* Copyright 2008 Cole Trapnell. All rights reserved.
7	* Derived from maq "catfilter", by Heng Li at Sanger
8	*/
9
10	#ifdef HAVE_CONFIG_H
11	#include <config.h>
12	#endif
13
14	#include <stdio.h>
15	#include <cassert>
16	#include <vector>
17	#include <cstring>
18	#include <cstdlib>
19
20	#include "common.h"
21	#include "reads.h"
22	#include "tokenize.h"
23	#include "qual.h"
24
25	//bool fastq_db = true;
26
27	using namespace std;
28
29	void format_qual_string(string& qual_str)
30	{
31	for (size_t i = 0; i < qual_str.size(); ++i)
32	{
33	qual_str[i] = charToPhred33(qual_str[i],
34	solexa_quals,
35	phred64_quals);
36	}
37	}
38
39
40	void filter_garbage_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files)
41	{
42
43	int num_reads_chucked = 0, num_reads = 0;
44	int min_read_len = 2000000;
45	int max_read_len = 0;
46	int next_id = 0;
47	FILE* fw=NULL;
48	if (!aux_outfile.empty()) {
49	fw=fopen(aux_outfile.c_str(), "w");
50	if (fw==NULL)
51	err_die("Error: cannot create file %s\n",aux_outfile.c_str());
52	}
53
54	for (size_t fi = 0; fi < reads_files.size(); ++fi)
55	{
56	Read read;
57	FLineReader fr(reads_files[fi]);
58	skip_lines(fr);
59
60	FZPipe fq;
61	if (quals)
62	fq = quals_files[fi];
63	FLineReader frq(fq);
64	skip_lines(frq);
65
66	while (!fr.isEof()) {
67	//read.clear();
68	// Get the next read from the file
69	if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
70	break;
71	if (read.seq.length()<12) {
72	++num_reads_chucked;
73	continue;
74	}
75	if ((int)read.seq.length()<min_read_len) min_read_len=read.seq.length();
76	if ((int)read.seq.length()>max_read_len) max_read_len=read.seq.length();
77	format_qual_string(read.qual);
78	std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
79	++num_reads;
80	++next_id;
81	char counts[256];
82	memset(counts, 0, sizeof(counts));
83	// Count up the bad characters
84	for (unsigned int i = 0; i != read.seq.length(); ++i)
85	{
86	char c = (char)toupper(read.seq[i]);
87	counts[(size_t)c]++;
88	}
89
90	double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
91	double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
92	double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
93	double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
94	double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
95	double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
96
97	if (reads_format == FASTQ &&
98	read.seq.length() != read.qual.length())
99	{
100	++num_reads_chucked;
101	continue;
102	}
103
104	// daehwan - check this later, it's due to bowtie
105	if (color && read.seq[1] == '4') {
106	continue;
107	}
108	// Chuck the read if there are at least 5 'N's or if it's mostly
109	// (>90%) 'N's and 'A's
110
111	if (percent_A > 0.9 \|\|
112	percent_C > 0.9 \|\|
113	percent_G > 0.9 \|\|
114	percent_T > 0.9 \|\|
115	percent_N >= 0.1 \|\|
116	percent_4 >= 0.1)
117	{
118	++num_reads_chucked;
119	}
120	else
121	{
122	/* if (!fastq_db)
123	{
124	if (reads_format == FASTQ or (reads_format == FASTA && quals))
125	printf("@%s\n%s\n+\n%s\n",
126	read.name.c_str(), read.seq.c_str(),read.qual.c_str());
127	else if (reads_format == FASTA)
128	printf(">%s\n%s\n", read.name.c_str(), read.seq.c_str());
129	}
130	else
131	{ */
132	if (reads_format == FASTQ or (reads_format == FASTA && quals))
133	{
134	printf("@%d\n%s\n+%s\n%s\n",
135	next_id,
136	read.seq.c_str(),
137	read.name.c_str(),
138	read.qual.c_str());
139	}
140	else if (reads_format == FASTA)
141	{
142	string qual;
143	if (color) {
144	qual = "!";
145	qual += string(read.seq.length()-1, 'I').c_str();
146	}
147	else
148	qual = string(read.seq.length(), 'I').c_str();
149
150	printf("@%d\n%s\n+%s\n%s\n",
151	next_id,
152	read.seq.c_str(),
153	read.name.c_str(),
154	qual.c_str());
155	}
156	//} only used if fastq_db is false (?)
157	}
158	} //while !fr.isEof()
159	fr.close();
160	frq.close();
161	} //for each input file
162
163	fprintf(stderr, "%d out of %d reads have been filtered out\n",
164	num_reads_chucked, num_reads);
165	if (fw!=NULL) {
166	fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
167	fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
168	fprintf(fw, "reads_in =%d\n",num_reads);
169	fprintf(fw, "reads_out=%d\n",num_reads-num_reads_chucked);
170	fclose(fw);
171	}
172	}
173
174	void print_usage()
175	{
176	fprintf(stderr, "Usage: prep_reads <reads1.fa/fq,...,readsN.fa/fq>\n");
177	}
178
179
180	int main(int argc, char *argv[])
181	{
182	fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
183	fprintf(stderr, "---------------------------\n");
184
185	int parse_ret = parse_options(argc, argv, print_usage);
186	if (parse_ret)
187	return parse_ret;
188
189	if(optind >= argc)
190	{
191	print_usage();
192	return 1;
193	}
194
195	string reads_file_list = argv[optind++];
196
197	vector<string> reads_file_names;
198	vector<FZPipe> reads_files;
199	tokenize(reads_file_list, ",",reads_file_names);
200	for (size_t i = 0; i < reads_file_names.size(); ++i)
201	{
202	string fname=reads_file_names[i];
203	string pipecmd=guess_packer(fname, true);
204	if (!pipecmd.empty()) pipecmd.append(" -cd");
205	FZPipe seg_file(fname,pipecmd);
206	if (seg_file.file == NULL) {
207	fprintf(stderr, "Error: cannot open reads file %s for reading\n",
208	reads_file_names[i].c_str());
209	exit(1);
210	}
211	reads_files.push_back(seg_file);
212	}
213
214	vector<string> quals_file_names;
215	vector<FZPipe> quals_files;
216	if (quals)
217	{
218	string quals_file_list = argv[optind++];
219	tokenize(quals_file_list, ",", quals_file_names);
220	for (size_t i = 0; i < quals_file_names.size(); ++i)
221	{
222	string fname(quals_file_names[i]);
223	string pipecmd=guess_packer(fname, true);
224	FZPipe seg_file(fname, pipecmd);
225	if (seg_file.file == NULL)
226	{
227	fprintf(stderr, "Error: cannot open reads file %s for reading\n",
228	quals_file_names[i].c_str());
229	exit(1);
230	}
231	quals_files.push_back(seg_file);
232	}
233	}
234
235	// Only print to standard out the good reads
236	//TODO: a better, more generic read filtering protocol
237	filter_garbage_reads(reads_files, quals_files);
238
239	return 0;
240	}