ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 4
Committed: Mon Mar 22 22:05:23 2010 UTC (14 years, 5 months ago) by gpertea
File size: 23716 byte(s)
Log Message:
initial commit

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5
6 #define USAGE "Usage:\n\
7 fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
8 [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
9 \n\
10 Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
11 optionally collapse duplicates in the input fastq data.\n\
12 \n\
13 Options:\n\
14 -n rename all the reads using the <prefix> followed by a read counter;\n\
15 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
16 the read duplication count\n\
17 -o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
18 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
19 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
20 -3 trim the given adapter sequence at the 3' end of each read\n\
21 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
22 -l minimum \"clean\" length at the high quality 5'end that a read must\n\
23 have in order to pass the filter (default: 16)\n\
24 -r write a simple \"trash report\" file listing the discarded reads with a\n\
25 one letter code for the reason why they were trashed\n\
26 -C collapse duplicate reads and add a prefix with their count to the read\n\
27 name\n\
28 -D apply a low-complexity (dust) filter and discard any read that has at\n\
29 least half of it masked by this\n\
30 -Q quality values in the input data are interpreted as phred64, convert\n\
31 them to phred33\n\
32 "
33 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
34 FILE* f_out=NULL; //stdout if not provided
35 FILE* f_in=NULL; //input fastq (stdin if not provided)
36 FILE* freport=NULL;
37 bool debug=false;
38 bool doCollapse=false;
39 bool doDust=false;
40 bool trashReport=false;
41 bool rawFormat=false;
42 int min_read_len=16;
43 int dust_cutoff=16;
44 bool isfasta=false;
45 bool phred64=false;
46 GStr prefix;
47 GStr adapter3;
48 GStr adapter5;
49 int a3len=0;
50 int a5len=0;
51 // adaptor matching metrics -- for extendMatch() function
52 static const int a_m_score=2; //match score
53 static const int a_mis_score=-3; //mismatch
54 static const int a_dropoff_score=7;
55 static const int a_min_score=7;
56
57 // element in dhash:
58 class FqDupRec {
59 public:
60 int count; //how many of these reads are the same
61 int len; //length of qv
62 char* firstname; //optional, only if we want to keep the original read names
63 char* qv;
64 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
65 len=0;
66 qv=NULL;
67 firstname=NULL;
68 count=0;
69 if (qstr!=NULL) {
70 qv=Gstrdup(qstr->chars());
71 len=qstr->length();
72 count++;
73 }
74 if (rname!=NULL) firstname=Gstrdup(rname);
75 }
76 ~FqDupRec() {
77 GFREE(qv);
78 GFREE(firstname);
79 }
80 void add(GStr& d) { //collapse another record into this one
81 if (d.length()!=len)
82 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
83 count++;
84 for (int i=0;i<len;i++)
85 qv[i]=(qv[i]+d[i])/2;
86 }
87 };
88
89 void openfw(FILE* &f, GArgs& args, char opt) {
90 GStr s=args.getOpt(opt);
91 if (!s.is_empty()) {
92 if (s=='-') f=stdout;
93 else {
94 f=fopen(s,"w");
95 if (f==NULL) GError("Error creating file: %s\n", s.chars());
96 }
97 }
98 }
99
100 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
101 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
102
103 GHash<FqDupRec> dhash; //hash to keep track of duplicates
104
105 bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
106
107 int dust(GStr& seq);
108 bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
109 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
110
111 void convertQ64(char* q, int len);
112 void convertQ64(GStr& q);
113
114 int main(int argc, char * const argv[]) {
115 GArgs args(argc, argv, "QRDCl:d:3:5:n:r:o:");
116 int e;
117 int icounter=0; //counter for input reads
118 int outcounter=0; //counter for output reads
119 if ((e=args.isError())>0) {
120 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
121 exit(224);
122 }
123 debug=(args.getOpt('D')!=NULL);
124 phred64=(args.getOpt('Q')!=NULL);
125 doCollapse=(args.getOpt('C')!=NULL);
126 doDust=(args.getOpt('D')!=NULL);
127 rawFormat=(args.getOpt('R')!=NULL);
128 if (rawFormat) {
129 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
130 }
131 prefix=args.getOpt('n');
132 GStr s=args.getOpt('l');
133 if (!s.is_empty())
134 min_read_len=s.asInt();
135 s=args.getOpt('d');
136 if (!s.is_empty()) {
137 dust_cutoff=s.asInt();
138 doDust=true;
139 }
140
141 if (args.getOpt('3')!=NULL) {
142 adapter3=args.getOpt('3');
143 adapter3.upper();
144 a3len=adapter3.length();
145 }
146 if (args.getOpt('5')!=NULL) {
147 adapter5=args.getOpt('5');
148 adapter5.upper();
149 a5len=adapter5.length();
150 }
151 trashReport= (args.getOpt('r')!=NULL);
152 if (args.startNonOpt()==0) {
153 GMessage(USAGE);
154 exit(224);
155 }
156
157 openfw(f_out, args, 'o');
158 if (trashReport)
159 openfw(freport, args, 'r');
160 char* infile=NULL;
161 while ((infile=args.nextNonOpt())!=NULL) {
162 GStr infname(infile);
163 if (strcmp(infile,"-")==0) {
164 f_in=stdin; infname="stdin"; }
165 else {
166 f_in=fopen(infile,"r");
167 if (f_in==NULL)
168 GError("Cannot open input file %s!\n",infile);
169 }
170 GLineReader fq(f_in);
171 char* l=NULL;
172 while ((l=fq.getLine())!=NULL) {
173 GStr rname; //current read name
174 GStr rseq; //current read sequence
175 GStr rqv; //current read quality values
176 GStr s;
177 if (rawFormat) {
178 //TODO: implement qseq parsing here
179 //if (raw type=N) then continue; //skip invalid/bad records
180
181 } //raw qseq format
182 else { // FASTQ or FASTA
183 isfasta=(l[0]=='>');
184 if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
185 s=l;
186 rname=&(l[1]);
187 icounter++;
188 //GMessage("readname=%s\n",rname.chars());
189 for (int i=0;i<rname.length();i++)
190 if (rname[i]<=' ') { rname.cut(i); break; }
191 //now get the sequence
192 if ((l=fq.getLine())==NULL)
193 GError("Error: unexpected EOF after header for %s\n",rname.chars());
194 rseq=l; //this must be the DNA line
195 if (isfasta) {
196 while ((l=fq.getLine())!=NULL) {
197 //fasta can have multiple sequence lines
198 if (l[0]=='>') {
199 fq.pushBack();
200 break; //
201 }
202 rseq+=l;
203 }
204 } //multi-line fasta file
205 if (!isfasta) {
206 if ((l=fq.getLine())==NULL)
207 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
208 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
209 if ((l=fq.getLine())==NULL)
210 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
211 rqv=l;
212 if (rqv.length()!=rseq.length())
213 GError("Error: qv len != seq len for %s\n", rname.chars());
214 }
215 } //<-- FASTA or FASTQ
216 rseq.upper();
217 int l5=0;
218 int l3=rseq.length()-1;
219 if (l3-l5+1<min_read_len) {
220 if (trashReport) {
221 fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
222 }
223 continue;
224 }
225 if (ntrim(rseq, rqv, l5, l3)) {
226 if (l3-l5+1<min_read_len) {
227 if (trashReport) {
228 fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
229 }
230 continue; //invalid read
231 }
232 //-- keep only the l5..l3 range
233 rseq=rseq.substr(l5, l3-l5+1);
234 if (!rqv.is_empty())
235 rqv=rqv.substr(l5, l3-l5+1);
236 }
237
238 if (a3len>0) {
239 if (trim_adapter3(rseq, l5, l3)) {
240 if (l3-l5+1<min_read_len) {
241 if (trashReport) {
242 fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
243 }
244 continue;
245 }
246 //-- keep only the l5..l3 range
247 rseq=rseq.substr(l5, l3-l5+1);
248 if (!rqv.is_empty())
249 rqv=rqv.substr(l5, l3-l5+1);
250 }//some adapter was trimmed
251 } //adapter trimming
252 if (a5len>0) {
253 if (trim_adapter5(rseq, l5, l3)) {
254 if (l3-l5+1<min_read_len) {
255 if (trashReport) {
256 fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
257 }
258 continue;
259 }
260 //-- keep only the l5..l3 range
261 rseq=rseq.substr(l5, l3-l5+1);
262 if (!rqv.is_empty())
263 rqv=rqv.substr(l5, l3-l5+1);
264 }//some adapter was trimmed
265 } //adapter trimming
266 if (doCollapse) {
267 //keep read for later
268 FqDupRec* dr=dhash.Find(rseq.chars());
269 if (dr==NULL) { //new entry
270 //if (prefix.is_empty())
271 dhash.Add(rseq.chars(),
272 new FqDupRec(&rqv, rname.chars()));
273 //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
274 }
275 else
276 dr->add(rqv);
277 } //collapsing duplicates
278 else { //not collapsing duplicates
279 //do the dust filter now
280 if (doDust) {
281 int dustbases=dust(rseq);
282 if (dustbases>(rseq.length()>>1)) {
283 if (trashReport) {
284 fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
285 }
286 continue;
287 }
288 }
289 //print this record here
290 outcounter++;
291 if (isfasta) {
292 if (prefix.is_empty())
293 fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
294 else
295 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
296 rseq.chars());
297 }
298 else { //fastq
299 if (phred64) convertQ64(rqv);
300 if (prefix.is_empty())
301 fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
302 else
303 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
304 rseq.chars(),rqv.chars() );
305 }
306 } //not collapsing duplicates
307 } //for each fastq record
308 } //while each input file
309 FRCLOSE(f_in);
310 if (doCollapse) {
311 outcounter=0;
312 int maxdup_count=1;
313 char* maxdup_seq=NULL;
314 dhash.startIterate();
315 FqDupRec* qd=NULL;
316 char* seq=NULL;
317 while ((qd=dhash.NextData(seq))!=NULL) {
318 GStr rseq(seq);
319 //do the dusting here
320 if (doDust) {
321 int dustbases=dust(rseq);
322 if (dustbases>(rseq.length()>>1)) {
323 if (trashReport && qd->firstname!=NULL) {
324 fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
325 }
326 continue;
327 }
328 }
329 outcounter++;
330 if (qd->count>maxdup_count) {
331 maxdup_count=qd->count;
332 maxdup_seq=seq;
333 }
334 if (isfasta) {
335 if (prefix.is_empty()) {
336 fprintf(f_out, "@%s_x%d\n%s\n", qd->firstname, qd->count,
337 rseq.chars());
338 }
339 else { //use custom read name
340 fprintf(f_out, "@%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
341 qd->count, rseq.chars());
342 }
343 }
344 else { //fastq format
345 if (phred64) convertQ64(qd->qv, qd->len);
346 if (prefix.is_empty()) {
347 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
348 rseq.chars(), qd->qv);
349 }
350 else { //use custom read name
351 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
352 qd->count, rseq.chars(), qd->qv);
353 }
354 }
355 }//for each element of dhash
356 if (maxdup_count>1) {
357 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
358 }
359 } //report collapsed dhash entries
360 GMessage("Number of input reads: %9d\n", icounter);
361 GMessage(" Output records: %9d\n", outcounter);
362 if (trashReport) {
363 FWCLOSE(freport);
364 }
365
366 FWCLOSE(f_out);
367 }
368
369 class NData {
370 public:
371 int NPos[1024]; //there should be no reads longer than 1K ?
372 int NCount;
373 int end5;
374 int end3;
375 int seqlen;
376 double perc_N; //percentage of Ns in end5..end3 range only!
377 const char* seq;
378 bool valid;
379 NData() {
380 NCount=0;
381 end5=0;
382 end3=0;
383 seq=NULL;
384 perc_N=0;
385 valid=true;
386 }
387 void init(GStr& rseq) {
388 NCount=0;
389 perc_N=0;
390 seqlen=rseq.length();
391 seq=rseq.chars();
392 end5=1;
393 end3=seqlen;
394 for (int i=0;i<seqlen;i++)
395 if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
396 NPos[NCount]=i;
397 NCount++;
398 }
399 perc_N=(NCount*100.0)/seqlen;
400 valid=true;
401 }
402 void N_calc() { //only in the region after trimming
403 int n=0;
404 for (int i=end3-1;i<end5;i++) {
405 if (seq[i]=='N') n++;
406 }
407 perc_N=(n*100.0)/(end5-end3+1);
408 }
409 };
410
411 static NData feat;
412 int perc_lenN=12; // incremental distance from ends, in percentage of
413 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
414
415 void N_analyze(int l5, int l3, int p5, int p3) {
416 /* assumes feat was filled properly */
417 int old_dif, t5,t3,v;
418 if (l3-l5<min_read_len || p5>p3 ) {
419 feat.end5=l5+1;
420 feat.end3=l3+1;
421 return;
422 }
423
424 t5=feat.NPos[p5]-l5;
425 t3=l3-feat.NPos[p3];
426 old_dif=p3-p5;
427 v=(int)((((double)(l3-l5))*perc_lenN)/100);
428 if (v>20) v=20; /* enforce N-search limit for very long reads */
429 if (t5 < v ) {
430 l5=feat.NPos[p5]+1;
431 p5++;
432 }
433 if (t3 < v) {
434 l3=feat.NPos[p3]-1;
435 p3--;
436 }
437 /* restNs=p3-p5; number of Ns in the new CLR */
438 if (p3-p5==old_dif) { /* no change, return */
439 /* don't trim if this may shorten the read too much */
440 //if (l5-l3<min_read_len) return;
441 feat.end5=l5+1;
442 feat.end3=l3+1;
443 return;
444 }
445 else
446 N_analyze(l5,l3, p5,p3);
447 }
448
449
450 bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
451 //count Ns in the sequence
452 feat.init(rseq);
453 l5=feat.end5-1;
454 l3=feat.end3-1;
455 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
456 if (l5==feat.end5-1 && l3==feat.end3-1)
457 return false; //nothing changed
458 l5=feat.end5-1;
459 l3=feat.end3-1;
460 if (l3-l5+1<min_read_len) {
461 feat.valid=false;
462 return true;
463 }
464 feat.N_calc();
465 if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
466 feat.valid=false;
467 l3=l5+1;
468 return true;
469 }
470 return true;
471 }
472
473 //--------------- dust functions ----------------
474 class DNADuster {
475 public:
476 int dustword;
477 int dustwindow;
478 int dustwindow2;
479 int dustcutoff;
480 int mv, iv, jv;
481 int counts[32*32*32];
482 int iis[32*32*32];
483 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
484 dustword=wordsize;
485 dustwindow=winsize;
486 dustwindow2 = (winsize>>1);
487 dustcutoff=cutoff;
488 mv=0;
489 iv=0;
490 jv=0;
491 }
492 void setWindowSize(int value) {
493 dustwindow = value;
494 dustwindow2 = (dustwindow >> 1);
495 }
496 void setWordSize(int value) {
497 dustword=value;
498 }
499 void wo1(int len, const char* s, int ivv) {
500 int i, ii, j, v, t, n, n1, sum;
501 int js, nis;
502 n = 32 * 32 * 32;
503 n1 = n - 1;
504 nis = 0;
505 i = 0;
506 ii = 0;
507 sum = 0;
508 v = 0;
509 for (j=0; j < len; j++, s++) {
510 ii <<= 5;
511 if (*s<=32) {
512 i=0;
513 continue;
514 }
515 ii |= *s - 'A'; //assume uppercase!
516 ii &= n1;
517 i++;
518 if (i >= dustword) {
519 for (js=0; js < nis && iis[js] != ii; js++) ;
520 if (js == nis) {
521 iis[nis] = ii;
522 counts[ii] = 0;
523 nis++;
524 }
525 if ((t = counts[ii]) > 0) {
526 sum += t;
527 v = 10 * sum / j;
528 if (mv < v) {
529 mv = v;
530 iv = ivv;
531 jv = j;
532 }
533 }
534 counts[ii]++;
535 }
536 }
537 }
538
539 int wo(int len, const char* s, int* beg, int* end) {
540 int i, l1;
541 l1 = len - dustword + 1;
542 if (l1 < 0) {
543 *beg = 0;
544 *end = len - 1;
545 return 0;
546 }
547 mv = 0;
548 iv = 0;
549 jv = 0;
550 for (i=0; i < l1; i++) {
551 wo1(len-i, s+i, i);
552 }
553 *beg = iv;
554 *end = iv + jv;
555 return mv;
556 }
557
558 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
559 int i, j, l, a, b, v;
560 if (cutoff==0) cutoff=dustcutoff;
561 a=0;b=0;
562 //GMessage("Dust cutoff=%d\n", cutoff);
563 for (i=0; i < seqlen; i += dustwindow2) {
564 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
565 v = wo(l, seq+i, &a, &b);
566 if (v > cutoff) {
567 //for (j = a; j <= b && j < dustwindow2; j++) {
568 for (j = a; j <= b; j++) {
569 seqmsk[i+j]='N';//could be made lowercase instead
570 }
571 }
572 }
573 //return first;
574 }
575 };
576
577 static DNADuster duster;
578
579 int dust(GStr& rseq) {
580 char* seq=Gstrdup(rseq.chars());
581 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
582 //check the number of Ns:
583 int ncount=0;
584 for (int i=0;i<rseq.length();i++) {
585 if (seq[i]=='N') ncount++;
586 }
587 GFREE(seq);
588 return ncount;
589 }
590
591 // ------------------ adapter matching - triplet matching
592 //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
593 // or the last 9 nucleotides for the 5' adapter
594 //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
595 // check minimum score and
596 // for 3' adapter trimming:
597 // require that the right end of the alignment for either the adaptor OR the read must be
598 // < 3 distance from its right end
599 // for 5' adapter trimming:
600 // require that the left end of the alignment for either the adaptor OR the read must
601 // be at coordinate 0
602
603 bool extendMatch(const char* a, int alen, int ai,
604 const char* b, int blen, int bi, int mlen, int& l5, int& l3) {
605 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
606 //GStr dbg(a);
607 //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
608 int a_l=ai; //alignment coordinates on a
609 int a_r=ai+mlen-1;
610 int b_l=bi; //alignment coordinates on b
611 int b_r=bi+mlen-1;
612 int ai_maxscore=ai;
613 int bi_maxscore=bi;
614 int score=mlen*a_m_score;
615 int maxscore=score;
616 int mism5score=a_mis_score-2; //increase penalty for mismatches at 5' end
617 //try to extend to the left first, if possible
618 while (ai>0 && bi>0) {
619 ai--;
620 bi--;
621 score+= (a[ai]==b[bi])? a_m_score : mism5score;
622 if (score>maxscore) {
623 ai_maxscore=ai;
624 bi_maxscore=bi;
625 maxscore=score;
626 }
627 else if (maxscore-score>a_dropoff_score) break;
628 }
629 a_l=ai_maxscore;
630 b_l=bi_maxscore;
631 //now extend to the right
632 ai_maxscore=a_r;
633 bi_maxscore=b_r;
634 ai=a_r;
635 bi=b_r;
636 score=maxscore;
637 //sometimes there are extra AAAAs at the end of the read, ignore those
638 if (strcmp(&a[alen-4],"AAAA")==0) {
639 alen-=3;
640 while (a[alen-1]=='A' && alen>ai) alen--;
641 }
642 while (ai<alen-1 && bi<blen-1) {
643 ai++;
644 bi++;
645 //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
646 if (a[ai]==b[bi]) { //match
647 score+=a_m_score;
648 if (ai>=alen-2) {
649 score+=a_m_score-(alen-ai-1);
650 }
651 }
652 else { //mismatch
653 score+=a_mis_score;
654 }
655 if (score>maxscore) {
656 ai_maxscore=ai;
657 bi_maxscore=bi;
658 maxscore=score;
659 }
660 else if (maxscore-score>a_dropoff_score) break;
661 }
662 a_r=ai_maxscore;
663 b_r=bi_maxscore;
664 if (maxscore>=a_min_score && (alen-a_r<3 || blen-b_r<3 || a_l<2 || b_l<2)) {
665 if (a_l<alen-a_r-1) {
666 //adapter closer to the left end (typical for 5' adapter)
667 l5=a_r+1;
668 l3=alen-1;
669 }
670 else {
671 //adapter matching at the right end (typical for 3' adapter)
672 l5=0;
673 l3=a_l-1;
674 }
675 return true;
676 }
677 return false;
678 }
679
680 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
681 int rlen=seq.length();
682 l5=0;
683 l3=rlen-1;
684 //first try a full match, we might get lucky
685 int fi=-1;
686 if ((fi=seq.index(adapter3))>=0) {
687 if (fi<rlen-fi-a3len) {//match is closer to the right end
688 l5=fi+a3len;
689 l3=rlen-1;
690 }
691 else {
692 l5=0;
693 l3=fi-1;
694 }
695 return true;
696 }
697 //also, for fast detection of other adapter-only reads that start past
698 // the beginning of the adapter sequence, try to see if the first a3len-4
699 // characters of the read are a substring of the adapter
700 GStr rstart=seq.substr(0,a3len-4);
701 if ((fi=adapter3.index(rstart))>=0) {
702 l3=rlen-1;
703 l5=a3len-4;
704 while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
705 return true;
706 }
707
708 //another easy case: first half of the adaptor matches
709 int a3hlen=a3len>>1;
710 GStr ahalf=adapter3.substr(0, a3hlen);
711 if ((fi=seq.index(ahalf))>=0) {
712 extendMatch(seq.chars(), rlen, fi,
713 adapter3.chars(), a3len, 0, a3hlen, l5,l3);
714 return true;
715 }
716 //no easy cases, so let's do the word hashing
717 for (int iw=0;iw<6;iw++) {
718 GStr aword=adapter3.substr(iw,3);
719 if ((fi=seq.index(aword))>=0 && rlen-fi>3) {
720 if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
721 a3len, iw, 3, l5,l3)) return true;
722 }
723 }
724 return false; //no adapter parts found
725 }
726
727 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
728 int rlen=seq.length();
729 l5=0;
730 l3=rlen-1;
731 //try to see if adapter is fully included in the read
732 int fi=-1;
733 if ((fi=seq.index(adapter5))>=0) {
734 if (fi<rlen-fi-a5len) {//match is closer to the right end
735 l5=fi+a5len;
736 l3=rlen-1;
737 }
738 else {
739 l5=0;
740 l3=fi-1;
741 }
742 return true;
743 }
744 //for fast detection of adapter-rich reads, check if the first 12
745 //characters of the read are a substring of the adapter
746 GStr rstart=seq.substr(1,12);
747 if ((fi=adapter5.index(rstart))>=0) {
748 //l3=rlen-1;
749 //l5=a5len-4;
750 //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
751 //return true;
752 extendMatch(seq.chars(), rlen, 1,
753 adapter5.chars(), a5len, fi, 12, l5,l3);
754 return true;
755 }
756
757 //another easy case: last 12 characters of the adaptor found as a substring of the read
758 int aplen=12;
759 int apstart=a5len-aplen-2;
760 if (apstart<0) { apstart=0; aplen=a5len-1; }
761 GStr bstr=adapter5.substr(apstart, aplen);
762 if ((fi=seq.index(bstr))>=0) {
763 extendMatch(seq.chars(), rlen, fi,
764 adapter5.chars(), a5len, apstart, aplen, l5,l3);
765 return true;
766 }
767 //no easy cases, find a triplet match as a seed for alignment extension
768 //find triplets at the right end of the adapter
769 for (int iw=0;iw<6;iw++) {
770 apstart=a5len-iw-4;
771 GStr aword=adapter5.substr(apstart,3);
772 if ((fi=seq.index(aword))>=0) {
773 if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
774 a5len, apstart, 3, l5,l3)) {
775 return true;
776 }
777 }
778 }
779 return false; //no adapter parts found
780 }
781
782 //conversion of phred64 to phread33
783 void convertQ64(GStr& q) {
784 for (int i=0;i<q.length();i++) q[i]-=31;
785 }
786
787 void convertQ64(char* q, int len) {
788 for (int i=0;i<len;i++) q[i]-=31;
789 }