ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/trimpoly.c
Revision: 14
Committed: Mon Jul 18 20:36:48 2011 UTC (13 years, 2 months ago) by gpertea
File size: 26850 byte(s)
Log Message:
added old trimpoly code just for archive

Line File contents
1 /* takes EST sequences from stdin only
2 */
3
4 #include <stdlib.h>
5 #include <stdio.h>
6 #include <string.h>
7 #include "getopt.h"
8 #include "trimpoly.h"
9 #include "color_c.h"
10 #include "ctype.h"
11
12 #define USAGE "Usage: \n\
13 trimpoly [-h] [-n <percN>] [-z <min_read_len>] [-y <Nperc>] [-m <percdist>] \\ \n\
14 [-PN] [-e <end>] [-r <minrun>] [-s <minscore>] [-u <maxgap>] \\ \n\
15 [-l <percdist>|-L <range>] [-X][-D] \n\
16 \n\
17 Trims polyA/polyT and Ns from the end of the FASTA DNA sequences read\n\
18 from standard input.\n\
19 Outputs the following columns, tab-delimited:\n\
20 seq_name, perc_N, end5, end3, initial_length \n\
21 \n\
22 -n try the N-trimming if percent of N is above <percN>\n\
23 this option MUST be present if you want to enable N-trimming!\n\
24 Unless -A or -N switches are present, N trimming is done before \n\
25 polyA/T trimming\n\
26 -y try to extract from a sequence the maximum range having \n\
27 percent of N lower than <Nperc> (default: 3.00)\n\
28 -m <percdist> - incremental distance from ends, in percentage of\n\
29 sequence length, where N-trimming is done (default:12) \n\
30 The resulting distance is auto-limited to 30nt. max\n\
31 -N if -n flag was specified, do only N-trimming (skip the \n\
32 polyA/T trimming)\n\
33 -P do the polyA/T trimming first and then the N trimming;\n\
34 only makes sense if -n option is present.\n\
35 Poly A/T trimming:\n\
36 -e <end> restrict the poly trimming to one end only or to the \n\
37 maximum scoring poly match\n\
38 (default is: trim both ends)\n\
39 if <end> is 3 - trim only polyA from end3\n\
40 if <end> is 5 - trim only polyT from end5\n\
41 if <end> is x - only the maximum scoring of end3/end5 will be trimmed\n\
42 (if -X option is used, polyT/polyA will be swapped)\n\
43 -s <minscore> minimum score for poly A/T trimming (default 7)\n\
44 it is dynamically increased by 1 for every 10nt distance from the ends\n\
45 (scoring: +1 for each A/T, -2 for each intervening nucleotide) \n\
46 -r <minrun> minimum count of consecutive A/T (default 5)\n\
47 to start extending and computing score from\n\
48 -u <extstop> the count of non-matching nucleotides\n\
49 that will determine a stop in extending a match (default 4)\n\
50 -l incremental distance from ends, in percentage of length,\n\
51 where polyA/T trimming is allowed (default:15)\n\
52 -L fixed distance (in nucleotides) from both ends where\n\
53 polyA/T trimming is allowed (overrides -l)\n\
54 -o <file> outputs a multi FASTA file containing only the trimmed\n\
55 sequences \n\
56 -R relative shifting for end3, end5 will be displayed \n\
57 in the tabbed output instead of absolute coordinates\n\
58 -H parse starting end5, end3 coordinates from the FASTA file \n\
59 (expect to find them after sequence name, tab delimited)\n\
60 -X inverted trimming: trim polyA from end5 and polyT from end3\n\
61 -Y try inverted trimming if no poly(A/T) was trimmed in the first pass\n\
62 -D debug display mode: sequences are shown trimmed, colored\n\
63 using console codes (trimmed parts have a red background)\n"
64
65 #define MIN_PERC_LEN 30
66 /*
67 trimpoly -n2 -m5
68 seems to be a good way to trim ESTs with possible 'dirty' ends
69 */
70
71 typedef struct feature_t {
72 char* seq_name;
73 char* defline;
74 char* seq; /* sequence, upper case, spaces, tabs, newlines are removed */
75 int seqlen;
76 double perc_N;
77 int end5;
78 int end3;
79 int* NPos; /* position of Ns in the sequence - dynamically alocated array */
80 int NCount;
81 } feature_t;
82
83 struct feature_t feat; /* current sequence data being analysed */
84
85 /* global options/flags: */
86 int min_read_len=32;
87 int skip_poly=0; /* -N */
88 int poly_first=0; /* -P */
89 int perc_lenN=12; /* -m */
90 double perc_N=0; /* -n */
91 double nsave_limit=3.00; /* -y */
92 int trim_end=0; /* -e */
93 int min_score=7; /* -s */
94 int min_run=5; /* -r */
95 int max_gap=4; /* -u */
96 int perc_len=15; /* -l */
97 int swap_poly=0; /* -X, -Y */
98 int c_show=0; /* -D */
99 char outfile[256]; /* -o */
100 int do_write=0; /* -o */
101 int relative_shift=0; /* -R */
102 int init_ends=0; /* -H */
103 int fixed_len=0;
104 char* seed_5=SEED_T;
105 char* seed_3=SEED_A;
106 char chr_5='T';
107 char chr_3='A';
108 int first_pass=1;
109
110 void optErr(char opt);
111 void showUsage();
112 char *seq_copy(char *dest, char *src, int *seqsize);
113 char* lstrstr(char* lstart, char *rend, char* substr);
114 char* rstrstr(char* rstart, char *lend, char* substr);
115
116 /* these funcs are acting directly on feat structure */
117 int analyse3();
118 int analyse5();
119 void N_trim();
120 void N_analyse(int l5, int l3, int p5, int p3);
121 void N_save(int l5, int l3);
122 double N_calc(int e5, int e3, int* percN);
123
124 int main(int argc, char * const argv[]) {
125 int c,wpos, init_end5=0, init_end3=0, score1=0, score2=0, save5, save3;
126 int alloc_size;
127 int seq_size=0, readlen;
128 char *p, *p_end, *bufpos;
129 char seq_name[256];
130 char def_line[8192];
131 char line[121];
132 FILE * fout=NULL;
133 char inbuf[BUF_LEN]; /* incoming buffer for sequence lines. */
134 char * seq; /* working buffer */
135 alloc_size=BUF_LEN;
136 seq=malloc(BUF_LEN); /* initial allocation for sequence buffer */
137 while ((c=getopt(argc, argv, "hNPXYDRHz:m:n:e:r:s:l:L:o:u:y:")) > 0) {
138 switch (c)
139 { case 'N': skip_poly=1;break;
140 case 'P': poly_first=1;break;
141 case 'm': perc_lenN=strtol(optarg, (char **)NULL, 10);
142 if (perc_lenN==0) optErr('m');
143 break;
144 case 'z': min_read_len=strtol(optarg, (char **)NULL, 10);
145 if (min_read_len<5) optErr('z');
146 break;
147 case 'n': perc_N=strtod(optarg, (char **)NULL);
148 if (perc_N==0) optErr('n');
149 break;
150 case 'y': nsave_limit=strtod(optarg, (char **)NULL);
151 if (nsave_limit==0) optErr('y');
152 break;
153 case 'e': if (optarg[0]=='x') {
154 trim_end=1;
155 break;
156 }
157 trim_end=strtol(optarg, (char **)NULL, 10);
158 if ((trim_end!=3 && trim_end!=5) || trim_end==0)
159 optErr('e');
160 break;
161 case 'r': min_run=strtol(optarg, (char **)NULL, 10);
162 if (min_run<=1) optErr('r');
163 break;
164 case 'u': max_gap=strtol(optarg, (char **)NULL, 10);
165 if (max_gap<=1) optErr('u');
166 break;
167 case 's': min_score=strtol(optarg, (char **)NULL, 10);
168 if (min_score==0) optErr('s');
169 break;
170 case 'l': perc_len=strtol(optarg, (char **)NULL, 10);
171 if (perc_len==0) optErr('l');
172 break;
173 case 'L': fixed_len=strtol(optarg, (char **)NULL, 10);
174 if (fixed_len==0) optErr('L');
175 break;
176 case 'o': strcpy(outfile, optarg);
177 if (strlen(outfile)==0)
178 optErr('o');
179 else do_write=1;
180 break;
181 case 'X': swap_poly=1;break;
182 case 'Y': swap_poly=2;break;
183 case 'H': init_ends=1;break;
184 case 'D': c_show=1;break;
185 case 'h': showUsage();exit(1);
186 case 'R': relative_shift=1;
187 }
188 } /* while options */
189
190 if (swap_poly==1) {
191 seed_5=SEED_A;
192 seed_3=SEED_T;
193 chr_5='A';
194 chr_3='T';
195 }
196 bufpos=seq;
197 if (do_write) {
198 if ((fout=fopen(outfile,"w"))==NULL) optErr('o');
199 }
200 for (;;) {
201 p_end=fgets(inbuf, BUF_LEN-1,stdin);
202 /* basic sanity check: */
203 /*if (!p_end) { //wtf was I thinking? what if it's eof?
204 p=strchr(inbuf, '\n');
205 if (!p || *(p+1)!='\0') {
206 fprintf(stderr, "fgets error! Line too long (%s)?!\n",inbuf);
207 exit(1);
208 }
209 }*/
210 /* is this a sequence name? */
211 p=strchr(inbuf, '>');
212 if (p || !p_end) { /* starts new sequence, accumulated old one */
213 if (bufpos!=seq) { /* close and analyse existing previous sequence */
214 /* printf("Final: seqsize=%d\n",seq_size); */
215 feat.seq=seq;
216 feat.seqlen=seq_size;
217 feat.seq_name=seq_name;
218 if (init_ends) {/* parsed initial coordinates */
219 feat.end5=init_end5;
220 feat.end3=init_end3;
221 }
222 else {
223 feat.end5=1;
224 feat.end3=feat.seqlen;
225 }
226 if (skip_poly)
227 N_trim(); /* only test for N_trim */
228 else {
229 if (poly_first==0) N_trim();
230 save5=feat.end5;
231 save3=feat.end3;
232 if (trim_end!=5) score1=analyse3();
233 if (trim_end!=3) score2=analyse5();
234 if (swap_poly==2 && save5==feat.end5 &&
235 save3==feat.end3) {
236 /* printf("here: Score1=%d, Score2=%d\n", score1, score2); */
237 seed_5=SEED_A;
238 seed_3=SEED_T;
239 chr_5='A';
240 chr_3='T';
241 if (trim_end!=5) score1=analyse3();
242 if (trim_end!=3) score2=analyse5();
243 seed_5=SEED_T;
244 seed_3=SEED_A;
245 chr_5='T';
246 chr_3='A';
247 }
248 /* printf("Score1=%d, Score2=%d\n", score1, score2); */
249 if (trim_end==1) {
250 /* printf("score5=%d, score3=%d\n", score2,score1); */
251 if (score1<score2) feat.end3=save3;
252 else feat.end5=save5;
253 }
254 if (poly_first!=0) N_trim();
255 }
256 if (relative_shift) {
257 if (init_ends)
258 printf("%s\t\%4.2f\t%d\t%d\t%d\n",
259 seq_name, feat.perc_N, feat.end5-init_end5,init_end3-feat.end3,
260 feat.seqlen);
261 else
262 printf("%s\t\%4.2f\t%d\t%d\t%d\n",
263 seq_name, feat.perc_N, feat.end5-1,feat.seqlen-feat.end3,
264 feat.seqlen);
265 }
266 else
267 printf("%s\t\%4.2f\t%d\t%d\t%d\n",
268 seq_name, feat.perc_N, feat.end5,feat.end3,
269 feat.seqlen);
270 if (c_show)
271 c_print_trim(seq, feat.end5, feat.end3);
272
273 if (do_write && feat.perc_N<3.00 && feat.end3-feat.end5>=min_read_len-1) {
274 /* write only those having length>=100 and perc_N<3*/
275 if (init_ends) fprintf(fout,">%s\n",seq_name);
276 else fprintf(fout,"%s",def_line);
277 wpos=feat.end5-1;
278 while (feat.end3-wpos>=60) {
279 strncpy(line,seq+wpos,60);line[60]='\0';
280 fprintf(fout,"%s\n",line);
281 wpos+=60;
282 }
283 if (feat.end3-wpos>0) {
284 strncpy(line,seq+wpos,feat.end3-wpos);
285 line[feat.end3-wpos]='\0';
286 fprintf(fout,"%s\n",line);
287 }
288 }
289 }
290 if (!p_end) break; /* exit on end-of-file */
291 /* parse sequence name and, eventually, init coordinates */
292 strcpy(def_line, p); /* keep defline there for -o output */
293 for (p_end=p+1;(*p_end)!='\0' && !isspace(*p_end); p_end++);
294 strncpy(seq_name, p+1,p_end-p-1);
295 seq_name[p_end-p-1]='\0';
296 if (init_ends) { /* tab delimited fields in def_line*/
297 /* after p_end (\t) there are init values */
298 p_end=strchr(p,'\t');
299 if (p_end==NULL) optErr('H');
300 p_end++;
301 p=strchr(p_end,'\t');
302 if (p==NULL) optErr('H');
303 *p='\0';
304 init_end5=strtol(p_end, (char **)NULL, 10);
305 if (init_end5==0) optErr('H');
306 p++;
307 p_end=strchr(p,'\n');
308 if (!p_end) p_end=strchr(p,'\t');
309 if (p_end==NULL) optErr('H');
310 *p_end='\0';
311 init_end3=strtol(p, (char **)NULL, 10);
312 if (init_end3==0 || init_end3<init_end5) optErr('H');
313 }
314 seq_size=0;
315 bufpos=seq; /* no need to reallocate the buffer for each sequence..
316 let it stay at maximum all along */
317 }
318 else { /* extending current sequence : */
319 readlen=strlen(inbuf);
320 if (seq_size+readlen>alloc_size) { /* must realloc, it's gonna blow */
321 alloc_size+=BUF_LEN;
322 seq=realloc(seq, alloc_size);
323 /* printf("Realloc! seqsize=%d, readlen=%d\n",seq_size, readlen); */
324 bufpos=seq+seq_size;
325 }
326 bufpos=seq_copy(bufpos,inbuf, &seq_size);
327 /* append, removing newline chars and incrementing seq_size accordingly */
328 }
329 }/* for input */
330 free(seq);
331 if (do_write) fclose(fout);
332 return 0;
333 }
334
335 char* seq_copy(char *dest, char *src, int *seq_size) {
336 /* assume destination already has room for all the chars in src!
337 skip any blanks or newlines on copying
338 RETURN: the position immediately after the last char copied!
339 */
340 int i,j;
341 j=0;
342 for (i=0;(src[i]!='\0');i++) {
343 if (src[i]==' ' || src[i]=='\n'|| src[i]=='\t') continue;
344 /* dest[j]=toupper(src[i]);j++; */
345 dest[j]=src[i];j++;
346 }
347 dest[j]='\0';
348 (*seq_size) += j;
349 return &dest[j];
350 }
351
352
353
354 /*********************************************************************/
355 /* Helper functions */
356 /*********************************************************************/
357
358
359
360 void optErr(char opt) {
361 showUsage();
362 fprintf(stderr, "\nIncorrect value specified for -%c option!\n", opt);
363 exit(1);
364 }
365
366 void showUsage() {
367 fprintf(stderr, USAGE);
368 }
369
370
371 char* lstrstr(char* lstart, char *rend, char* substr) { /*like strstr, but starts searching
372 from lstart, going up to rend and returns a pointer to the first (left)
373 matching character in str */
374 char *p;
375 int l,i;
376 l=strlen(substr);
377 p=lstart;
378 while (p+l<rend) {
379 for (i=0;i<l;i++) if (toupper(*(p+i)) != toupper(*(substr+i))) break;
380 if (i==l) return p;
381 p++;
382 }
383 return NULL;
384 }
385
386
387 char* rstrstr(char* rstart, char *lend, char* substr) { /*like strstr, but starts searching
388 from right end, going up to lend and returns a pointer to the last (right)
389 matching character in str */
390 char *p;
391 int l,i;
392 l=strlen(substr);
393 p=rstart-l+1;
394 while (p>=lend) {
395 for (i=0;i<l;i++) if (toupper(*(p+i)) != toupper(*(substr+i))) break;
396 if (i==l) return p+l-1;
397 p--;
398 }
399 return NULL;
400 }
401
402 /*=================================================*/
403
404 int analyse5() { /*== end5 trim (polyT if no -X) ======*/
405 int gaplen, locmaxpos,
406 locmaxscore, locminscore, run, locmaxrun, v;
407 char *p;
408 int score;
409 int first_pass=0;
410 int l5=0; /* offset from current end5 */
411 int norange=1;
412 int maxhit_score=0;
413 int maxhit_pos=0;
414 int startpos=0;
415 int pos5=l5; /* distance to end5 */
416 char *seqstart=feat.seq+feat.end5-1;
417 char *seqend=feat.seq+feat.end3-1;
418 while (pos5<feat.end3-feat.end5 &&
419 (p=lstrstr(seqstart+pos5, seqend, seed_5))!=NULL &&
420 (p-seqstart)<(feat.end3-feat.end5)/2 /* while hit is in 5' half! */
421 ) { /* big loop, going through all possible hits */
422 startpos=p-seqstart; /*offset from end5 */
423 locmaxscore=0;
424 locmaxpos=l5;
425 run=0;
426 locmaxrun=0;
427 p+=SEED_LEN;
428 /*
429 printf("Seed found: pos5=%d, (p-feat.seq)=%d, *p=%c%c%c%c%c\n",
430 pos5, p-feat.seq, *p,*(p+1),*(p+2),*(p+3),*(p+4));
431 */
432 score=SEED_LEN;
433 run=SEED_LEN;
434 gaplen=0;
435 while (gaplen<max_gap && score>0 && p+gaplen<=seqend) {
436 if (toupper(p[gaplen])==chr_5) {
437 if (gaplen) { /* new poly run after a gap */
438 p+=gaplen;
439 score-=gaplen*2; /* penalty */
440 gaplen=0;
441 }
442 p++;
443 run++;
444 score++;
445 }
446 else { /* non-match */
447 if (gaplen==0) { /* new gap starting or end of sequence*/
448 if (run>locmaxrun) locmaxrun=run;
449 run=0;
450 if (score>=locmaxscore) { /* check locmax! */
451 locmaxscore=score;
452 locmaxpos=(p-seqstart); /* offset from end5 */
453 }
454 }
455 gaplen++;
456 }
457 /*
458 printf("T loop: l5=%d, gaplen=%d, score=%d, locmaxpos=%d, locmaxscore=%d locmaxrun=%d\n (p-feat.seq)=%d, *p=%c%c%c%c%c\n",
459 l5, gaplen, score, locmaxpos, locmaxscore, locmaxrun, p-feat.seq, *p,*(p+1),*(p+2),*(p+3),*(p+4));
460 */
461 }/* while */
462 if (run>locmaxrun) locmaxrun=run;
463 if (score>=locmaxscore) { /* check locmax! */
464 locmaxscore=score;
465 locmaxpos=(p-seqstart); /* offset from end5 */
466 }
467 pos5=locmaxpos;
468 locminscore=min_score+startpos/10; /* min_score is dynamically adjusted
469 based on the hit distance to the ends */
470 /*
471 printf("feat.end3=%d, feat.end5=%d, l5=%d, startpos=%d, locmaxpos=%d, locmaxscore=%d, locmaxrun=%d\n",
472 feat.end3, feat.end5, l5, startpos, locmaxpos, locmaxscore, locmaxrun);
473
474 */
475 if (locmaxscore>=locminscore && locmaxrun>=min_run) { /* potential hit */
476 norange=1;
477 if (fixed_len) {
478 if (startpos<=fixed_len) {
479 l5=locmaxpos;
480 maxhit_score=locmaxscore;
481 maxhit_pos=startpos;
482 norange=0;
483 }
484 else break; /* strict fixed range criteria */
485 }
486 else { /* percentual distance test: assess distance to previous hit */
487 v=(feat.end3-feat.end5-l5)*perc_len/100;
488 if ((first_pass && v<MIN_PERC_LEN)) {
489 v=MIN_PERC_LEN;
490 first_pass=0;
491 }
492 if (startpos-l5<v) { /* hit in allowed range */
493 l5=locmaxpos;
494 maxhit_score=locmaxscore;
495 maxhit_pos=startpos;
496 norange=0;
497 }
498 }
499 /* test for any huge hits in the first half */
500 if (norange && l5!=locmaxpos &&
501 (locmaxscore>30 && startpos+(locmaxpos-startpos)/2 < (feat.end3-feat.end5)/2
502 || (feat.end3-feat.end5)-(locmaxpos-startpos)<min_read_len) )
503 {
504 l5=locmaxpos;
505 maxhit_score=locmaxscore;
506 maxhit_pos=startpos;
507 }
508 }
509 } /* big while loop */
510 feat.end5+=l5;
511 return maxhit_score*10-maxhit_pos;
512 }
513
514
515
516 int analyse3() { /*=== end3 search: polyA (if no -X) =============== */
517 int gaplen, score, locmaxpos,
518 locmaxscore, locminscore, run, locmaxrun, v;
519 char *p;
520 int first_pass=0;
521 int norange;
522 char *seqstart=feat.seq+feat.end5-1;
523 char *seqend=feat.seq+feat.end3-1;
524 int l3=0; /*offset from end3 */
525 int pos3=l3; /*distance to end3 */
526 int startpos=0;
527 int maxhit_score=0;
528 int maxhit_pos=0;
529 while (pos3<feat.end3-feat.end5 &&
530 (p=rstrstr(seqend-pos3, seqstart, seed_3))!=NULL &&
531 (seqend-p)<(feat.end3-feat.end5)/2 /* while hit is in 3' half! */
532 ) { /* big loop, going through all possible hits */
533 /* printf("feat.end3=%d, pos3=%d, startpos=%d\n",feat.end3,pos3, seqend-p); */
534 startpos=seqend-p; /*distance to seqend */
535 locmaxscore=0;
536 locmaxpos=l3;
537 run=0;
538 locmaxrun=0;
539 /*
540 printf("Seed found *p=%c%c%c%c%c%c%c\n",
541 *p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5),*(p+6));
542 */
543 p-=SEED_LEN;
544 /*
545 printf("After seed *p=%c%c%c%c%c%c%c\n",
546 *p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5),*(p+6));
547 */
548 /*
549 printf("Seed found: l5=%d, (p-feat.seq)=%d, *p=%c%c%c%c%c\n",
550 l5, p-feat.seq, *p,*(p+1),*(p+2),*(p+3),*(p+4) );
551 */
552 score=SEED_LEN;
553 run=SEED_LEN;
554 gaplen=0;
555 while (gaplen<max_gap && score>0 && p-gaplen>=seqstart) {
556 /*printf("A loop: l3=%d, gaplen=%d, score=%d, locmaxpos=%d, locmaxscore=%d, locmaxrun=%d\n \
557 (p-feat.seq)=%d, *p=%c%c%c%c%c%c%c\n",
558 l3, gaplen, score, locmaxpos, locmaxscore, locmaxrun,
559 p-feat.seq, *(p-gaplen),*(p-gaplen+1),*(p-gaplen+2),*(p-gaplen+3),*(p-gaplen+4),*(p-gaplen+5),
560 *(p-gaplen+6));
561 */
562 if (toupper(*(p-gaplen))==chr_3) {
563 if (gaplen) { /* new poly run after a gap */
564 p-=gaplen;
565 score-=(gaplen<<1); /* penalty */
566 gaplen=0;
567 }
568 p--;
569 run++;
570 score++;
571 /*
572 printf("match: score=%d, *p=%c%c%c%c%c%c%c\n",score,
573 *p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5),*(p+6));
574 */
575 }
576 else { /* non-match*/
577 if (gaplen==0) { /* new gap */
578 if (run>locmaxrun) locmaxrun=run;
579 run=0;
580 if (score>=locmaxscore) { /* check locmax! */
581 locmaxscore=score;
582 locmaxpos=(seqend-p); /* distance to end3 */
583 }
584 }
585 gaplen++; /*extend gap */
586 }
587
588
589 }/* while */
590 if (run>locmaxrun) locmaxrun=run;
591 if (score>=locmaxscore) { /* check locmax! */
592 locmaxscore=score;
593 locmaxpos=(seqend-p); /* distance to end3 */
594 }
595 pos3=locmaxpos;
596 locminscore=min_score+startpos/10; /* min_score is dynamically adjusted
597 based on the hit distance to the ends */
598 if (locmaxscore>=locminscore && locmaxrun>=min_run) { /* potential hit */
599 norange=1;
600 if (fixed_len) {
601 if (startpos<=fixed_len) {
602 l3=locmaxpos;
603 maxhit_score=locmaxscore;
604 maxhit_pos=startpos;
605 norange=0;
606 }
607 else break; /* strict fixed range criteria */
608 }
609 else {
610 v=((feat.end3-l3-feat.end5)*perc_len/100);
611 if (first_pass && v<MIN_PERC_LEN) {
612 v=MIN_PERC_LEN;
613 first_pass=0;
614 }
615 if (startpos-l3<v) {
616 l3=locmaxpos;
617 maxhit_score=locmaxscore;
618 maxhit_pos=startpos;
619 norange=0;
620 }
621 }
622 /* instead, just aim for big hits in the 3' half */
623 if (norange && l3!=locmaxpos &&
624 ( (locmaxscore>30 && startpos+(locmaxpos-startpos)/2 < (feat.end3-feat.end5)/2)
625 || (feat.end3-feat.end5)-(locmaxpos-startpos)<min_read_len) ) {
626 l3=locmaxpos;
627 maxhit_score=locmaxscore;
628 maxhit_pos=startpos;
629 }
630 } /* potential hit analysis */
631 } /* big while loop */
632 feat.end3-=l3;
633 return maxhit_score*10-maxhit_pos;
634 }
635
636
637 double N_calc(int e5, int e3, int* Ncount) {
638 /* computes the percent and count of Ns; updates feat structure */
639 int i,ch;
640 double result;
641 *Ncount=0;
642 for (i=e5;i<=e3;i++) {
643 ch=toupper(feat.seq[i]);
644 if (ch!='A' && ch!='C' && ch!='G' && ch!='T')
645 (*Ncount)++;
646 }
647 result = ((double)(100*(*Ncount)))/(e3-e5+1);
648 return result;
649 }
650
651 void N_trim() {
652 /* computes initial perc_N and fill N_Pos
653 call N_analyse as apropriate (according to switches)
654 */
655 int i,j,ch;
656 feat.perc_N=N_calc(feat.end5-1, feat.end3-1, &feat.NCount);
657 /* printf("entering N trim: %f compare to %f\n",feat.perc_N, perc_N); */
658 if (perc_N!=0 && feat.perc_N>perc_N) {
659 j=0;
660 feat.NPos=malloc(feat.NCount*sizeof(int));
661 for (i=feat.end5-1;i<feat.end3;i++) {
662 ch=toupper(feat.seq[i]);
663 if (ch!='A' && ch!='C'
664 && ch!='G' && ch!='T') {
665 feat.NPos[j]=i;
666 j++;
667 }
668 }
669 N_analyse(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
670 /* compute perc_N again after trimming! */
671 feat.perc_N=N_calc(feat.end5-1, feat.end3-1, &feat.NCount);
672 if (feat.perc_N >= nsave_limit && feat.end3-feat.end5>=99) {
673 /* try to save this at any cost: */
674 N_save(feat.end5-1, feat.end3-1);
675 /* this will also update feat.perc_N and feat.NCount */
676 }
677 free(feat.NPos);
678 }
679 }
680
681 void N_analyse(int l5, int l3, int p5, int p3) {
682 /* assumes feat was filled properly */
683 int old_dif, t5,t3,v;
684 if (l3-l5<min_read_len || p5>p3 ) {
685 feat.end5=l5+1;
686 feat.end3=l3+1;
687 return;
688 }
689
690 t5=feat.NPos[p5]-l5;
691 t3=l3-feat.NPos[p3];
692 old_dif=p3-p5;
693 v=(int)((((double)(l3-l5))*perc_lenN)/100);
694 if (v>30) v=30; /* enforce limit for long ESTs */
695 if (t5 < v ) {
696 l5=feat.NPos[p5]+1;
697 p5++;
698 }
699 if (t3 < v) {
700 l3=feat.NPos[p3]-1;
701 p3--;
702 }
703 /* restNs=p3-p5; number of Ns in the new CLR */
704 if (p3-p5==old_dif) { /* no change, return */
705 /* don't trim if this may shorten < 100 */
706 if (l5-l3<min_read_len) return;
707 feat.end5=l5+1;
708 feat.end3=l3+1;
709 }
710 else
711 N_analyse(l5,l3, p5,p3);
712 }
713
714 void N_save(int l5, int l3) {
715 /* find the maximum range having percN<3 */
716 /* needs feat.NPos[], feat.NCount to be set before*/
717 int countN,i,p5,p3, max_extent, start3, max_5, max_3, e3, e5, max_countN=0;
718 double percN=0,max_percN=0;
719 for (i=0;feat.NPos[i]<=feat.end5-1;i++);
720 p5=i-1; /* found first N's position in the selected range */
721 for (i=feat.NCount-1;feat.NPos[i]>=feat.end3-1;i--);
722 p3=i+1; /* found last N's position in the selected range */
723 if (l3-l5<min_read_len || p5>p3 ) return;
724 e5=p5;e3=p3;
725 start3=l3;
726 max_extent=99;max_3=0;max_5=0;
727 while (l3-l5>max_extent /* && e3>=e5 */) { /* outer end5 loop */
728 e3=p3;
729 percN=N_calc(l5,l3, &countN);
730 while (percN>=nsave_limit && l3-l5>max_extent) { /* e3 loop -- decreases until percN<3 */
731 /* printf("l5=%d, l3=%d; e5=%d (%d), e3=%d(%d), percN=%f, %d ? %d\n",l5,l3,e5,
732 feat.NPos[e5], e3, feat.NPos[e3], percN, l3-l5, max_extent);
733 */
734 e3--;
735 while (e3>e5 && feat.NPos[e3-1]==feat.NPos[e3]-1) e3--;
736 /* skip consecutive Ns */
737 l3=feat.NPos[e3]-1;
738 percN=N_calc(l5,l3, &countN);
739 }
740 if (l3-l5>max_extent) {
741 max_extent=l3-l5;
742 max_5=l5;max_3=l3;
743 max_percN=percN;
744 max_countN=countN;
745 }
746 while (e3>e5 && feat.NPos[e5+1]==feat.NPos[e5]+1) e5++;
747 /* skip consecutive Ns */
748 l5=feat.NPos[e5]+1;
749 /* restart end3: */
750 l3=start3;e3=p3;
751 e5++;
752 } /* outer loop */
753 if (max_3!=0) {
754 feat.end5=max_5+1;
755 feat.end3=max_3+1;
756 feat.perc_N=max_percN;
757 feat.NCount=max_countN;
758 }
759 }