1 |
/* takes EST sequences from stdin only |
2 |
*/ |
3 |
|
4 |
#include <stdlib.h> |
5 |
#include <stdio.h> |
6 |
#include <string.h> |
7 |
#include "getopt.h" |
8 |
#include "trimpoly.h" |
9 |
#include "color_c.h" |
10 |
#include "ctype.h" |
11 |
|
12 |
#define USAGE "Usage: \n\ |
13 |
trimpoly [-h] [-n <percN>] [-z <min_read_len>] [-y <Nperc>] [-m <percdist>] \\ \n\ |
14 |
[-PN] [-e <end>] [-r <minrun>] [-s <minscore>] [-u <maxgap>] \\ \n\ |
15 |
[-l <percdist>|-L <range>] [-X][-D] \n\ |
16 |
\n\ |
17 |
Trims polyA/polyT and Ns from the end of the FASTA DNA sequences read\n\ |
18 |
from standard input.\n\ |
19 |
Outputs the following columns, tab-delimited:\n\ |
20 |
seq_name, perc_N, end5, end3, initial_length \n\ |
21 |
\n\ |
22 |
-n try the N-trimming if percent of N is above <percN>\n\ |
23 |
this option MUST be present if you want to enable N-trimming!\n\ |
24 |
Unless -A or -N switches are present, N trimming is done before \n\ |
25 |
polyA/T trimming\n\ |
26 |
-y try to extract from a sequence the maximum range having \n\ |
27 |
percent of N lower than <Nperc> (default: 3.00)\n\ |
28 |
-m <percdist> - incremental distance from ends, in percentage of\n\ |
29 |
sequence length, where N-trimming is done (default:12) \n\ |
30 |
The resulting distance is auto-limited to 30nt. max\n\ |
31 |
-N if -n flag was specified, do only N-trimming (skip the \n\ |
32 |
polyA/T trimming)\n\ |
33 |
-P do the polyA/T trimming first and then the N trimming;\n\ |
34 |
only makes sense if -n option is present.\n\ |
35 |
Poly A/T trimming:\n\ |
36 |
-e <end> restrict the poly trimming to one end only or to the \n\ |
37 |
maximum scoring poly match\n\ |
38 |
(default is: trim both ends)\n\ |
39 |
if <end> is 3 - trim only polyA from end3\n\ |
40 |
if <end> is 5 - trim only polyT from end5\n\ |
41 |
if <end> is x - only the maximum scoring of end3/end5 will be trimmed\n\ |
42 |
(if -X option is used, polyT/polyA will be swapped)\n\ |
43 |
-s <minscore> minimum score for poly A/T trimming (default 7)\n\ |
44 |
it is dynamically increased by 1 for every 10nt distance from the ends\n\ |
45 |
(scoring: +1 for each A/T, -2 for each intervening nucleotide) \n\ |
46 |
-r <minrun> minimum count of consecutive A/T (default 5)\n\ |
47 |
to start extending and computing score from\n\ |
48 |
-u <extstop> the count of non-matching nucleotides\n\ |
49 |
that will determine a stop in extending a match (default 4)\n\ |
50 |
-l incremental distance from ends, in percentage of length,\n\ |
51 |
where polyA/T trimming is allowed (default:15)\n\ |
52 |
-L fixed distance (in nucleotides) from both ends where\n\ |
53 |
polyA/T trimming is allowed (overrides -l)\n\ |
54 |
-o <file> outputs a multi FASTA file containing only the trimmed\n\ |
55 |
sequences \n\ |
56 |
-R relative shifting for end3, end5 will be displayed \n\ |
57 |
in the tabbed output instead of absolute coordinates\n\ |
58 |
-H parse starting end5, end3 coordinates from the FASTA file \n\ |
59 |
(expect to find them after sequence name, tab delimited)\n\ |
60 |
-X inverted trimming: trim polyA from end5 and polyT from end3\n\ |
61 |
-Y try inverted trimming if no poly(A/T) was trimmed in the first pass\n\ |
62 |
-D debug display mode: sequences are shown trimmed, colored\n\ |
63 |
using console codes (trimmed parts have a red background)\n" |
64 |
|
65 |
#define MIN_PERC_LEN 30 |
66 |
/* |
67 |
trimpoly -n2 -m5 |
68 |
seems to be a good way to trim ESTs with possible 'dirty' ends |
69 |
*/ |
70 |
|
71 |
typedef struct feature_t { |
72 |
char* seq_name; |
73 |
char* defline; |
74 |
char* seq; /* sequence, upper case, spaces, tabs, newlines are removed */ |
75 |
int seqlen; |
76 |
double perc_N; |
77 |
int end5; |
78 |
int end3; |
79 |
int* NPos; /* position of Ns in the sequence - dynamically alocated array */ |
80 |
int NCount; |
81 |
} feature_t; |
82 |
|
83 |
struct feature_t feat; /* current sequence data being analysed */ |
84 |
|
85 |
/* global options/flags: */ |
86 |
int min_read_len=32; |
87 |
int skip_poly=0; /* -N */ |
88 |
int poly_first=0; /* -P */ |
89 |
int perc_lenN=12; /* -m */ |
90 |
double perc_N=0; /* -n */ |
91 |
double nsave_limit=3.00; /* -y */ |
92 |
int trim_end=0; /* -e */ |
93 |
int min_score=7; /* -s */ |
94 |
int min_run=5; /* -r */ |
95 |
int max_gap=4; /* -u */ |
96 |
int perc_len=15; /* -l */ |
97 |
int swap_poly=0; /* -X, -Y */ |
98 |
int c_show=0; /* -D */ |
99 |
char outfile[256]; /* -o */ |
100 |
int do_write=0; /* -o */ |
101 |
int relative_shift=0; /* -R */ |
102 |
int init_ends=0; /* -H */ |
103 |
int fixed_len=0; |
104 |
char* seed_5=SEED_T; |
105 |
char* seed_3=SEED_A; |
106 |
char chr_5='T'; |
107 |
char chr_3='A'; |
108 |
int first_pass=1; |
109 |
|
110 |
void optErr(char opt); |
111 |
void showUsage(); |
112 |
char *seq_copy(char *dest, char *src, int *seqsize); |
113 |
char* lstrstr(char* lstart, char *rend, char* substr); |
114 |
char* rstrstr(char* rstart, char *lend, char* substr); |
115 |
|
116 |
/* these funcs are acting directly on feat structure */ |
117 |
int analyse3(); |
118 |
int analyse5(); |
119 |
void N_trim(); |
120 |
void N_analyse(int l5, int l3, int p5, int p3); |
121 |
void N_save(int l5, int l3); |
122 |
double N_calc(int e5, int e3, int* percN); |
123 |
|
124 |
int main(int argc, char * const argv[]) { |
125 |
int c,wpos, init_end5=0, init_end3=0, score1=0, score2=0, save5, save3; |
126 |
int alloc_size; |
127 |
int seq_size=0, readlen; |
128 |
char *p, *p_end, *bufpos; |
129 |
char seq_name[256]; |
130 |
char def_line[8192]; |
131 |
char line[121]; |
132 |
FILE * fout=NULL; |
133 |
char inbuf[BUF_LEN]; /* incoming buffer for sequence lines. */ |
134 |
char * seq; /* working buffer */ |
135 |
alloc_size=BUF_LEN; |
136 |
seq=malloc(BUF_LEN); /* initial allocation for sequence buffer */ |
137 |
while ((c=getopt(argc, argv, "hNPXYDRHz:m:n:e:r:s:l:L:o:u:y:")) > 0) { |
138 |
switch (c) |
139 |
{ case 'N': skip_poly=1;break; |
140 |
case 'P': poly_first=1;break; |
141 |
case 'm': perc_lenN=strtol(optarg, (char **)NULL, 10); |
142 |
if (perc_lenN==0) optErr('m'); |
143 |
break; |
144 |
case 'z': min_read_len=strtol(optarg, (char **)NULL, 10); |
145 |
if (min_read_len<5) optErr('z'); |
146 |
break; |
147 |
case 'n': perc_N=strtod(optarg, (char **)NULL); |
148 |
if (perc_N==0) optErr('n'); |
149 |
break; |
150 |
case 'y': nsave_limit=strtod(optarg, (char **)NULL); |
151 |
if (nsave_limit==0) optErr('y'); |
152 |
break; |
153 |
case 'e': if (optarg[0]=='x') { |
154 |
trim_end=1; |
155 |
break; |
156 |
} |
157 |
trim_end=strtol(optarg, (char **)NULL, 10); |
158 |
if ((trim_end!=3 && trim_end!=5) || trim_end==0) |
159 |
optErr('e'); |
160 |
break; |
161 |
case 'r': min_run=strtol(optarg, (char **)NULL, 10); |
162 |
if (min_run<=1) optErr('r'); |
163 |
break; |
164 |
case 'u': max_gap=strtol(optarg, (char **)NULL, 10); |
165 |
if (max_gap<=1) optErr('u'); |
166 |
break; |
167 |
case 's': min_score=strtol(optarg, (char **)NULL, 10); |
168 |
if (min_score==0) optErr('s'); |
169 |
break; |
170 |
case 'l': perc_len=strtol(optarg, (char **)NULL, 10); |
171 |
if (perc_len==0) optErr('l'); |
172 |
break; |
173 |
case 'L': fixed_len=strtol(optarg, (char **)NULL, 10); |
174 |
if (fixed_len==0) optErr('L'); |
175 |
break; |
176 |
case 'o': strcpy(outfile, optarg); |
177 |
if (strlen(outfile)==0) |
178 |
optErr('o'); |
179 |
else do_write=1; |
180 |
break; |
181 |
case 'X': swap_poly=1;break; |
182 |
case 'Y': swap_poly=2;break; |
183 |
case 'H': init_ends=1;break; |
184 |
case 'D': c_show=1;break; |
185 |
case 'h': showUsage();exit(1); |
186 |
case 'R': relative_shift=1; |
187 |
} |
188 |
} /* while options */ |
189 |
|
190 |
if (swap_poly==1) { |
191 |
seed_5=SEED_A; |
192 |
seed_3=SEED_T; |
193 |
chr_5='A'; |
194 |
chr_3='T'; |
195 |
} |
196 |
bufpos=seq; |
197 |
if (do_write) { |
198 |
if ((fout=fopen(outfile,"w"))==NULL) optErr('o'); |
199 |
} |
200 |
for (;;) { |
201 |
p_end=fgets(inbuf, BUF_LEN-1,stdin); |
202 |
/* basic sanity check: */ |
203 |
/*if (!p_end) { //wtf was I thinking? what if it's eof? |
204 |
p=strchr(inbuf, '\n'); |
205 |
if (!p || *(p+1)!='\0') { |
206 |
fprintf(stderr, "fgets error! Line too long (%s)?!\n",inbuf); |
207 |
exit(1); |
208 |
} |
209 |
}*/ |
210 |
/* is this a sequence name? */ |
211 |
p=strchr(inbuf, '>'); |
212 |
if (p || !p_end) { /* starts new sequence, accumulated old one */ |
213 |
if (bufpos!=seq) { /* close and analyse existing previous sequence */ |
214 |
/* printf("Final: seqsize=%d\n",seq_size); */ |
215 |
feat.seq=seq; |
216 |
feat.seqlen=seq_size; |
217 |
feat.seq_name=seq_name; |
218 |
if (init_ends) {/* parsed initial coordinates */ |
219 |
feat.end5=init_end5; |
220 |
feat.end3=init_end3; |
221 |
} |
222 |
else { |
223 |
feat.end5=1; |
224 |
feat.end3=feat.seqlen; |
225 |
} |
226 |
if (skip_poly) |
227 |
N_trim(); /* only test for N_trim */ |
228 |
else { |
229 |
if (poly_first==0) N_trim(); |
230 |
save5=feat.end5; |
231 |
save3=feat.end3; |
232 |
if (trim_end!=5) score1=analyse3(); |
233 |
if (trim_end!=3) score2=analyse5(); |
234 |
if (swap_poly==2 && save5==feat.end5 && |
235 |
save3==feat.end3) { |
236 |
/* printf("here: Score1=%d, Score2=%d\n", score1, score2); */ |
237 |
seed_5=SEED_A; |
238 |
seed_3=SEED_T; |
239 |
chr_5='A'; |
240 |
chr_3='T'; |
241 |
if (trim_end!=5) score1=analyse3(); |
242 |
if (trim_end!=3) score2=analyse5(); |
243 |
seed_5=SEED_T; |
244 |
seed_3=SEED_A; |
245 |
chr_5='T'; |
246 |
chr_3='A'; |
247 |
} |
248 |
/* printf("Score1=%d, Score2=%d\n", score1, score2); */ |
249 |
if (trim_end==1) { |
250 |
/* printf("score5=%d, score3=%d\n", score2,score1); */ |
251 |
if (score1<score2) feat.end3=save3; |
252 |
else feat.end5=save5; |
253 |
} |
254 |
if (poly_first!=0) N_trim(); |
255 |
} |
256 |
if (relative_shift) { |
257 |
if (init_ends) |
258 |
printf("%s\t\%4.2f\t%d\t%d\t%d\n", |
259 |
seq_name, feat.perc_N, feat.end5-init_end5,init_end3-feat.end3, |
260 |
feat.seqlen); |
261 |
else |
262 |
printf("%s\t\%4.2f\t%d\t%d\t%d\n", |
263 |
seq_name, feat.perc_N, feat.end5-1,feat.seqlen-feat.end3, |
264 |
feat.seqlen); |
265 |
} |
266 |
else |
267 |
printf("%s\t\%4.2f\t%d\t%d\t%d\n", |
268 |
seq_name, feat.perc_N, feat.end5,feat.end3, |
269 |
feat.seqlen); |
270 |
if (c_show) |
271 |
c_print_trim(seq, feat.end5, feat.end3); |
272 |
|
273 |
if (do_write && feat.perc_N<3.00 && feat.end3-feat.end5>=min_read_len-1) { |
274 |
/* write only those having length>=100 and perc_N<3*/ |
275 |
if (init_ends) fprintf(fout,">%s\n",seq_name); |
276 |
else fprintf(fout,"%s",def_line); |
277 |
wpos=feat.end5-1; |
278 |
while (feat.end3-wpos>=60) { |
279 |
strncpy(line,seq+wpos,60);line[60]='\0'; |
280 |
fprintf(fout,"%s\n",line); |
281 |
wpos+=60; |
282 |
} |
283 |
if (feat.end3-wpos>0) { |
284 |
strncpy(line,seq+wpos,feat.end3-wpos); |
285 |
line[feat.end3-wpos]='\0'; |
286 |
fprintf(fout,"%s\n",line); |
287 |
} |
288 |
} |
289 |
} |
290 |
if (!p_end) break; /* exit on end-of-file */ |
291 |
/* parse sequence name and, eventually, init coordinates */ |
292 |
strcpy(def_line, p); /* keep defline there for -o output */ |
293 |
for (p_end=p+1;(*p_end)!='\0' && !isspace(*p_end); p_end++); |
294 |
strncpy(seq_name, p+1,p_end-p-1); |
295 |
seq_name[p_end-p-1]='\0'; |
296 |
if (init_ends) { /* tab delimited fields in def_line*/ |
297 |
/* after p_end (\t) there are init values */ |
298 |
p_end=strchr(p,'\t'); |
299 |
if (p_end==NULL) optErr('H'); |
300 |
p_end++; |
301 |
p=strchr(p_end,'\t'); |
302 |
if (p==NULL) optErr('H'); |
303 |
*p='\0'; |
304 |
init_end5=strtol(p_end, (char **)NULL, 10); |
305 |
if (init_end5==0) optErr('H'); |
306 |
p++; |
307 |
p_end=strchr(p,'\n'); |
308 |
if (!p_end) p_end=strchr(p,'\t'); |
309 |
if (p_end==NULL) optErr('H'); |
310 |
*p_end='\0'; |
311 |
init_end3=strtol(p, (char **)NULL, 10); |
312 |
if (init_end3==0 || init_end3<init_end5) optErr('H'); |
313 |
} |
314 |
seq_size=0; |
315 |
bufpos=seq; /* no need to reallocate the buffer for each sequence.. |
316 |
let it stay at maximum all along */ |
317 |
} |
318 |
else { /* extending current sequence : */ |
319 |
readlen=strlen(inbuf); |
320 |
if (seq_size+readlen>alloc_size) { /* must realloc, it's gonna blow */ |
321 |
alloc_size+=BUF_LEN; |
322 |
seq=realloc(seq, alloc_size); |
323 |
/* printf("Realloc! seqsize=%d, readlen=%d\n",seq_size, readlen); */ |
324 |
bufpos=seq+seq_size; |
325 |
} |
326 |
bufpos=seq_copy(bufpos,inbuf, &seq_size); |
327 |
/* append, removing newline chars and incrementing seq_size accordingly */ |
328 |
} |
329 |
}/* for input */ |
330 |
free(seq); |
331 |
if (do_write) fclose(fout); |
332 |
return 0; |
333 |
} |
334 |
|
335 |
char* seq_copy(char *dest, char *src, int *seq_size) { |
336 |
/* assume destination already has room for all the chars in src! |
337 |
skip any blanks or newlines on copying |
338 |
RETURN: the position immediately after the last char copied! |
339 |
*/ |
340 |
int i,j; |
341 |
j=0; |
342 |
for (i=0;(src[i]!='\0');i++) { |
343 |
if (src[i]==' ' || src[i]=='\n'|| src[i]=='\t') continue; |
344 |
/* dest[j]=toupper(src[i]);j++; */ |
345 |
dest[j]=src[i];j++; |
346 |
} |
347 |
dest[j]='\0'; |
348 |
(*seq_size) += j; |
349 |
return &dest[j]; |
350 |
} |
351 |
|
352 |
|
353 |
|
354 |
/*********************************************************************/ |
355 |
/* Helper functions */ |
356 |
/*********************************************************************/ |
357 |
|
358 |
|
359 |
|
360 |
void optErr(char opt) { |
361 |
showUsage(); |
362 |
fprintf(stderr, "\nIncorrect value specified for -%c option!\n", opt); |
363 |
exit(1); |
364 |
} |
365 |
|
366 |
void showUsage() { |
367 |
fprintf(stderr, USAGE); |
368 |
} |
369 |
|
370 |
|
371 |
char* lstrstr(char* lstart, char *rend, char* substr) { /*like strstr, but starts searching |
372 |
from lstart, going up to rend and returns a pointer to the first (left) |
373 |
matching character in str */ |
374 |
char *p; |
375 |
int l,i; |
376 |
l=strlen(substr); |
377 |
p=lstart; |
378 |
while (p+l<rend) { |
379 |
for (i=0;i<l;i++) if (toupper(*(p+i)) != toupper(*(substr+i))) break; |
380 |
if (i==l) return p; |
381 |
p++; |
382 |
} |
383 |
return NULL; |
384 |
} |
385 |
|
386 |
|
387 |
char* rstrstr(char* rstart, char *lend, char* substr) { /*like strstr, but starts searching |
388 |
from right end, going up to lend and returns a pointer to the last (right) |
389 |
matching character in str */ |
390 |
char *p; |
391 |
int l,i; |
392 |
l=strlen(substr); |
393 |
p=rstart-l+1; |
394 |
while (p>=lend) { |
395 |
for (i=0;i<l;i++) if (toupper(*(p+i)) != toupper(*(substr+i))) break; |
396 |
if (i==l) return p+l-1; |
397 |
p--; |
398 |
} |
399 |
return NULL; |
400 |
} |
401 |
|
402 |
/*=================================================*/ |
403 |
|
404 |
int analyse5() { /*== end5 trim (polyT if no -X) ======*/ |
405 |
int gaplen, locmaxpos, |
406 |
locmaxscore, locminscore, run, locmaxrun, v; |
407 |
char *p; |
408 |
int score; |
409 |
int first_pass=0; |
410 |
int l5=0; /* offset from current end5 */ |
411 |
int norange=1; |
412 |
int maxhit_score=0; |
413 |
int maxhit_pos=0; |
414 |
int startpos=0; |
415 |
int pos5=l5; /* distance to end5 */ |
416 |
char *seqstart=feat.seq+feat.end5-1; |
417 |
char *seqend=feat.seq+feat.end3-1; |
418 |
while (pos5<feat.end3-feat.end5 && |
419 |
(p=lstrstr(seqstart+pos5, seqend, seed_5))!=NULL && |
420 |
(p-seqstart)<(feat.end3-feat.end5)/2 /* while hit is in 5' half! */ |
421 |
) { /* big loop, going through all possible hits */ |
422 |
startpos=p-seqstart; /*offset from end5 */ |
423 |
locmaxscore=0; |
424 |
locmaxpos=l5; |
425 |
run=0; |
426 |
locmaxrun=0; |
427 |
p+=SEED_LEN; |
428 |
/* |
429 |
printf("Seed found: pos5=%d, (p-feat.seq)=%d, *p=%c%c%c%c%c\n", |
430 |
pos5, p-feat.seq, *p,*(p+1),*(p+2),*(p+3),*(p+4)); |
431 |
*/ |
432 |
score=SEED_LEN; |
433 |
run=SEED_LEN; |
434 |
gaplen=0; |
435 |
while (gaplen<max_gap && score>0 && p+gaplen<=seqend) { |
436 |
if (toupper(p[gaplen])==chr_5) { |
437 |
if (gaplen) { /* new poly run after a gap */ |
438 |
p+=gaplen; |
439 |
score-=gaplen*2; /* penalty */ |
440 |
gaplen=0; |
441 |
} |
442 |
p++; |
443 |
run++; |
444 |
score++; |
445 |
} |
446 |
else { /* non-match */ |
447 |
if (gaplen==0) { /* new gap starting or end of sequence*/ |
448 |
if (run>locmaxrun) locmaxrun=run; |
449 |
run=0; |
450 |
if (score>=locmaxscore) { /* check locmax! */ |
451 |
locmaxscore=score; |
452 |
locmaxpos=(p-seqstart); /* offset from end5 */ |
453 |
} |
454 |
} |
455 |
gaplen++; |
456 |
} |
457 |
/* |
458 |
printf("T loop: l5=%d, gaplen=%d, score=%d, locmaxpos=%d, locmaxscore=%d locmaxrun=%d\n (p-feat.seq)=%d, *p=%c%c%c%c%c\n", |
459 |
l5, gaplen, score, locmaxpos, locmaxscore, locmaxrun, p-feat.seq, *p,*(p+1),*(p+2),*(p+3),*(p+4)); |
460 |
*/ |
461 |
}/* while */ |
462 |
if (run>locmaxrun) locmaxrun=run; |
463 |
if (score>=locmaxscore) { /* check locmax! */ |
464 |
locmaxscore=score; |
465 |
locmaxpos=(p-seqstart); /* offset from end5 */ |
466 |
} |
467 |
pos5=locmaxpos; |
468 |
locminscore=min_score+startpos/10; /* min_score is dynamically adjusted |
469 |
based on the hit distance to the ends */ |
470 |
/* |
471 |
printf("feat.end3=%d, feat.end5=%d, l5=%d, startpos=%d, locmaxpos=%d, locmaxscore=%d, locmaxrun=%d\n", |
472 |
feat.end3, feat.end5, l5, startpos, locmaxpos, locmaxscore, locmaxrun); |
473 |
|
474 |
*/ |
475 |
if (locmaxscore>=locminscore && locmaxrun>=min_run) { /* potential hit */ |
476 |
norange=1; |
477 |
if (fixed_len) { |
478 |
if (startpos<=fixed_len) { |
479 |
l5=locmaxpos; |
480 |
maxhit_score=locmaxscore; |
481 |
maxhit_pos=startpos; |
482 |
norange=0; |
483 |
} |
484 |
else break; /* strict fixed range criteria */ |
485 |
} |
486 |
else { /* percentual distance test: assess distance to previous hit */ |
487 |
v=(feat.end3-feat.end5-l5)*perc_len/100; |
488 |
if ((first_pass && v<MIN_PERC_LEN)) { |
489 |
v=MIN_PERC_LEN; |
490 |
first_pass=0; |
491 |
} |
492 |
if (startpos-l5<v) { /* hit in allowed range */ |
493 |
l5=locmaxpos; |
494 |
maxhit_score=locmaxscore; |
495 |
maxhit_pos=startpos; |
496 |
norange=0; |
497 |
} |
498 |
} |
499 |
/* test for any huge hits in the first half */ |
500 |
if (norange && l5!=locmaxpos && |
501 |
(locmaxscore>30 && startpos+(locmaxpos-startpos)/2 < (feat.end3-feat.end5)/2 |
502 |
|| (feat.end3-feat.end5)-(locmaxpos-startpos)<min_read_len) ) |
503 |
{ |
504 |
l5=locmaxpos; |
505 |
maxhit_score=locmaxscore; |
506 |
maxhit_pos=startpos; |
507 |
} |
508 |
} |
509 |
} /* big while loop */ |
510 |
feat.end5+=l5; |
511 |
return maxhit_score*10-maxhit_pos; |
512 |
} |
513 |
|
514 |
|
515 |
|
516 |
int analyse3() { /*=== end3 search: polyA (if no -X) =============== */ |
517 |
int gaplen, score, locmaxpos, |
518 |
locmaxscore, locminscore, run, locmaxrun, v; |
519 |
char *p; |
520 |
int first_pass=0; |
521 |
int norange; |
522 |
char *seqstart=feat.seq+feat.end5-1; |
523 |
char *seqend=feat.seq+feat.end3-1; |
524 |
int l3=0; /*offset from end3 */ |
525 |
int pos3=l3; /*distance to end3 */ |
526 |
int startpos=0; |
527 |
int maxhit_score=0; |
528 |
int maxhit_pos=0; |
529 |
while (pos3<feat.end3-feat.end5 && |
530 |
(p=rstrstr(seqend-pos3, seqstart, seed_3))!=NULL && |
531 |
(seqend-p)<(feat.end3-feat.end5)/2 /* while hit is in 3' half! */ |
532 |
) { /* big loop, going through all possible hits */ |
533 |
/* printf("feat.end3=%d, pos3=%d, startpos=%d\n",feat.end3,pos3, seqend-p); */ |
534 |
startpos=seqend-p; /*distance to seqend */ |
535 |
locmaxscore=0; |
536 |
locmaxpos=l3; |
537 |
run=0; |
538 |
locmaxrun=0; |
539 |
/* |
540 |
printf("Seed found *p=%c%c%c%c%c%c%c\n", |
541 |
*p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5),*(p+6)); |
542 |
*/ |
543 |
p-=SEED_LEN; |
544 |
/* |
545 |
printf("After seed *p=%c%c%c%c%c%c%c\n", |
546 |
*p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5),*(p+6)); |
547 |
*/ |
548 |
/* |
549 |
printf("Seed found: l5=%d, (p-feat.seq)=%d, *p=%c%c%c%c%c\n", |
550 |
l5, p-feat.seq, *p,*(p+1),*(p+2),*(p+3),*(p+4) ); |
551 |
*/ |
552 |
score=SEED_LEN; |
553 |
run=SEED_LEN; |
554 |
gaplen=0; |
555 |
while (gaplen<max_gap && score>0 && p-gaplen>=seqstart) { |
556 |
/*printf("A loop: l3=%d, gaplen=%d, score=%d, locmaxpos=%d, locmaxscore=%d, locmaxrun=%d\n \ |
557 |
(p-feat.seq)=%d, *p=%c%c%c%c%c%c%c\n", |
558 |
l3, gaplen, score, locmaxpos, locmaxscore, locmaxrun, |
559 |
p-feat.seq, *(p-gaplen),*(p-gaplen+1),*(p-gaplen+2),*(p-gaplen+3),*(p-gaplen+4),*(p-gaplen+5), |
560 |
*(p-gaplen+6)); |
561 |
*/ |
562 |
if (toupper(*(p-gaplen))==chr_3) { |
563 |
if (gaplen) { /* new poly run after a gap */ |
564 |
p-=gaplen; |
565 |
score-=(gaplen<<1); /* penalty */ |
566 |
gaplen=0; |
567 |
} |
568 |
p--; |
569 |
run++; |
570 |
score++; |
571 |
/* |
572 |
printf("match: score=%d, *p=%c%c%c%c%c%c%c\n",score, |
573 |
*p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5),*(p+6)); |
574 |
*/ |
575 |
} |
576 |
else { /* non-match*/ |
577 |
if (gaplen==0) { /* new gap */ |
578 |
if (run>locmaxrun) locmaxrun=run; |
579 |
run=0; |
580 |
if (score>=locmaxscore) { /* check locmax! */ |
581 |
locmaxscore=score; |
582 |
locmaxpos=(seqend-p); /* distance to end3 */ |
583 |
} |
584 |
} |
585 |
gaplen++; /*extend gap */ |
586 |
} |
587 |
|
588 |
|
589 |
}/* while */ |
590 |
if (run>locmaxrun) locmaxrun=run; |
591 |
if (score>=locmaxscore) { /* check locmax! */ |
592 |
locmaxscore=score; |
593 |
locmaxpos=(seqend-p); /* distance to end3 */ |
594 |
} |
595 |
pos3=locmaxpos; |
596 |
locminscore=min_score+startpos/10; /* min_score is dynamically adjusted |
597 |
based on the hit distance to the ends */ |
598 |
if (locmaxscore>=locminscore && locmaxrun>=min_run) { /* potential hit */ |
599 |
norange=1; |
600 |
if (fixed_len) { |
601 |
if (startpos<=fixed_len) { |
602 |
l3=locmaxpos; |
603 |
maxhit_score=locmaxscore; |
604 |
maxhit_pos=startpos; |
605 |
norange=0; |
606 |
} |
607 |
else break; /* strict fixed range criteria */ |
608 |
} |
609 |
else { |
610 |
v=((feat.end3-l3-feat.end5)*perc_len/100); |
611 |
if (first_pass && v<MIN_PERC_LEN) { |
612 |
v=MIN_PERC_LEN; |
613 |
first_pass=0; |
614 |
} |
615 |
if (startpos-l3<v) { |
616 |
l3=locmaxpos; |
617 |
maxhit_score=locmaxscore; |
618 |
maxhit_pos=startpos; |
619 |
norange=0; |
620 |
} |
621 |
} |
622 |
/* instead, just aim for big hits in the 3' half */ |
623 |
if (norange && l3!=locmaxpos && |
624 |
( (locmaxscore>30 && startpos+(locmaxpos-startpos)/2 < (feat.end3-feat.end5)/2) |
625 |
|| (feat.end3-feat.end5)-(locmaxpos-startpos)<min_read_len) ) { |
626 |
l3=locmaxpos; |
627 |
maxhit_score=locmaxscore; |
628 |
maxhit_pos=startpos; |
629 |
} |
630 |
} /* potential hit analysis */ |
631 |
} /* big while loop */ |
632 |
feat.end3-=l3; |
633 |
return maxhit_score*10-maxhit_pos; |
634 |
} |
635 |
|
636 |
|
637 |
double N_calc(int e5, int e3, int* Ncount) { |
638 |
/* computes the percent and count of Ns; updates feat structure */ |
639 |
int i,ch; |
640 |
double result; |
641 |
*Ncount=0; |
642 |
for (i=e5;i<=e3;i++) { |
643 |
ch=toupper(feat.seq[i]); |
644 |
if (ch!='A' && ch!='C' && ch!='G' && ch!='T') |
645 |
(*Ncount)++; |
646 |
} |
647 |
result = ((double)(100*(*Ncount)))/(e3-e5+1); |
648 |
return result; |
649 |
} |
650 |
|
651 |
void N_trim() { |
652 |
/* computes initial perc_N and fill N_Pos |
653 |
call N_analyse as apropriate (according to switches) |
654 |
*/ |
655 |
int i,j,ch; |
656 |
feat.perc_N=N_calc(feat.end5-1, feat.end3-1, &feat.NCount); |
657 |
/* printf("entering N trim: %f compare to %f\n",feat.perc_N, perc_N); */ |
658 |
if (perc_N!=0 && feat.perc_N>perc_N) { |
659 |
j=0; |
660 |
feat.NPos=malloc(feat.NCount*sizeof(int)); |
661 |
for (i=feat.end5-1;i<feat.end3;i++) { |
662 |
ch=toupper(feat.seq[i]); |
663 |
if (ch!='A' && ch!='C' |
664 |
&& ch!='G' && ch!='T') { |
665 |
feat.NPos[j]=i; |
666 |
j++; |
667 |
} |
668 |
} |
669 |
N_analyse(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
670 |
/* compute perc_N again after trimming! */ |
671 |
feat.perc_N=N_calc(feat.end5-1, feat.end3-1, &feat.NCount); |
672 |
if (feat.perc_N >= nsave_limit && feat.end3-feat.end5>=99) { |
673 |
/* try to save this at any cost: */ |
674 |
N_save(feat.end5-1, feat.end3-1); |
675 |
/* this will also update feat.perc_N and feat.NCount */ |
676 |
} |
677 |
free(feat.NPos); |
678 |
} |
679 |
} |
680 |
|
681 |
void N_analyse(int l5, int l3, int p5, int p3) { |
682 |
/* assumes feat was filled properly */ |
683 |
int old_dif, t5,t3,v; |
684 |
if (l3-l5<min_read_len || p5>p3 ) { |
685 |
feat.end5=l5+1; |
686 |
feat.end3=l3+1; |
687 |
return; |
688 |
} |
689 |
|
690 |
t5=feat.NPos[p5]-l5; |
691 |
t3=l3-feat.NPos[p3]; |
692 |
old_dif=p3-p5; |
693 |
v=(int)((((double)(l3-l5))*perc_lenN)/100); |
694 |
if (v>30) v=30; /* enforce limit for long ESTs */ |
695 |
if (t5 < v ) { |
696 |
l5=feat.NPos[p5]+1; |
697 |
p5++; |
698 |
} |
699 |
if (t3 < v) { |
700 |
l3=feat.NPos[p3]-1; |
701 |
p3--; |
702 |
} |
703 |
/* restNs=p3-p5; number of Ns in the new CLR */ |
704 |
if (p3-p5==old_dif) { /* no change, return */ |
705 |
/* don't trim if this may shorten < 100 */ |
706 |
if (l5-l3<min_read_len) return; |
707 |
feat.end5=l5+1; |
708 |
feat.end3=l3+1; |
709 |
} |
710 |
else |
711 |
N_analyse(l5,l3, p5,p3); |
712 |
} |
713 |
|
714 |
void N_save(int l5, int l3) { |
715 |
/* find the maximum range having percN<3 */ |
716 |
/* needs feat.NPos[], feat.NCount to be set before*/ |
717 |
int countN,i,p5,p3, max_extent, start3, max_5, max_3, e3, e5, max_countN=0; |
718 |
double percN=0,max_percN=0; |
719 |
for (i=0;feat.NPos[i]<=feat.end5-1;i++); |
720 |
p5=i-1; /* found first N's position in the selected range */ |
721 |
for (i=feat.NCount-1;feat.NPos[i]>=feat.end3-1;i--); |
722 |
p3=i+1; /* found last N's position in the selected range */ |
723 |
if (l3-l5<min_read_len || p5>p3 ) return; |
724 |
e5=p5;e3=p3; |
725 |
start3=l3; |
726 |
max_extent=99;max_3=0;max_5=0; |
727 |
while (l3-l5>max_extent /* && e3>=e5 */) { /* outer end5 loop */ |
728 |
e3=p3; |
729 |
percN=N_calc(l5,l3, &countN); |
730 |
while (percN>=nsave_limit && l3-l5>max_extent) { /* e3 loop -- decreases until percN<3 */ |
731 |
/* printf("l5=%d, l3=%d; e5=%d (%d), e3=%d(%d), percN=%f, %d ? %d\n",l5,l3,e5, |
732 |
feat.NPos[e5], e3, feat.NPos[e3], percN, l3-l5, max_extent); |
733 |
*/ |
734 |
e3--; |
735 |
while (e3>e5 && feat.NPos[e3-1]==feat.NPos[e3]-1) e3--; |
736 |
/* skip consecutive Ns */ |
737 |
l3=feat.NPos[e3]-1; |
738 |
percN=N_calc(l5,l3, &countN); |
739 |
} |
740 |
if (l3-l5>max_extent) { |
741 |
max_extent=l3-l5; |
742 |
max_5=l5;max_3=l3; |
743 |
max_percN=percN; |
744 |
max_countN=countN; |
745 |
} |
746 |
while (e3>e5 && feat.NPos[e5+1]==feat.NPos[e5]+1) e5++; |
747 |
/* skip consecutive Ns */ |
748 |
l5=feat.NPos[e5]+1; |
749 |
/* restart end3: */ |
750 |
l3=start3;e3=p3; |
751 |
e5++; |
752 |
} /* outer loop */ |
753 |
if (max_3!=0) { |
754 |
feat.end5=max_5+1; |
755 |
feat.end3=max_3+1; |
756 |
feat.perc_N=max_percN; |
757 |
feat.NCount=max_countN; |
758 |
} |
759 |
} |