ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/seqalign.works.cpp
Revision: 74
Committed: Tue Sep 27 01:22:37 2011 UTC (12 years, 11 months ago) by gpertea
File size: 22640 byte(s)
Log Message:
added seqalign files

Line File contents
1 #include "GBase.h"
2 #include "GArgs.h"
3 #include "GStr.h"
4 #include "GList.hh"
5 //#include "GXDropAlign.h"
6
7 #define USAGE "Usage:\n\
8 seqalign -i <2line_file> | -a <seq1> -b <seq2>\n"
9
10 bool debug=true;
11
12 struct SEditScript{
13 uint32 *op; // array of edit operations
14 uint32 size, num; // size of allocation, number in use
15 uint32 last; // most recent operation added
16 };
17 /*
18 SEditScript *edit_script_free(SEditScript *es);
19 SEditScript *edit_script_new(void);
20 SEditScript *edit_script_append(SEditScript *es, SEditScript *et);
21 */
22 enum {
23 EDIT_OP_MASK = 0x3,
24 EDIT_OP_ERR = 0x0,
25 EDIT_OP_INS = 0x1,
26 EDIT_OP_DEL = 0x2,
27 EDIT_OP_REP = 0x3
28 };
29
30 //enum { /* half of the (fixed) match score */
31 // ERROR_FRACTION=2, /* 1/this */
32 // MAX_SPACE=1000000,
33 // sC = 0, sI = 1, sD = 2, LARGE=100000000
34 //};
35
36 #define ERROR_FRACTION 2 /* 1/this */
37 #define MAX_SPACE 1000000
38 #define LARGE 100000000
39 int sC = 0, sI = 1, sD = 2;
40
41 uint32 edit_val_get(uint32 op) {
42 return op >> 2;
43 }
44 uint32 edit_opc_get(uint32 op) {
45 return op & EDIT_OP_MASK;
46 }
47
48 uint32 edit_op_cons(uint32 op, uint32 val) {
49 return (val << 2) | (op & EDIT_OP_MASK);
50 }
51
52 uint32 edit_op_inc(uint32 op, uint32 n) {
53 return edit_op_cons(edit_opc_get(op), edit_val_get(op) + n);
54 }
55
56 uint32 edit_op_inc_last(SEditScript *es, uint32 n) {
57 uint32 *last;
58 //ASSERT (es->num > 0);
59 last = &(es->op[es->num-1]);
60 *last = edit_op_inc(*last, n);
61 return *last;
62 }
63
64 int edit_script_ready(SEditScript *es, uint32 n) {
65 //uint32 *p;
66 uint32 m = n + n/2;
67 if (es->size <= n) {
68 GREALLOC(es->op, m*sizeof(uint32));
69 //es->op = p;
70 es->size = m;
71 }
72 return 1;
73 }
74
75 int edit_script_readyplus(SEditScript *es, uint32 n) {
76 if (es->size - es->num <= n)
77 return edit_script_ready(es, n + es->num);
78 return 1;
79 }
80
81 int edit_script_put(SEditScript *es, uint32 op, uint32 n) {
82 if (!edit_script_readyplus(es, 2))
83 return 0;
84 es->last = op;
85 //ASSERT(op != 0);
86 es->op[es->num] = edit_op_cons(op, n);
87 es->num += 1;
88 es->op[es->num] = 0; /* sentinal */
89
90 return 1;
91 }
92
93 SEditScript *edit_script_init(SEditScript *es) {
94 es->op = 0;
95 es->size = es->num = 0;
96 es->last = 0;
97 edit_script_ready(es, 8);
98 return es;
99 }
100
101 uint32 *edit_script_first(SEditScript *es) {
102 return es->num > 0 ? &es->op[0] : 0;
103 }
104
105 uint32 *edit_script_next(SEditScript *es, uint32 *op) {
106 // assumes flat address space !
107 if (&es->op[0] <= op && op < &es->op[es->num-1])
108 return op+1;
109 else
110 return 0;
111 }
112
113 static int edit_script_more(SEditScript *data, uint32 op, uint32 k) {
114 if (op == EDIT_OP_ERR) {
115 GError("edit_script_more: bad opcode %d:%d", op, k);
116 return -1;
117 }
118
119 if (edit_opc_get(data->last) == op)
120 edit_op_inc_last(data, k);
121 else
122 edit_script_put(data, op, k);
123
124 return 0;
125 }
126
127 /* External */
128 SEditScript *edit_script_append(SEditScript *es, SEditScript *et) {
129 uint32 *op;
130 for (op = edit_script_first(et); op; op = edit_script_next(et, op))
131 edit_script_more(es, edit_opc_get(*op), edit_val_get(*op));
132 return es;
133 }
134
135 SEditScript *edit_script_new(void) {
136 SEditScript *es = NULL;
137 GCALLOC(es, sizeof(*es));
138 return edit_script_init(es);
139 }
140
141 SEditScript *edit_script_free(SEditScript *es) {
142 if (es) {
143 if (es->op) {
144 //MemFree(es->op);
145 GFREE(es->op);
146 }
147 //MemSet(es, 0, sizeof(*es));
148 //MemFree(es);
149 GFREE(es);
150 }
151 return 0;
152 }
153
154 int edit_script_del(SEditScript *data, uint32 k) {
155 return edit_script_more(data, EDIT_OP_DEL, k);
156 }
157
158 int edit_script_ins(SEditScript *data, uint32 k) {
159 return edit_script_more(data, EDIT_OP_INS, k);
160 }
161 int edit_script_rep(SEditScript *data, uint32 k) {
162 return edit_script_more(data, EDIT_OP_REP, k);
163 }
164
165 SEditScript *edit_script_reverse_inplace(SEditScript *es) {
166 uint32 i;
167 const uint32 num = es->num;
168 const uint32 mid = num/2;
169 const uint32 end = num-1;
170
171 for (i = 0; i < mid; ++i) {
172 const uint32 t = es->op[i];
173 es->op[i] = es->op[end-i];
174 es->op[end-i] = t;
175 }
176 return es;
177 }
178
179 /* ----- pool allocator ----- */
180 struct ThreeVal {
181 int I, C, D;
182 };
183
184 struct MBSpace {
185 ThreeVal* space_array;
186 int used, size;
187 MBSpace *next;
188 };
189
190 MBSpace* new_mb_space() {
191 MBSpace* p;
192 int amount;
193
194 GMALLOC(p, sizeof(MBSpace));
195 amount = MAX_SPACE;
196 GCALLOC(p->space_array, sizeof(ThreeVal)*amount);
197 if (p->space_array == NULL) {
198 GFREE(p);
199 return NULL;
200 }
201 p->used = 0;
202 p->size = amount;
203 p->next = NULL;
204
205 return p;
206 }
207
208 void refresh_mb_space(MBSpace* sp)
209 {
210 while (sp) {
211 sp->used = 0;
212 sp = sp->next;
213 }
214 }
215
216 void free_mb_space(MBSpace* sp)
217 {
218 MBSpace* next_sp;
219 while (sp) {
220 next_sp = sp->next;
221 //sp->space_array = MemFree(sp->space_array);
222 GFREE(sp->space_array);
223 //sp = MemFree(sp);
224 GFREE(sp);
225 sp = next_sp;
226 }
227 }
228
229 ThreeVal* get_mb_space(MBSpace* S, int amount)
230 {
231 ThreeVal* s;
232 if (amount < 0)
233 return NULL;
234
235 while (S->used+amount > S->size) {
236 if (S->next == NULL)
237 if ((S->next = new_mb_space()) == NULL) {
238 GError("Cannot get new space for greedy extension!\n");
239 return NULL;
240 }
241 S = S->next;
242 }
243 s = S->space_array+S->used;
244 S->used += amount;
245 return s;
246 }
247
248 struct GreedyAlignMem {
249 int** flast_d;
250 int* max_row_free;
251 int* uplow_free;
252 MBSpace* space;
253 int max_d;
254 GreedyAlignMem(int max_len, int reward, int penalty, int Xdrop) {
255 flast_d=NULL;
256 max_row_free=NULL;
257 max_d=0;
258 uplow_free=NULL;
259 space=NULL;
260 max_d = (int)(max_len / ERROR_FRACTION + 1);
261 int d_diff = (Xdrop+reward/2)/(penalty+reward) + 1;
262 //flast_d = (int**) malloc((max_d + 2) * sizeof(int));
263 GCALLOC(flast_d, (max_d+2)* sizeof(int));
264 if (flast_d == NULL) {
265 GError("Error: cannot allocate GreedyAlignMem structure!\n");
266 }
267 flast_d[0] = (int*)malloc((max_d + max_d + 6) * sizeof(int) * 2);
268 if (flast_d[0] == NULL) {
269 GError("Error: failed to allocate flast_d[0] in GreedyAlignMem!\n");
270 }
271 flast_d[1] = flast_d[0] + max_d + max_d + 6;
272 //for (int i=2;i<max_d+2;i++) {
273 // flast_d[i]=NULL;
274 // }
275 //search->abmp->flast_d_affine = NULL;
276 uplow_free = NULL;
277 max_row_free = (int*) malloc(sizeof(int) * (max_d + 1 + d_diff));
278 space = new_mb_space();
279 }
280 ~GreedyAlignMem() {
281 free_mb_space(space);
282 GFREE(max_row_free);
283 //GFREE(space);
284 GFREE(flast_d[0]);
285 //for (int i=0;i<max_d+2;i++) {
286 // GFREE(flast_d[i]);
287 // }
288
289 GFREE(flast_d);
290 }
291 };
292
293
294 int get_last(int **flast_d, int d, int diag, int *row1) {
295 if (flast_d[d-1][diag-1] > GMAX(flast_d[d-1][diag], flast_d[d-1][diag+1])) {
296 *row1 = flast_d[d-1][diag-1];
297 return diag-1;
298 }
299 if (flast_d[d-1][diag] > flast_d[d-1][diag+1]) {
300 *row1 = flast_d[d-1][diag];
301 return diag;
302 }
303 *row1 = flast_d[d-1][diag+1];
304 return diag+1;
305 }
306
307
308 int MegaBlastGreedyAlign(const char* s1, int len1,
309 const char* s2, int len2,
310 bool reverse, int xdrop_threshold,
311 int match_cost, int mismatch_cost,
312 int* e1, int* e2,
313 GreedyAlignMem* abmp, SEditScript *S)
314 // Uint1 rem)
315 {
316 int col, /* column number */
317 d, /* current distance */
318 k, /* current diagonal */
319 flower, fupper, /* boundaries for searching diagonals */
320 row, /* row number */
321 MAX_D, /* maximum cost */
322 ORIGIN,
323 return_val = 0;
324 int** flast_d = abmp->flast_d; /* rows containing the last d */
325 int* max_row; /* reached for cost d=0, ... len1. */
326
327 int X_pen = xdrop_threshold;
328 int M_half = match_cost/2;
329 int Op_cost = mismatch_cost + M_half*2;
330 int D_diff = (X_pen+M_half) / Op_cost + 1;
331 int x, cur_max, b_diag = 0, best_diag = INT_MAX/2;
332 int* max_row_free = abmp->max_row_free;
333 char nlower = 0, nupper = 0;
334 MBSpace* space = abmp->space;
335 int max_len = len2;
336
337 MAX_D = (int) (len1/ERROR_FRACTION + 1);
338 ORIGIN = MAX_D + 2;
339 GMessage("D_diff=%d, MAX_D=%d\n", D_diff, MAX_D);
340
341 *e1 = *e2 = 0;
342 //find first mismatch
343 if (reverse) {
344 /*
345 if (!(rem & 4)) {
346 for (row = 0; row < len2 && row < len1 &&
347 (s2[len2-1-row] ==
348 READDB_UNPACK_BASE_N(s1[(len1-1-row)/4],
349 3-(len1-1-row)%4));
350 row++)
351 ; //empty
352 } else {
353 */
354 for (row = 0; row < len2 && row < len1 && (s2[len2-1-row] == s1[len1-1-row]); row++)
355 ; //empty
356 //}
357 } else {
358 /*
359 if (!(rem & 4)) {
360 for (row = 0; row < len2 && row < len1 &&
361 (s2[row] ==
362 READDB_UNPACK_BASE_N(s1[(row+rem)/4],
363 3-(row+rem)%4));
364 row++)
365 ; //empty
366 } else { */
367 for (row = 0; row < len2 && row < len1 && (s2[row] == s1[row]); row++)
368 ; //empty
369 //}
370 }
371 *e1 = row;
372 *e2 = row;
373 if (row == len1) {
374 if (S != NULL)
375 edit_script_rep(S, row);
376 // hit last row; stop search
377 return 0;
378 }
379 if (S==NULL)
380 space = 0;
381 else if (!space)
382 abmp->space = space = new_mb_space();
383 else
384 refresh_mb_space(space);
385
386 max_row = max_row_free + D_diff;
387 for (k = 0; k < D_diff; k++)
388 max_row_free[k] = 0;
389
390 flast_d[0][ORIGIN] = row;
391 max_row[0] = (row + row)*M_half;
392
393 flower = ORIGIN - 1;
394 fupper = ORIGIN + 1;
395
396 d = 1;
397 while (d <= MAX_D) {
398 int fl0, fu0;
399 flast_d[d - 1][flower - 1] = flast_d[d - 1][flower] = -1;
400 flast_d[d - 1][fupper] = flast_d[d - 1][fupper + 1] = -1;
401 x = max_row[d - D_diff] + Op_cost * d - X_pen;
402 x = (int)ceil(x/M_half);
403 cur_max = 0;
404 fl0 = flower;
405 fu0 = fupper;
406 for (k = fl0; k <= fu0; k++) {
407 row = GMAX(flast_d[d - 1][k + 1], flast_d[d - 1][k]) + 1;
408 row = GMAX(row, flast_d[d - 1][k - 1]);
409 col = row + k - ORIGIN;
410 if (row + col >= x)
411 fupper = k;
412 else {
413 if (k == flower)
414 flower++;
415 else
416 flast_d[d][k] = -1;
417 continue;
418 }
419 if (row > max_len || row < 0) {
420 flower = k+1; nlower = 1;
421 } else {
422 /* Slide down the diagonal. Don't do this if reached
423 the end point, which has value 0x0f */
424 if (reverse) {
425 if (s2[len2-row] != 0x0f) {
426 /*if (!(rem & 4))
427 while (row < len2 && col < len1 && s2[len2-1-row] ==
428 READDB_UNPACK_BASE_N(s1[(len1-1-col)/4],
429 3-(len1-1-col)%4)) {
430 ++row;
431 ++col;
432 }
433 else */
434 while (row < len2 && col < len1 && s2[len2-1-row] == s1[len1-1-col]) {
435 ++row;
436 ++col;
437 }
438 } else {
439 max_len = row;
440 flower = k+1; nlower = 1;
441 }
442 } else if (s2[row-1] != 0x0f) {
443 /*if (!(rem & 4)) {
444 while (row < len2 && col < len1 && s2[row] ==
445 READDB_UNPACK_BASE_N(s1[(col+rem)/4],
446 3-(col+rem)%4)) {
447 ++row;
448 ++col;
449 }
450 } else { */
451 while (row < len2 && col < len1 && s2[row] == s1[col]) {
452 ++row;
453 ++col;
454 }
455 // }
456 } else {
457 max_len = row;
458 flower = k+1; nlower = 1;
459 }
460 }
461 flast_d[d][k] = row;
462 if (row + col > cur_max) {
463 cur_max = row + col;
464 b_diag = k;
465 }
466 if (row == len2) {
467 flower = k+1; nlower = 1;
468 }
469 if (col == len1) {
470 fupper = k-1; nupper = 1;
471 }
472 } //for k
473 k = cur_max*M_half - d * Op_cost;
474 if (max_row[d - 1] < k) {
475 max_row[d] = k;
476 return_val = d;
477 best_diag = b_diag;
478 *e2 = flast_d[d][b_diag];
479 *e1 = (*e2)+b_diag-ORIGIN;
480 } else {
481 max_row[d] = max_row[d - 1];
482 }
483 if (flower > fupper)
484 break;
485 d++;
486 if (!nlower) flower--;
487 if (!nupper) fupper++;
488 if (S==NULL)
489 flast_d[d] = flast_d[d - 2];
490 else {
491 // space array consists of ThreeVal structures which are
492 // 3 times larger than int, so divide requested amount by 3
493 flast_d[d] = (int*) get_mb_space(space, (fupper-flower+7)/3);
494 if (flast_d[d] != NULL)
495 flast_d[d] = flast_d[d] - flower + 2;
496 else
497 return return_val;
498 }
499 } //while d<=MAX_D
500
501 if (S!=NULL) { /*trace back*/
502 int row1, col1, diag1, diag;
503 d = return_val; diag = best_diag;
504 row = *e2; col = *e1;
505 while (d > 0) {
506 diag1 = get_last(flast_d, d, diag, &row1);
507 col1 = row1+diag1-ORIGIN;
508 if (diag1 == diag) {
509 if (row-row1 > 0) edit_script_rep(S, row-row1);
510 } else if (diag1 < diag) {
511 if (row-row1 > 0) edit_script_rep(S, row-row1);
512 edit_script_ins(S,1);
513 } else {
514 if (row-row1-1> 0) edit_script_rep(S, row-row1-1);
515 edit_script_del(S, 1);
516 }
517 d--; diag = diag1; col = col1; row = row1;
518 } //while d>0
519 edit_script_rep(S, flast_d[0][ORIGIN]);
520 if (!reverse)
521 edit_script_reverse_inplace(S);
522 }
523 return return_val;
524 }
525
526
527 #define GAPALIGN_SUB ((unsigned char)0) /*op types within the edit script*/
528 #define GAPALIGN_INS ((unsigned char)1)
529 #define GAPALIGN_DEL ((unsigned char)2)
530 #define GAPALIGN_DECLINE ((unsigned char)3)
531 char xgapcodes[4]={'S','I', 'D', 'X'};
532
533 struct GapXEditScript {
534 unsigned char op_type; // GAPALIGN_SUB, GAPALIGN_INS, or GAPALIGN_DEL
535 int num; // Number of operations
536 GapXEditScript* next;
537 GapXEditScript() {
538 op_type=0;
539 num=0;
540 next=NULL;
541 }
542 };
543
544 class CSeqGap { //
545 public:
546 int offset;
547 int len;
548 CSeqGap(int gofs=0,int glen=1) {
549 offset=gofs;
550 len=glen;
551 }
552 };
553
554 class CAlnGapInfo {
555 int a_ofs; //alignment start on seq a (0 based)
556 int b_ofs; //alignment start on seq b (0 based)
557 int a_len; //length of alignment on seq a
558 int b_len; //length of alignment on seq b
559 public:
560 GVec<CSeqGap> a_gaps;
561 GVec<CSeqGap> b_gaps;
562 CAlnGapInfo(SEditScript* ed_script, int astart=0, int bstart=0):a_gaps(),b_gaps() {
563 a_ofs=astart;
564 b_ofs=bstart;
565 a_len=0;
566 b_len=0;
567 for (int i=0; i<ed_script->num; i++) {
568 int num=((ed_script->op[i]) >> 2);
569 char op_type = 3 - ( ed_script->op[i] & EDIT_OP_MASK );
570 if (op_type == 3 || op_type < 0 )
571 GError("Error: encountered op_type %d in ed_script?!\n", (int)op_type);
572 CSeqGap gap;
573 switch (op_type) {
574 case GAPALIGN_SUB: a_len+=num;
575 b_len+=num;
576 break;
577 case GAPALIGN_INS: a_len+=num;
578 gap.offset=b_ofs+b_len;
579 gap.len=num;
580 b_gaps.Add(gap);
581 break;
582 case GAPALIGN_DEL: b_len+=num;
583 gap.offset=a_ofs+a_len;
584 gap.len=num;
585 a_gaps.Add(gap);
586 break;
587 }
588 }
589 }
590 void printSeqA(const char* s) {
591 int g=0;
592 int aend=a_ofs+a_len;
593 for (int i=a_ofs;i<aend;i++) {
594 if (g<a_gaps.Count() && a_gaps[g].offset==i) {
595 for (int j=0;j<a_gaps[g].len;j++) fprintf(stderr, "-");
596 g++;
597 }
598 fprintf(stderr, "%c", s[i]);
599 }
600 fprintf(stderr, "\n");
601 }
602 void printSeqB(const char* s) {
603 int g=0;
604 int bend=b_ofs+b_len;
605 for (int i=b_ofs;i<bend;i++) {
606 if (g<b_gaps.Count() && b_gaps[g].offset==i) {
607 for (int j=0;j<b_gaps[g].len;j++) fprintf(stderr, "-");
608 g++;
609 }
610 fprintf(stderr, "%c", s[i]);
611 }
612 fprintf(stderr, "\n");
613 }
614 };
615
616 GapXEditScript* EditScript2GapX(SEditScript* ed_script) {
617 GapXEditScript* esp_array=NULL;
618 uint i;
619 if (ed_script==NULL || ed_script->num == 0)
620 return NULL;
621 //GMALLOC(esp_array, ed_script->num * sizeof(GapXEditScript));
622 esp_array = new GapXEditScript[ed_script->num];
623 for (i=0; i<ed_script->num; i++) {
624 //GCALLOC(esp, sizeof(GapXEditScript));
625 //esp->num = EDIT_VAL(ed_script->op[i]);
626 //esp->num = ((ed_script->op[i]) >> 2);
627 esp_array[i].num=((ed_script->op[i]) >> 2);
628 //esp->op_type = 3 - EDIT_OPC(ed_script->op[i]);
629 //esp->op_type = 3 - ( ed_script->op[i] & EDIT_OP_MASK );
630 esp_array[i].op_type = 3 - ( ed_script->op[i] & EDIT_OP_MASK );
631 if (esp_array[i].op_type == 3)
632 //fprintf(stderr, "op_type = 3\n");
633 GError("Error: EditScript2GapX encountered op_type 3 ?!\n");
634 if (i>0)
635 esp_array[i-1].next=&esp_array[i];
636 }
637 return esp_array;
638 }
639
640
641
642 struct GappedHSPInfo {
643 const char *qseq;
644 int ql,qr;
645 const char *sseq;
646 int sl,sr;
647 int score;
648 double pid;
649 GapXEditScript* editscript;
650 CAlnGapInfo* gapinfo;
651 GappedHSPInfo(const char* q, int q_l, int q_r, const char* s, int s_l, int s_r,
652 int sc=0, double percid=0) {
653 qseq=q;
654 sseq=s;
655 ql=q_l;
656 qr=q_r;
657 sl=s_l;
658 sr=s_r;
659 score=sc;
660 pid=percid;
661 editscript=NULL;
662 gapinfo=NULL;
663 }
664 ~GappedHSPInfo() {
665 delete[] editscript;
666 delete gapinfo;
667 }
668 };
669
670 GappedHSPInfo* GreedyAlign(const char* q_seq, int q_off, const char* s_seq, int s_off,
671 int reward, int penalty, int xdrop) {
672
673 SEditScript* ed_script_fwd = edit_script_new();
674 SEditScript* ed_script_rev = edit_script_new();
675 int q_ext_l=0, q_ext_r=0, s_ext_l=0, s_ext_r=0;
676 int score=0;
677 int slen=strlen(s_seq); //subj
678 int qlen=strlen(q_seq); //query
679 const char* q=q_seq+q_off;
680 int q_avail=qlen-q_off;
681 const char* s=s_seq+s_off;
682 int s_avail=slen-s_off;
683 int MIN_GREEDY_SCORE=5; //minimum score for an alignment to be reported
684 GreedyAlignMem* abmp=new GreedyAlignMem(slen, reward, -penalty, xdrop);
685 // extend to the right
686 score = MegaBlastGreedyAlign(s, s_avail, q, q_avail, false, xdrop,
687 reward, -penalty, &s_ext_r, &q_ext_r, abmp, ed_script_fwd);
688 GMessage("fwd score=%d (up to a_r=%d, b_r=%d)\n", score, q_ext_r, s_ext_r);
689 /* extend to the left */
690 score += MegaBlastGreedyAlign(s_seq, s_off, q_seq, q_off, true, xdrop,
691 reward, -penalty, &s_ext_l, &q_ext_l, abmp, ed_script_rev);
692 GMessage("rvs score=%d (back to a_l=%d, b_l=%d)\n", score, q_ext_l, s_ext_l);
693 delete abmp;
694 GappedHSPInfo* ghsp=NULL;
695 // In basic case the greedy algorithm returns number of
696 // differences, hence we need to convert it to score
697 int retscore = (q_ext_r + s_ext_r + q_ext_l + s_ext_l)*reward/2 - score*(reward-penalty);
698 GMessage("Actual score: %d\n",retscore);
699 edit_script_append(ed_script_rev, ed_script_fwd); //combine the two extensions
700 if (retscore>=MIN_GREEDY_SCORE) {
701 ghsp=new GappedHSPInfo(q_seq, q_ext_l,q_ext_r, s_seq, s_ext_l, s_ext_r);
702 int hsp_length = GMIN(q_ext_l+q_ext_r, s_ext_l+s_ext_r);
703 ghsp->score=retscore;
704 ghsp->pid = 100 * (1 - ((double) score) / hsp_length);
705 //ghsp->editscript = EditScript2GapX(ed_script_rev);
706
707 ghsp->gapinfo = new CAlnGapInfo(ed_script_rev, ghsp->ql, ghsp->sl);
708 edit_script_free(ed_script_rev);
709 edit_script_free(ed_script_fwd);
710 }
711 return ghsp;
712 }
713
714 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
715 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
716
717 char* seq_cleanup(const char* s) {
718 char* r=NULL;
719 int slen=strlen(s);
720 GMALLOC(r, slen+1);
721 int newlen=0;
722 for (int i=0;i<slen;i++) {
723 if (s[i]<'A' || s[i]>'z' || (s[i]>'Z' && s[i]<'a')) continue;
724 r[newlen]=s[i];
725 newlen++;
726 }
727 r[newlen]=0;
728 return r;
729 }
730
731 void loadSeqs(const char* fname, char*& s1, char*& s2) {
732 GLineReader lr(fname);
733 char* l=NULL;
734 s1=NULL;
735 s2=NULL;
736 while ((l=lr.nextLine())!=NULL) {
737 if (lr.length()<=3 || l[0]=='#') continue;
738 if (s1==NULL) {
739 s1=seq_cleanup(l);
740 }
741 else if (s2==NULL) s2=seq_cleanup(l);
742 else break;
743 }
744 }
745
746 //================= MAIN =====================
747
748 int main(int argc, char * const argv[]) {
749 GArgs args(argc, argv, "a:b:i:");
750 int e;
751 if ((e=args.isError())>0)
752 GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
753 if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
754 char* seqa=NULL;
755 char* seqb=NULL;
756 GStr s=args.getOpt('i');
757 if (!s.is_empty()) loadSeqs(s.chars(), seqa, seqb);
758 else {
759 s=args.getOpt('a');
760 if (!s.is_empty()) seqa=seq_cleanup(s.chars());
761 s=args.getOpt('b');
762 if (!s.is_empty()) seqb=seq_cleanup(s.chars());
763 }
764 if (seqa==NULL || seqb==NULL) GError("Error: both sequences are required!\n");
765 GMessage("seqa: %s\n",seqa);
766 GMessage("seqb: %s\n",seqb);
767 //offsets must be in the middle of a good (best) HSP (ungapped match)
768 GappedHSPInfo* ghsp=GreedyAlign(seqa, 0, seqb, 0, 2, -3, 22);
769 if (ghsp!=NULL) {
770 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", ghsp->ql, ghsp->qr,
771 ghsp->sl, ghsp->sr, ghsp->score, ghsp->pid);
772 if (ghsp->editscript!=NULL) {
773 GMessage("Edit script: ");
774 GapXEditScript* ges=ghsp->editscript;
775 do {
776 GMessage("%d%c ",ges->num, xgapcodes[ges->op_type]);
777 } while ((ges=ges->next)!=NULL);
778 GMessage("\n");
779 }
780 if (ghsp->gapinfo!=NULL) {
781 GMessage("Alignment:\n");
782 ghsp->gapinfo->printSeqA(seqa);
783 ghsp->gapinfo->printSeqB(seqb);
784 }
785 delete ghsp;
786 }
787 //---- done here
788 GFREE(seqa);
789 GFREE(seqb);
790 }