ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/seqalign/seqalign.cpp
Revision: 92
Committed: Mon Oct 3 21:36:42 2011 UTC (12 years, 11 months ago) by gpertea
File size: 8038 byte(s)
Log Message:
switching to gclib/GAlnExtend

Line File contents
1 #include "GBase.h"
2 #include "GArgs.h"
3 #include "GStr.h"
4 #include "GList.hh"
5 #include "GAlnExtend.h"
6
7 #define USAGE "Usage:\n\
8 seqalign {-i <2line_file> | -a <seq1> -b <seq2>} \\ \n\
9 [X=<xdrop>] [M=<match_reward>] [N=<mismatch_penalty]\
10 "
11
12 bool debug=true;
13 int Xdrop=8;
14 int match_reward=2;
15 int mismatch_penalty=3;
16
17 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
18 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
19
20 char* seq_cleanup(const char* s, int& s_len) {
21 char* r=NULL;
22 int slen=strlen(s);
23 GMALLOC(r, slen+1);
24 int newlen=0;
25 for (int i=0;i<slen;i++) {
26 if (s[i]<'A' || s[i]>'z' || (s[i]>'Z' && s[i]<'a')) continue;
27 r[newlen]=s[i];
28 newlen++;
29 }
30 r[newlen]=0;
31 s_len=newlen;
32 return r;
33 }
34
35 bool loadSeqs(GLineReader& lr, char*& s1, char*& s2,
36 int& s1_len, int& s2_len) {
37 char* l=NULL;
38 s1=NULL;
39 s2=NULL;
40 while ((l=lr.nextLine())!=NULL) {
41 if (lr.length()<=3 || l[0]=='#') continue;
42 if (s1==NULL) {
43 s1=seq_cleanup(l, s1_len);
44 }
45 else if (s2==NULL) {
46 s2=seq_cleanup(l, s2_len);
47 return true;
48 }
49 else break;
50 }
51 if (s1!=NULL) GFREE(s1);
52 return false;
53 }
54
55
56 GXAlnInfo* match_RightEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, SGreedyAlignMem* gxmem) {
57 bool editscript=false;
58 #ifdef GDEBUG
59 editscript=true;
60 GMessage("==========> matching Right (3') end ========\n");
61 #endif
62 GList<GXSeed> rseeds(true,true,false);
63 GXBandSet* alnbands=collectSeeds_R(rseeds, seqa, seqa_len, seqb, seqb_len);
64 GList<GXAlnInfo> galns(true,true,false);
65 int max_top_bands=5;
66 int top_band_count=0;
67 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
68 for (int b=0;b<alnbands->Count();b++) {
69 if (alnbands->Get(b)->score<4) break;
70 top_band_count++;
71 GXBand& band=*(alnbands->Get(b));
72 band.seeds.setSorted(cmpSeedScore);
73 anchor_seeds.Add(band.seeds.First());
74 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
75 }
76 #ifdef GDEBUG
77 GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
78 #endif
79
80 for (int i=0;i<anchor_seeds.Count();i++) {
81 GXSeed& aseed=*anchor_seeds[i];
82 int a1=aseed.a_ofs+(aseed.len>>1)+1;
83 int a2=aseed.b_ofs+(aseed.len>>1)+1;
84 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
85 seqb, a2, seqb_len, galn_TrimRight, editscript, gxmem,
86 match_reward, mismatch_penalty, Xdrop);
87 if (alninfo && alninfo->pid>73 && alninfo->sr>seqb_len-4 && alninfo->ql<4)
88 galns.AddIfNew(alninfo, true);
89 else delete alninfo;
90 }
91
92 #ifdef GDEBUG
93 for (int i=0;i<galns.Count();i++) {
94 GXAlnInfo* alninfo=galns[i];
95 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
96 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
97 if (alninfo->gapinfo!=NULL) {
98 GMessage("Alignment:\n");
99 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
100 }
101 }
102 #endif
103 //---- done
104 delete alnbands;
105 if (galns.Count()) {
106 GXAlnInfo* bestaln=galns.Shift();
107 return bestaln;
108 }
109 else return NULL;
110 }
111
112 GXAlnInfo* match_LeftEnd(const char* seqa, int seqa_len, const char* seqb, int seqb_len, SGreedyAlignMem* gxmem) {
113 bool editscript=false;
114 #ifdef GDEBUG
115 editscript=true;
116 GMessage("==========> matching Left (5') end ========\n");
117 #endif
118 GList<GXSeed> rseeds(true,true,false);
119 GXBandSet* alnbands=collectSeeds_L(rseeds, seqa, seqa_len, seqb, seqb_len);
120 GList<GXAlnInfo> galns(true,true,false);
121 int max_top_bands=5;
122 int top_band_count=0;
123 GList<GXSeed> anchor_seeds(cmpSeedDiag, NULL, true); //stores unique seeds per diagonal
124 for (int b=0;b<alnbands->Count();b++) {
125 if (alnbands->Get(b)->score<4) break;
126 top_band_count++;
127 GXBand& band=*(alnbands->Get(b));
128 band.seeds.setSorted(cmpSeedScore);
129 anchor_seeds.Add(band.seeds.First());
130 if (anchor_seeds.Count()>2 || top_band_count>max_top_bands) break;
131 }
132 #ifdef GDEBUG
133 GMessage("::: Collected %d anchor seeds.\n",anchor_seeds.Count());
134 #endif
135
136 for (int i=0;i<anchor_seeds.Count();i++) {
137 GXSeed& aseed=*anchor_seeds[i];
138 int a1=aseed.a_ofs+(aseed.len>>1)+1;
139 int a2=aseed.b_ofs+(aseed.len>>1)+1;
140 GXAlnInfo* alninfo=GreedyAlignRegion(seqa, a1, seqa_len,
141 seqb, a2, seqb_len, galn_TrimLeft, editscript, gxmem,
142 match_reward, mismatch_penalty, Xdrop-2);
143 if (alninfo && alninfo->pid>82 && alninfo->sl<3 && alninfo->qr>seqa_len-4)
144 //TODO: at 5' end we should use a higher pid threshold
145 // we could also decrease the X-drop value!
146 galns.AddIfNew(alninfo, true);
147 else delete alninfo;
148 }
149
150 #ifdef GDEBUG
151 for (int i=0;i<galns.Count();i++) {
152 GXAlnInfo* alninfo=galns[i];
153 GMessage("a(%d..%d) align to b(%d..%d), score=%d, pid=%4.2f\n", alninfo->ql, alninfo->qr,
154 alninfo->sl, alninfo->sr, alninfo->score, alninfo->pid);
155 if (alninfo->gapinfo!=NULL) {
156 GMessage("Alignment:\n");
157 alninfo->gapinfo->printAlignment(stderr, seqa, seqa_len, seqb, seqb_len);
158 }
159 }
160 #endif
161 //---- done
162 delete alnbands;
163 if (galns.Count()) {
164 GXAlnInfo* bestaln=galns.Shift();
165 return bestaln;
166 }
167 else return NULL;
168 }
169
170
171
172 void findTrimming(char* aseq, int alen, char* rseq, int rlen, SGreedyAlignMem* gxmem=NULL) {
173
174 GXAlnInfo* r_bestaln=match_RightEnd(aseq, alen, rseq, rlen, gxmem);
175
176 #ifdef GDEBUG
177 if (r_bestaln)
178 GMessage("******** Best 3' Alignment: %d-%d vs %d-%d, score=%d, pid=%.2f\n",
179 r_bestaln->ql, r_bestaln->qr, r_bestaln->sl, r_bestaln->sr, r_bestaln->score, r_bestaln->pid);
180 else GMessage("******** No suitable 3' alignment found.\n");
181 #endif
182 if (r_bestaln)
183 fprintf(stdout, "3prime %d-%d:%d-%d|score=%d|pid=%.2f\n",
184 r_bestaln->ql, r_bestaln->qr, r_bestaln->sl, r_bestaln->sr, r_bestaln->score, r_bestaln->pid);
185
186 delete r_bestaln;
187
188 //testing: reverse both sequences and redo the search at 5' end
189 reverseChars(aseq);
190 reverseChars(rseq);
191
192 GXAlnInfo* l_bestaln=match_LeftEnd(aseq, alen, rseq, rlen, gxmem);
193
194 #ifdef GDEBUG
195 if (l_bestaln)
196 GMessage("******** Best 5' Alignment: %d-%d vs %d-%d, score=%d, pid=%.2f\n",
197 l_bestaln->ql, l_bestaln->qr, l_bestaln->sl, l_bestaln->sr, l_bestaln->score, l_bestaln->pid);
198 else GMessage("******** No suitable 5' alignment found.\n");
199 #endif
200 if (l_bestaln)
201 fprintf(stdout, "5prime %d-%d:%d-%d|score=%d|pid=%.2f\n",
202 l_bestaln->ql, l_bestaln->qr, l_bestaln->sl, l_bestaln->sr, l_bestaln->score, l_bestaln->pid);
203
204 delete l_bestaln;
205
206 GFREE(aseq);
207 GFREE(rseq);
208 }
209
210 //================= MAIN =====================
211
212 int main(int argc, char * const argv[]) {
213 GArgs args(argc, argv, "X=M=N=a:b:i:");
214 int e;
215 if ((e=args.isError())>0)
216 GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
217 if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
218 char* seqa=NULL;
219 char* seqb=NULL;
220 GStr s=args.getOpt('X');
221 if (!s.is_empty())
222 Xdrop=s.asInt();
223 s=args.getOpt('M');
224 if (!s.is_empty())
225 match_reward=s.asInt();
226 s=args.getOpt('N');
227 if (!s.is_empty())
228 mismatch_penalty=s.asInt();
229
230 s=args.getOpt('i');
231 int seqa_len=0, seqb_len=0;
232 if (!s.is_empty()) {
233 GLineReader lr(s.chars());
234 SGreedyAlignMem* gxmem=new SGreedyAlignMem(match_reward, mismatch_penalty, Xdrop);
235 while (loadSeqs(lr, seqa, seqb, seqa_len, seqb_len)) {
236 findTrimming(seqa, seqa_len, seqb, seqb_len, gxmem);
237 }
238 delete gxmem;
239 }
240 else {
241 s=args.getOpt('a');
242 if (!s.is_empty()) seqa=seq_cleanup(s.chars(), seqa_len);
243 s=args.getOpt('b');
244 if (!s.is_empty()) seqb=seq_cleanup(s.chars(), seqb_len);
245 if (seqa==NULL || seqa_len<2 || seqb==NULL || seqb_len<2)
246 GError("Error: two sequences with length>1 should be given!\n");
247 findTrimming(seqa, seqa_len, seqb, seqb_len);
248 }
249
250 }