ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/mblasm/GapAssem.h
Revision: 148
Committed: Thu Dec 22 21:02:45 2011 UTC (12 years, 8 months ago) by gpertea
File size: 14155 byte(s)
Log Message:
added mblasm sources

Line User Rev File contents
1 gpertea 148 #ifndef G_GAP_ASSEM_DEFINED
2     #define G_GAP_ASSEM_DEFINED
3     #include "GFastaFile.h"
4     #include "gdna.h"
5     #include "GList.hh"
6     #include "GHash.hh"
7     #include <ctype.h>
8    
9     class GSeqAlign;
10     class MSAColumns;
11    
12     struct SeqDelOp {
13     int pos;
14     bool revcompl;
15     SeqDelOp(int p, bool r) { pos=p; revcompl=r;}
16     //
17     bool operator==(SeqDelOp& d){
18     return (this==&d);
19     }
20     bool operator>(SeqDelOp& d){
21     return (this>&d);
22     }
23     bool operator<(SeqDelOp& d){
24     return (this<&d);
25     }
26    
27     };
28    
29     class GASeq : public FastaSeq {
30     protected:
31     int numgaps; //total number of accumulated gaps in this sequence
32     short *ofs; //array of gaps at each position;
33     //a negative value (-1) means DELETION of the nucleotide
34     //at that position!
35    
36     #ifdef ALIGN_COVERAGE_DATA
37     int* cov; //coverage of every nucleotide of this seq
38     // it starts with 0 by itself
39     // it'll be decreased by -1 for mismatching ends!
40     #endif
41     GList<SeqDelOp>* delops; //delete operations
42     public:
43     unsigned char flags; //8 general purpose boolean flags (bits)
44     // bad_align flag is the last bit -- i.e. bit 7
45     // all the others (0..6) are free for custom use
46     GSeqAlign* msa;
47     int msaidx; //actual index at which this sequence is to be found in GASeqAlign;
48     int seqlen; // exactly the size of ofs[]
49     int offset; //offset in the layout
50     int ng_ofs; //non-gapped offset in the layout
51     //(approx, for clipping constraints only)
52     char revcompl; //0 = forward, 1=reverse
53     int ext5; // layout-positional extension at 5' end
54     int ext3; // layout-positional extension at 3' end
55     //--
56     int clp5; //ever "disproved" region at 5' end
57     int clp3; //ever "disproved" region at 3' end
58     //------- comparison operators (for GList) :
59     //sorting by offset in cluster
60     void allupper() {
61     for (int i=0;i<len;i++) {
62     seq[i]=toupper(seq[i]);
63     }
64     }
65     void reverseComplement() {
66     if (len==0) return;
67     //ntCompTableInit();
68     reverseChars(seq,len);
69     for (int i=0;i<len;i++) seq[i]=ntComplement(seq[i]);
70     }
71     bool operator==(GASeq& d){
72     return (offset==d.offset && strcmp(id,d.id)==0);
73     }
74     bool operator>(GASeq& d){
75     return (offset>d.offset);
76     }
77     bool operator<(GASeq& d){
78     return (offset<d.offset);
79     }
80     friend class GSeqAlign;
81     //-------------------------------
82     GASeq(char* sname, char* sdesrc=NULL, char* sseq=NULL);
83     GASeq(char* sname, int soffset, int slen, int sclipL=0, int sclipR=0, char rev=0);
84     ~GASeq();
85     void refineClipping(char* cons, int cons_len, int cpos, bool skipDels=false);
86     void setGap(int pos, short gaplen=1); // set the gap in this pos
87     void addGap(int pos, short gapadd); //extend the gap in this pos
88     //bitno is 0 based here, for simplicity:
89     inline void setFlag(unsigned char bitno) { flags |= ((unsigned char)1 << bitno); }
90     inline void clearFlag(unsigned char bitno) { flags ^= ((unsigned char)1 << bitno); }
91     inline bool hasFlag(unsigned char bitno) { return ( (((unsigned char)1 << bitno) & flags) !=0 ); }
92     int getNumGaps() { return numgaps; }
93     int gap(int pos) { return ofs[pos]; }
94     void removeBase(int pos); //remove the nucleotide at that position
95     int endOffset() { return offset+seqlen+numgaps; }
96     int endNgOffset() { return ng_ofs+seqlen; }
97     int removeClipGaps(); //remove gaps within clipped regions
98     //offset should be corrected appropriately!
99     void printGappedSeq(FILE* f, int baseoffs=0);
100     void printGappedSeq(int baseoffs=0) { printGappedSeq(stdout, baseoffs); }
101     void printGappedFasta(FILE* f);
102     void loadProcessing(); //to be called immediately after loading the sequence
103     // it will revCompl if needed and apply delops
104     #ifdef ALIGN_COVERAGE_DATA
105     void addCoverage(GASeq* s);
106     #endif
107     void reverseGaps(); //don't update offset and flags
108     //useful after reading mgblast gap info
109     void revComplement(int alignlen=0);
110     void toMSA(MSAColumns& msa);
111     };
112    
113     // -- nucleotide origin -- for every nucleotide in a MSA column
114     // this info is needed by SNP reporting
115     class NucOri {
116     public:
117     GASeq* seq;
118     int pos; //0-based position of nucleotide letter
119     NucOri() { seq=NULL; pos=0; }
120     NucOri(GASeq* s, int p) { seq=s; pos=p; }
121     bool operator==(NucOri& d){
122     return (strcmp(seq->id,d.seq->id)==0 && pos==d.pos);
123     }
124     bool operator>(NucOri& d){
125     int cmp=strcmp(seq->id,d.seq->id);
126     if (cmp==0) return pos>d.pos;
127     else return cmp>0;
128     }
129     bool operator<(NucOri& d){
130     int cmp=strcmp(seq->id,d.seq->id);
131     if (cmp==0) return pos<d.pos;
132     else return cmp<0;
133     }
134     };
135    
136     struct SeqClipOp {
137     GASeq* seq;
138     int clp[2];
139     SeqClipOp(GASeq* s, int newclp5, int newclp3=-1) {
140     seq=s;
141     clp[0]=newclp5;
142     clp[1]=newclp3;
143     }
144     //--
145     bool operator==(SeqClipOp& d){
146     return (this==&d);
147     }
148     bool operator>(SeqClipOp& d){
149     return (this>&d);
150     }
151     bool operator<(SeqClipOp& d){
152     return (this<&d);
153     }
154     };
155    
156     class AlnClipOps :public GList<SeqClipOp> {
157     public:
158     char q_rev;
159     int d5;
160     int d3;
161     //---
162     int total;
163     AlnClipOps():GList<SeqClipOp>(false,true,false) {
164     total=0;
165     d5=0;
166     d3=0;
167     q_rev=false;
168     }
169     bool add5(GASeq* s, int clp, float clipmax) {
170     if (s->clp5<clp) {
171     if (clipmax>0) {
172     int maxovh = clipmax>1 ? (int)clipmax : iround(clipmax * (float)s->seqlen);
173     if (clp>maxovh) return false;
174     }
175     //----- base test: the read should be left with no less than 25% of its length
176     if (s->seqlen-s->clp3-clp < (s->seqlen >> 2)) return false;
177     total+=10000+clp-s->clp5;
178     Add(new SeqClipOp(s,clp,-1));
179     }
180     return true;
181     }
182     bool add3(GASeq* s, int clp, float clipmax) {
183     if (s->clp3<clp) {
184     if (clipmax>0) {
185     int maxovh = clipmax>1 ? (int)clipmax : iround(clipmax * (float)s->seqlen);
186     if (clp>maxovh) return false;
187     }
188     //----- base test: if the read is left with less than 25% of its length
189     if (s->seqlen-s->clp5-clp < (s->seqlen >> 2)) return false;
190     total+= 10000+clp-s->clp3;
191     Add(new SeqClipOp(s,-1,clp));
192     }
193     return true;
194     }
195     bool add(GASeq* s, int clp5, int clp3, float clipmax) {
196     int newclp5=-1;
197     int newclp3=-1;
198     int add=0;
199     if (s->clp5<clp5) {
200     if (clipmax>0) {
201     int maxovh = clipmax>1 ? (int)clipmax : iround(clipmax * (float)s->seqlen);
202     if (clp5>maxovh) return false;
203     }
204     //----- base test: if the read is left with less than 25% of its length!
205     if (s->seqlen-s->clp3-clp5 < (s->seqlen >> 2)) return false;
206     add+= 10000+clp5-s->clp5;
207     newclp5=clp5;
208     }
209     else clp5=s->clp5;
210     if (s->clp3<clp3) {
211     if (clipmax>0) {
212     int maxovh = clipmax>1 ? (int)clipmax : iround(clipmax * (float)s->seqlen);
213     if (clp3>maxovh) return false;
214     }
215     //----- base test: if the read is left with less than 25% of its length!
216     if (s->seqlen-clp5-clp3 < (s->seqlen >> 2)) return false;
217     add+=10000+clp3-s->clp3;
218     newclp3=clp3;
219     }
220     if (add>0) {
221     total+=add;
222     Add(new SeqClipOp(s,newclp5,newclp3));
223     }
224     return true;
225     }
226     };
227    
228     class GAlnColumn {
229     protected:
230     struct NucCount {
231     char c; // A, C, G, T, N or -
232     // precisely in this order!
233     int count;
234     void set(char l, int num=0) { c=l;count=num; }
235     };
236     enum { ncA=0, ncC, ncG, ncT, ncN, ncGap };
237     NucCount counts[6];
238     bool countsSorted;
239     public:
240     bool hasClip;
241     char consensus;
242     int layers; //total "thickness"
243     NucOri* clipnuc;
244     GList<NucOri>* nucs;
245     friend int qsortnuc(const void* p1, const void* p2);
246     //int total() { return numgaps+numN+numA()+numC()+numG()+numT(); }
247     GAlnColumn() { //sorted?, free?, unique?
248     clipnuc=NULL;
249     nucs=new GList<NucOri>(false,true,false);
250     /*lstC=new GList<NucOri>(false,true,false);
251     lstG=new GList<NucOri>(false,true,false);
252     lstT=new GList<NucOri>(false,true,false);*/
253     countsSorted=false;
254     counts[ncA].set('A');
255     counts[ncC].set('C');
256     counts[ncG].set('G');
257     counts[ncT].set('T');
258     counts[ncN].set('N');
259     counts[ncGap].set('-');
260     hasClip=false;
261     layers=0;
262     consensus=0;
263     }
264     ~GAlnColumn() {
265     delete nucs;
266     if (clipnuc!=NULL) delete clipnuc;
267     }
268     void addGap() { counts[ncGap].count++;
269     layers++; //-- Not a "layer", actually
270     //numgaps++;
271     }
272     void addNuc(GASeq* seq, int pos, bool clipped=false) {
273     //assumes the seq is already loaded and reverse complemented if necessary
274     //position is precisely where it should be
275     if (clipped) {
276     if (hasClip==false) {
277     hasClip=true;
278     clipnuc = new NucOri(seq,pos);
279     }
280     return;
281     }
282     char c=(char)toupper(seq->seq[pos]);
283     switch (c) {
284     case 'A':nucs->Add(new NucOri(seq,pos));
285     counts[ncA].count++;
286     layers++;
287     break;
288     case 'C':nucs->Add(new NucOri(seq,pos));
289     counts[ncC].count++;
290     layers++;
291     break;
292     case 'G':nucs->Add(new NucOri(seq,pos));
293     counts[ncG].count++;
294     layers++;
295     break;
296     case 'T':nucs->Add(new NucOri(seq,pos));
297     counts[ncT].count++;
298     layers++;
299     break;
300     case '-': //this shouldn't be the case!
301     case '*':counts[ncGap].count++;
302     layers++;
303     //numgaps++;
304     break;
305     default: nucs->Add(new NucOri(seq,pos));
306     counts[ncN].count++;
307     layers++;
308     //numN++;
309     }//switch
310     }
311    
312     char bestChar();
313     void remove(); //removes a nucleotide from all involved sequences
314     //adjust all affected offsets in the alignment
315     };
316    
317     // A MSA columns container
318     class MSAColumns {
319     int size;
320     public:
321     static bool removeConsGaps;
322     static bool refineClipping;
323     GAlnColumn* columns;
324     int baseoffset;
325     int mincol;
326     int maxcol;
327     MSAColumns(int len, int baseofs=0) {
328     columns=new GAlnColumn[len];
329     size=len;
330     baseoffset=baseofs;
331     mincol=INT_MAX;
332     maxcol=0;
333     }
334     ~MSAColumns() {
335     size=0;
336     baseoffset=0;
337     delete[] columns;
338     }
339     GAlnColumn& operator[](int idx) {
340     if (idx<0 || idx>=size)
341     GError("MSAColumns op[]: bad index %d (size=%d)\n", idx,size);
342     return columns[idx];
343     }
344     int len() { return maxcol-mincol+1; }
345     void updateMinMax(int minc, int maxc) {
346     if (minc<mincol) mincol=minc;
347     if (maxc>maxcol) maxcol=maxc;
348     }
349     };
350    
351    
352     //-----------------------------------------------
353     // a sequence alignment: could be pairwise or MSA
354     class GSeqAlign :public GList<GASeq> {
355     static unsigned int counter;
356     int length;
357     int minoffset;
358     int consensus_cap;
359     void buildMSA();
360     void ErrZeroCov(int col);
361     public:
362     bool refinedMSA; //if refineMSA() was applied
363     MSAColumns* msacolumns;
364     unsigned int ordnum; //order number -- when it was created
365     // the lower the better (earlier=higher score)
366     int ng_len; //ungapped length and minoffset (approximative,
367     int ng_minofs; // for clipping constraints only)
368     int badseqs;
369     char* consensus; //consensus sequence (built by refineMSA())
370     int consensus_len;
371     friend class GASeq;
372     bool operator==(GSeqAlign& d){
373     return (this==&d);
374     }
375     bool operator>(GSeqAlign& d){
376     return (this>&d);
377     }
378     bool operator<(GSeqAlign& d){
379     return (this<&d);
380     }
381     //--
382     GSeqAlign():GList<GASeq>(true,true,false) {
383     //default is: sorted by GASeq offset, free nodes, non-unique
384     msacolumns=NULL;
385     ordnum=0;
386     length=0;
387     ng_len=0;
388     minoffset=0;
389     ng_minofs=0;
390     badseqs=0;
391     refinedMSA=false;
392     consensus=NULL;
393     consensus_len=0;
394     consensus_cap=0;
395     }
396     GSeqAlign(bool sorted, bool free_elements=true, bool beUnique=false)
397     :GList<GASeq>(sorted,free_elements,beUnique) {
398     ordnum=0;
399     length=0;
400     minoffset=0;
401     ng_len=0;
402     ng_minofs=0;
403     badseqs=0;
404     msacolumns=NULL;
405     consensus_len=0;
406     consensus_cap=0;
407     }
408     void incOrd() { ordnum = ++counter; }
409     //first time creation from a pairwise alignment:
410     #ifdef ALIGN_COVERAGE_DATA
411     GSeqAlign(GASeq* s1, int l1, int r1, GASeq* s2, int l2, int r2);
412     #else
413     GSeqAlign(GASeq* s1, GASeq* s2);
414     #endif
415     ~GSeqAlign() {
416     if (msacolumns!=NULL) delete msacolumns;
417     if (consensus!=NULL) GFREE(consensus);
418     }
419     int len() { return length; }
420    
421     void revComplement();
422     void addSeq(GASeq* s, int soffs, int ngofs);
423     void injectGap(GASeq* seq, int pos, int xgap);
424     void removeBase(GASeq* seq, int pos);
425     void extendConsensus(char c);
426     //try to propagate the planned trimming of a read
427     //to the whole MSA containing it
428     // returns false if too much is trimmed of any component read
429     void applyClipping(AlnClipOps& clipops);
430    
431     bool evalClipping(GASeq* seq, int c5, int c3, float clipmax,
432     AlnClipOps& clipops);
433    
434     //merge other alignment into this msa
435     //seq->id MUST be the same with oseq->id
436     // *if OK, gaps AND coverage values are propagated
437     // and omsa is deallocated
438     // *if not OK <=> the layout doesn't accept the merge
439     // due to clipmax constraint, then nothing happens
440     bool addAlign(GASeq* seq, GSeqAlign* omsa, GASeq* oseq);
441     void print(FILE* f, char c=0);
442     void print() { print(stdout); }
443     void removeColumn(int column);
444     void freeMSA();
445     void refineMSA(bool redo_ends=false);
446     // find consensus, refine clipping, remove gap-columns
447     void writeACE(FILE* f, const char* name);
448     void writeInfo(FILE* f, const char* name);
449     };
450    
451     int compareOrdnum(void* p1, void* p2);
452     int compareCounts(void* p1, void* p2);
453    
454     #endif