ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/mblasm/mblaor.cpp
Revision: 148
Committed: Thu Dec 22 21:02:45 2011 UTC (12 years, 9 months ago) by gpertea
File size: 14301 byte(s)
Log Message:
added mblasm sources

Line File contents
1 #include <stdio.h>
2 #include <ctype.h>
3 #include <math.h>
4 #include "GArgs.h"
5 #include "GStr.h"
6 #include "GHash.hh"
7 #include "GList.hh"
8 #include "GCdbYank.h"
9 #include "GapAssem.h"
10 #define USAGE "Usage:\n\
11 mblaor <nrcl_layouts_w_gapinfo> -d <fastadb.cidx> [-r <ref_db.cidx]\n\
12 [-o <outfile_ace>] [-c <clipmax[%]>] [-p <ref_prefix>]\n\
13 \n\
14 <nrcl_layouts_w_gapinfo> is the .lyt file with alignment information\n\
15 as produced by nrcl\n\
16 -d cdb index (created with cdbfasta) for a multi-fasta file containing\n\
17 the actual nucleotide sequences of the components (required)\n\
18 -r cdb index for a multi-fasta file with the reference sequences\n\
19 (only needed if they are not given in the input .lyt file nor\n\
20 in <fastadb>)\n\
21 -p only consider a nrcl cluster if the reference sequence name starts\n\
22 with <ref_prefix>\n\
23 -c maximum clipping allowed for component sequences when\n\
24 mapped onto a reference; if it ends with '%' then maximum\n\
25 clipping is estimated for each read as <clipmax> percent\n\
26 of its length\n\
27 -o the output acefile is written to <outfile.ace> instead of stdout\n\
28 -G do not remove consensus gaps (default is to edit sequences\n\
29 in order to remove gap-dominated columns in MSA)\n"
30
31 #define LOG_MSG_CLIPMAX "Overlap between %s and reference %s rejected due to clipmax=%4.2f constraint.\n"
32 #define LOG_MSG_OVLCLIP "Overlap between %s and %s invalidated by the \
33 %dnt clipping of %s at %d' end.\n"
34 //-------- global variables
35 bool debugMode=false;
36 bool removeConsGaps=false;
37 char* ref_prefix=NULL;
38 bool verbose=false;
39 int rlineno=0;
40
41 static FILE* outf;
42 static GHash<GASeq> seqs(false);
43 static GList<GSeqAlign> alns(true, true,false);
44 // sorted, free element, not unique
45
46 // flag for reference status:
47 static const unsigned char flag_IS_REF=0;
48
49 //this is only for tree/transitive embedding:
50 static const unsigned char flag_HAS_PARENT=1;
51
52 float clipmax=0;
53 //--------------------------------
54 class RefAlign {
55 char* linecpy;
56 int lineno; //input file line#
57 char* gpos;
58 char* gpos_onref;
59 char* gaps;//char string with gaps in this read
60 char* gaps_onref;//char string with gaps in reference
61 public:
62 char* seqname; //aligned read
63 int seqlen; //length of this read
64 int offset; //offset of this read relative to reference
65 int clip5;
66 int clip3;
67 char reverse; //is this reversed?
68 void parseErr(int fldno);
69 RefAlign(char* line, int len, int lno);
70 ~RefAlign();
71 int nextRefGap(int& pos);
72 int nextSeqGap(int& pos);
73 };
74
75 void loadAlnSeqs(GSeqAlign* aln, GCdbYank* cdbynk, GCdbYank* refcdb=NULL);
76 void printDebugAln(FILE* f, GSeqAlign* aln, int num, GCdbYank* cdbynk, GCdbYank* refcdb=NULL);
77
78 //returns the end of a space delimited token
79 void skipSp(char*& p) {
80 while (*p==' ' || *p=='\t' || *p=='\n') p++;
81 }
82 char* endSpToken(char* str) {
83 if (str==NULL || *str==0) return NULL;
84 char* p=str;
85 for (;*p!=0;p++)
86 if (*p==' ' || *p=='\t' || *p=='\n') return p;
87 return p; // *p is '\0'
88 }
89
90 //-- prepareMerge checks clipping and even when no clipmax is given,
91 // adjusts clipping as appropriately
92
93 //========================================================
94 //==================== main =====================
95 //========================================================
96 int main(int argc, char * const argv[]) {
97 //GArgs args(argc, argv, "DGvd:o:c:");
98 GArgs args(argc, argv, "DGvd:p:r:o:c:");
99 int e;
100 if ((e=args.isError())>0)
101 GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
102 if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
103 debugMode=(args.getOpt('D')!=NULL);
104 removeConsGaps=(args.getOpt('G')==NULL);
105 verbose=(args.getOpt('v')!=NULL);
106 if (debugMode) verbose=true;
107 MSAColumns::removeConsGaps=removeConsGaps;
108 GStr infile;
109 if (args.startNonOpt()) {
110 infile=args.nextNonOpt();
111 //GMessage("Given file: %s\n",infile.chars());
112 }
113 //==
114 FILE* inf;
115 if (!infile.is_empty()) {
116 inf=fopen(infile, "r");
117 if (inf==NULL)
118 GError("Cannot open input file %s!\n",infile.chars());
119 }
120 else
121 inf=stdin;
122 GStr s=args.getOpt('c');
123 if (!s.is_empty()) {
124 bool ispercent=(s[-1]=='%');
125 if (ispercent) s.trimR('%');
126 int c=s.asInt();
127 if (c<=0) GError("Error: invalid -c <clipmax> (%d) option provided (must be "
128 "a positive integer)!\n",c);
129 if (ispercent && c>99)
130 GError("Error: invalid percent value (%d) for -c option "
131 " (must be an integer between 1 and 99)!\n",c);
132 if (ispercent) {
133 clipmax = float(c)/100;
134 int maxp=iround(100*clipmax);
135 if (verbose) fprintf(stderr,
136 "Percentual max clipping set to %d%%\n",
137 maxp);
138 }
139 else {
140 clipmax=c;
141 if (verbose) fprintf(stderr, "Max clipping set to %d bases\n",
142 c);
143 }
144
145 } //clipmax option
146 ref_prefix=args.getOpt('p');
147 GStr outfile=args.getOpt('o');
148 if (!outfile.is_empty()) {
149 outf=fopen(outfile, "w");
150 if (outf==NULL)
151 GError("Cannot open file %s for writing!\n",outfile.chars());
152 }
153 else outf=stdout;
154 //************************************************
155
156
157 GStr dbidx=args.getOpt('d');
158 if (dbidx.is_empty())
159 GError("%sError: a cdb index of a fasta file must be provided!\n",USAGE);
160 GCdbYank* cdbyank=new GCdbYank(dbidx.chars());
161 GCdbYank* refcdb = NULL;
162 dbidx=args.getOpt('r');
163 if (!dbidx.is_empty())
164 refcdb=new GCdbYank(dbidx.chars());
165
166 GLineReader* linebuf=new GLineReader(inf);
167 char* line;
168 alns.setSorted(compareOrdnum);
169 GASeq* refseq=NULL; // current reference sequence
170 int ref_numseqs=0;
171 int ref_len=0;
172 int ref_lend=0, ref_rend=0;
173 int gaplen,gappos;
174 bool skipRefContig=false;
175 while ((line=linebuf->getLine())!=NULL) {
176 RefAlign* aln=NULL;
177 if (line[0]=='>') {
178 //establish current reference
179 char* ref_name=&line[1];
180 char* p=endSpToken(ref_name);
181 if (p==NULL) GError("Error parsing reference sequence name at:\n%s\n",line);
182 if (ref_prefix!=NULL) {
183 if (startsWith(ref_name,ref_prefix)) {
184 skipRefContig=false;
185 }
186 else {
187 skipRefContig=true;
188 goto NEXT_LINE_NOCHANGE;
189 }
190 }
191 *p=0;
192 if (seqs.Find(ref_name))
193 GError("Error (line %d): reference sequence %s is not unique in the input data\n",
194 rlineno+1,ref_name);
195 p++;
196 if (!parseInt(p,ref_numseqs))
197 GError("Error parsing the number of components for reference sequence %s\n",ref_name);
198 if (!parseInt(p,ref_lend))
199 GError("Error parsing the left end of reference sequence %s\n",ref_name);
200 if (!parseInt(p,ref_rend))
201 GError("Error parsing the right end of reference sequence %s\n",ref_name);
202 ref_len=ref_rend;
203 refseq=new GASeq(ref_name, 0, ref_len,
204 ref_lend-1, ref_len-ref_rend, 0);
205 refseq->setFlag(flag_IS_REF);
206 seqs.Add(ref_name,refseq);
207 //may be followed by actual nucleotide sequence of this reference
208 while (*p!=0 && *p!='\n') {
209 if (*p!='\t' && *p!=' ') refseq->extendSeq(*p);
210 p++;
211 }
212 if (refseq->len>0) {
213 refseq->endSeq();
214 if (refseq->len!=refseq->seqlen)
215 GError("Error: reference sequence %s length mismatch "
216 "(declared %d, found %d)\n",
217 refseq->id, refseq->seqlen, refseq->len);
218 }
219 goto NEXT_LINE_NOCHANGE;
220 } //reference line
221 //-- component/read line --
222 if (skipRefContig) goto NEXT_LINE_NOCHANGE;
223 else {
224 //else:
225 //-------------------------------------------
226 aln=new RefAlign(line, linebuf->length(), rlineno+1);
227 if (strcmp(aln->seqname, refseq->id)==0)
228 goto NEXT_LINE_NOCHANGE; //skip redundant inclusion of reference as its own child
229 if (seqs.Find(aln->seqname))
230 GError("Error (line %d): component sequence %s must be unique in the input data\n",
231 rlineno+1,aln->seqname);
232 if (clipmax>0) {
233 //check if any of the clipping involved is larger than clipmax
234 int maxovh=(clipmax<1.00)?
235 iround(clipmax * (float)aln->seqlen) :
236 (int)clipmax;
237 if (aln->clip3>maxovh || aln->clip5>maxovh) {
238 if (verbose)
239 fprintf(stderr, LOG_MSG_CLIPMAX,
240 aln->seqname, refseq->id, clipmax);
241 goto NEXT_LINE_NOCHANGE;
242 }// bad mismatching overhangs
243 }
244 GASeq* aseq=new GASeq(aln->seqname,aln->offset,aln->seqlen,
245 aln->clip5,aln->clip3,aln->reverse);
246 GASeq* rseq;
247 if (refseq->msa==NULL) {
248 rseq=refseq;
249 }
250 else rseq=new GASeq(refseq->id,0,refseq->seqlen,
251 refseq->clp5,refseq->clp3,0);
252 gappos=0;
253 while ((gaplen=aln->nextRefGap(gappos))>0)
254 rseq->setGap(gappos-1,gaplen);
255 gappos=0;
256 while ((gaplen=aln->nextSeqGap(gappos))>0)
257 aseq->setGap(gappos-1,gaplen);
258 //for mgblast alignment, only the query can be reversed
259 if (aseq->revcompl==1)
260 aseq->reverseGaps(); //don't update offset & reverse flags
261
262 GSeqAlign *newaln=new GSeqAlign(rseq, aseq);
263 if (rseq==refseq) {//first alignment with refseq
264 newaln->incOrd();
265 alns.Add(newaln);
266 goto NEXT_LINE;
267 }
268 refseq->msa->addAlign(refseq,newaln,rseq);
269 delete newaln;
270 seqs.Add(aseq->name(),aseq);
271 }// parsing new alignment
272 //---------------------
273 NEXT_LINE:
274 /* debug print the progressive alignment */
275 if (debugMode) {
276 for (int a=0;a<alns.Count();a++) {
277 printDebugAln(outf,alns.Get(a),a+1,cdbyank, refcdb);
278 }
279 }
280 NEXT_LINE_NOCHANGE:
281 //------------
282 delete aln;
283 if (linebuf->isEof()) break;
284 rlineno++;
285 //-------------
286 if (verbose) {
287 if (rlineno%1000==0) {
288 fprintf(stderr, "..%d", rlineno);
289 if (rlineno%8000==0) fprintf(stderr, "\n");
290 fflush(stderr);
291 }
292 }
293 } //----> the big loop of parsing input lines from the .lyt files
294 //*************** now build MSAs and write them
295 delete linebuf;
296 if (verbose) {
297 fprintf(stderr, "\n%d input lines processed.\n",rlineno);
298 fprintf(stderr, "Refining and printing %d MSA(s)..\n", alns.Count());
299 fflush(stderr);
300 }
301
302 //print all the alignments
303 for (int i=0;i<alns.Count();i++) {
304 GSeqAlign* a=alns.Get(i);
305 loadAlnSeqs(a,cdbyank, refcdb); //loading actual sequences for this cluster
306 if (debugMode) { //write plain text alignment file
307 fprintf(outf,">Alignment%d (%d)\n",i+1, a->Count());
308 a->print(outf, 'v');
309 }
310 else {//write a real ACE file
311 //a->buildMSA();
312 GStr ctgname;
313 ctgname.format("AorContig%d",i+1);
314 a->writeACE(outf, ctgname.chars());
315 a->freeMSA(); //free MSA and seq memory
316 }
317 } // for each PMSA cluster
318 // oooooooooo D O N E oooooooooooo
319 alns.Clear();
320 seqs.Clear();
321 fflush(outf);
322 if (outf!=stdout) fclose(outf);
323 if (inf!=stdin) fclose(inf);
324 delete cdbyank;
325 delete refcdb;
326 //GFREE(ref_prefix);
327 //GMessage("*** all done ***\n");
328 #ifdef __WIN32__
329 //getc(stdin);
330 #endif
331 }
332
333 //---------------------- RefAlign class
334 void RefAlign::parseErr(int fldno) {
335 fprintf(stderr, "Error parsing input line #%d (field %d):\n%s\n",
336 lineno, fldno, linecpy);
337 exit(3);
338 }
339
340 RefAlign::RefAlign(char* line, int len, int lno) {
341 lineno=lno;
342 linecpy=Gstrdup(line);
343 seqname=line;
344 char* p=endSpToken(seqname);
345 if (p==NULL) parseErr(0);
346 *p=0;
347 p++;
348 skipSp(p);
349 if (*p!='-' && *p!='+') parseErr(1);
350 reverse=0;
351 if (*p=='-') reverse=1;
352 p++;
353 clip5=0;
354 clip3=0;
355 if (!parseInt(p, seqlen)) parseErr(2);
356 if (!parseInt(p, offset)) parseErr(3);
357 offset--;
358 skipSp(p);
359 if (isdigit(*p)) {//clipping follows
360 if (!parseInt(p, clip5)) parseErr(4);
361 if (!parseInt(p, clip3)) parseErr(5);
362 }
363 if (reverse!=0) Gswap(clip5, clip3);
364 skipSp(p);
365 gaps=NULL;
366 gaps_onref=NULL;
367 gpos=NULL;
368 gpos_onref=NULL;
369 if (*p==0) return;
370 while (*p!=0 && *p!='R') {
371 while (*p!=' ' && *p!='\t' && *p!=0) p++;
372 skipSp(p);
373 }
374 if (*p==0) return;
375 // it is R
376 p++;
377 if (*p!=':') parseErr(6); //invalid field
378 p++;
379 if (isdigit(*p)) {//should be a number here
380 gaps=p;gpos=p;
381 }
382 else if (*p!='/') parseErr(6);
383 while (*p!=0 && *p!=' ' && *p!='/' && *p!='\t') p++;
384 if (*p==0) return;
385 if (*p=='/') {
386 *p=0;//split here
387 p++;
388 gaps_onref=p;
389 gpos_onref=p;
390 }
391 }
392 RefAlign::~RefAlign() {
393 GFREE(linecpy); //that was just for error messages
394 }
395
396 int RefAlign::nextSeqGap(int& pos) {
397 int r=1;
398 if (gpos==NULL || *gpos==0) return 0;
399 if (!parseInt(gpos,pos)) parseErr(6);
400 if (pos<=0) parseErr(6);
401 if (*gpos=='+') { //gap length following
402 gpos++;
403 if (!parseInt(gpos,r)) parseErr(6);
404 if (r<=0) parseErr(6);
405 }
406 if (*gpos==',') gpos++;
407 return r;
408 }
409
410 int RefAlign::nextRefGap(int& pos) {
411 int r=1;
412 if (gpos_onref==NULL || *gpos_onref==0) return 0;
413 if (!parseInt(gpos_onref,pos)) parseErr(6);
414 if (pos<=0) parseErr(6);
415 if (*gpos_onref=='+') { //gap length following
416 gpos_onref++;
417 if (!parseInt(gpos_onref,r)) parseErr(6);
418 if (r<=0) parseErr(6);
419 }
420 if (*gpos_onref==',') gpos_onref++;
421 return r;
422 }
423
424 void printDebugAln(FILE* f, GSeqAlign* aln, int num, GCdbYank* cdbynk, GCdbYank* refcdb) {
425 fprintf(f,">[%d]DebugAlign%d (%d)\n",rlineno+1, num, aln->Count());
426 loadAlnSeqs(aln,cdbynk, refcdb);
427 aln->print(f,'=');
428 }
429
430 void loadAlnSeqs(GSeqAlign* aln, GCdbYank* cdbynk, GCdbYank* refcdb) {
431 //FastaSeq seq;
432 for (int i=0;i<aln->Count();i++) {
433 GASeq* s=aln->Get(i);
434 if (s->len==0) {//not loaded yet
435 GCdbYank* yankdb=(s->hasFlag(flag_IS_REF) && refcdb!=NULL) ? refcdb : cdbynk;
436 if (yankdb->getRecord(s->id, *s) <= 0)
437 GError("Error retrieving sequence %s from database %s!\n",
438 s->id, yankdb->getDbName());
439 if (s->seqlen!=s->len)
440 GError("Error: sequence %s length mismatch! Declared %d, retrieved %d\n",
441 s->id, s->seqlen, s->len);
442 s->allupper();
443 s->loadProcessing();
444 }
445 }
446 }