1 |
#include <stdio.h> |
2 |
#include <ctype.h> |
3 |
#include <math.h> |
4 |
#include "GArgs.h" |
5 |
#include "GStr.h" |
6 |
#include "GHash.hh" |
7 |
#include "GList.hh" |
8 |
#include "GCdbYank.h" |
9 |
#include "GapAssem.h" |
10 |
#define USAGE "Usage:\n\ |
11 |
mblaor <nrcl_layouts_w_gapinfo> -d <fastadb.cidx> [-r <ref_db.cidx]\n\ |
12 |
[-o <outfile_ace>] [-c <clipmax[%]>] [-p <ref_prefix>]\n\ |
13 |
\n\ |
14 |
<nrcl_layouts_w_gapinfo> is the .lyt file with alignment information\n\ |
15 |
as produced by nrcl\n\ |
16 |
-d cdb index (created with cdbfasta) for a multi-fasta file containing\n\ |
17 |
the actual nucleotide sequences of the components (required)\n\ |
18 |
-r cdb index for a multi-fasta file with the reference sequences\n\ |
19 |
(only needed if they are not given in the input .lyt file nor\n\ |
20 |
in <fastadb>)\n\ |
21 |
-p only consider a nrcl cluster if the reference sequence name starts\n\ |
22 |
with <ref_prefix>\n\ |
23 |
-c maximum clipping allowed for component sequences when\n\ |
24 |
mapped onto a reference; if it ends with '%' then maximum\n\ |
25 |
clipping is estimated for each read as <clipmax> percent\n\ |
26 |
of its length\n\ |
27 |
-o the output acefile is written to <outfile.ace> instead of stdout\n\ |
28 |
-G do not remove consensus gaps (default is to edit sequences\n\ |
29 |
in order to remove gap-dominated columns in MSA)\n" |
30 |
|
31 |
#define LOG_MSG_CLIPMAX "Overlap between %s and reference %s rejected due to clipmax=%4.2f constraint.\n" |
32 |
#define LOG_MSG_OVLCLIP "Overlap between %s and %s invalidated by the \ |
33 |
%dnt clipping of %s at %d' end.\n" |
34 |
//-------- global variables |
35 |
bool debugMode=false; |
36 |
bool removeConsGaps=false; |
37 |
char* ref_prefix=NULL; |
38 |
bool verbose=false; |
39 |
int rlineno=0; |
40 |
|
41 |
static FILE* outf; |
42 |
static GHash<GASeq> seqs(false); |
43 |
static GList<GSeqAlign> alns(true, true,false); |
44 |
// sorted, free element, not unique |
45 |
|
46 |
// flag for reference status: |
47 |
static const unsigned char flag_IS_REF=0; |
48 |
|
49 |
//this is only for tree/transitive embedding: |
50 |
static const unsigned char flag_HAS_PARENT=1; |
51 |
|
52 |
float clipmax=0; |
53 |
//-------------------------------- |
54 |
class RefAlign { |
55 |
char* linecpy; |
56 |
int lineno; //input file line# |
57 |
char* gpos; |
58 |
char* gpos_onref; |
59 |
char* gaps;//char string with gaps in this read |
60 |
char* gaps_onref;//char string with gaps in reference |
61 |
public: |
62 |
char* seqname; //aligned read |
63 |
int seqlen; //length of this read |
64 |
int offset; //offset of this read relative to reference |
65 |
int clip5; |
66 |
int clip3; |
67 |
char reverse; //is this reversed? |
68 |
void parseErr(int fldno); |
69 |
RefAlign(char* line, int len, int lno); |
70 |
~RefAlign(); |
71 |
int nextRefGap(int& pos); |
72 |
int nextSeqGap(int& pos); |
73 |
}; |
74 |
|
75 |
void loadAlnSeqs(GSeqAlign* aln, GCdbYank* cdbynk, GCdbYank* refcdb=NULL); |
76 |
void printDebugAln(FILE* f, GSeqAlign* aln, int num, GCdbYank* cdbynk, GCdbYank* refcdb=NULL); |
77 |
|
78 |
//returns the end of a space delimited token |
79 |
void skipSp(char*& p) { |
80 |
while (*p==' ' || *p=='\t' || *p=='\n') p++; |
81 |
} |
82 |
char* endSpToken(char* str) { |
83 |
if (str==NULL || *str==0) return NULL; |
84 |
char* p=str; |
85 |
for (;*p!=0;p++) |
86 |
if (*p==' ' || *p=='\t' || *p=='\n') return p; |
87 |
return p; // *p is '\0' |
88 |
} |
89 |
|
90 |
//-- prepareMerge checks clipping and even when no clipmax is given, |
91 |
// adjusts clipping as appropriately |
92 |
|
93 |
//======================================================== |
94 |
//==================== main ===================== |
95 |
//======================================================== |
96 |
int main(int argc, char * const argv[]) { |
97 |
//GArgs args(argc, argv, "DGvd:o:c:"); |
98 |
GArgs args(argc, argv, "DGvd:p:r:o:c:"); |
99 |
int e; |
100 |
if ((e=args.isError())>0) |
101 |
GError("%s\nInvalid argument: %s\n", USAGE, argv[e]); |
102 |
if (args.getOpt('h')!=NULL) GError("%s\n", USAGE); |
103 |
debugMode=(args.getOpt('D')!=NULL); |
104 |
removeConsGaps=(args.getOpt('G')==NULL); |
105 |
verbose=(args.getOpt('v')!=NULL); |
106 |
if (debugMode) verbose=true; |
107 |
MSAColumns::removeConsGaps=removeConsGaps; |
108 |
GStr infile; |
109 |
if (args.startNonOpt()) { |
110 |
infile=args.nextNonOpt(); |
111 |
//GMessage("Given file: %s\n",infile.chars()); |
112 |
} |
113 |
//== |
114 |
FILE* inf; |
115 |
if (!infile.is_empty()) { |
116 |
inf=fopen(infile, "r"); |
117 |
if (inf==NULL) |
118 |
GError("Cannot open input file %s!\n",infile.chars()); |
119 |
} |
120 |
else |
121 |
inf=stdin; |
122 |
GStr s=args.getOpt('c'); |
123 |
if (!s.is_empty()) { |
124 |
bool ispercent=(s[-1]=='%'); |
125 |
if (ispercent) s.trimR('%'); |
126 |
int c=s.asInt(); |
127 |
if (c<=0) GError("Error: invalid -c <clipmax> (%d) option provided (must be " |
128 |
"a positive integer)!\n",c); |
129 |
if (ispercent && c>99) |
130 |
GError("Error: invalid percent value (%d) for -c option " |
131 |
" (must be an integer between 1 and 99)!\n",c); |
132 |
if (ispercent) { |
133 |
clipmax = float(c)/100; |
134 |
int maxp=iround(100*clipmax); |
135 |
if (verbose) fprintf(stderr, |
136 |
"Percentual max clipping set to %d%%\n", |
137 |
maxp); |
138 |
} |
139 |
else { |
140 |
clipmax=c; |
141 |
if (verbose) fprintf(stderr, "Max clipping set to %d bases\n", |
142 |
c); |
143 |
} |
144 |
|
145 |
} //clipmax option |
146 |
ref_prefix=args.getOpt('p'); |
147 |
GStr outfile=args.getOpt('o'); |
148 |
if (!outfile.is_empty()) { |
149 |
outf=fopen(outfile, "w"); |
150 |
if (outf==NULL) |
151 |
GError("Cannot open file %s for writing!\n",outfile.chars()); |
152 |
} |
153 |
else outf=stdout; |
154 |
//************************************************ |
155 |
|
156 |
|
157 |
GStr dbidx=args.getOpt('d'); |
158 |
if (dbidx.is_empty()) |
159 |
GError("%sError: a cdb index of a fasta file must be provided!\n",USAGE); |
160 |
GCdbYank* cdbyank=new GCdbYank(dbidx.chars()); |
161 |
GCdbYank* refcdb = NULL; |
162 |
dbidx=args.getOpt('r'); |
163 |
if (!dbidx.is_empty()) |
164 |
refcdb=new GCdbYank(dbidx.chars()); |
165 |
|
166 |
GLineReader* linebuf=new GLineReader(inf); |
167 |
char* line; |
168 |
alns.setSorted(compareOrdnum); |
169 |
GASeq* refseq=NULL; // current reference sequence |
170 |
int ref_numseqs=0; |
171 |
int ref_len=0; |
172 |
int ref_lend=0, ref_rend=0; |
173 |
int gaplen,gappos; |
174 |
bool skipRefContig=false; |
175 |
while ((line=linebuf->getLine())!=NULL) { |
176 |
RefAlign* aln=NULL; |
177 |
if (line[0]=='>') { |
178 |
//establish current reference |
179 |
char* ref_name=&line[1]; |
180 |
char* p=endSpToken(ref_name); |
181 |
if (p==NULL) GError("Error parsing reference sequence name at:\n%s\n",line); |
182 |
if (ref_prefix!=NULL) { |
183 |
if (startsWith(ref_name,ref_prefix)) { |
184 |
skipRefContig=false; |
185 |
} |
186 |
else { |
187 |
skipRefContig=true; |
188 |
goto NEXT_LINE_NOCHANGE; |
189 |
} |
190 |
} |
191 |
*p=0; |
192 |
if (seqs.Find(ref_name)) |
193 |
GError("Error (line %d): reference sequence %s is not unique in the input data\n", |
194 |
rlineno+1,ref_name); |
195 |
p++; |
196 |
if (!parseInt(p,ref_numseqs)) |
197 |
GError("Error parsing the number of components for reference sequence %s\n",ref_name); |
198 |
if (!parseInt(p,ref_lend)) |
199 |
GError("Error parsing the left end of reference sequence %s\n",ref_name); |
200 |
if (!parseInt(p,ref_rend)) |
201 |
GError("Error parsing the right end of reference sequence %s\n",ref_name); |
202 |
ref_len=ref_rend; |
203 |
refseq=new GASeq(ref_name, 0, ref_len, |
204 |
ref_lend-1, ref_len-ref_rend, 0); |
205 |
refseq->setFlag(flag_IS_REF); |
206 |
seqs.Add(ref_name,refseq); |
207 |
//may be followed by actual nucleotide sequence of this reference |
208 |
while (*p!=0 && *p!='\n') { |
209 |
if (*p!='\t' && *p!=' ') refseq->extendSeq(*p); |
210 |
p++; |
211 |
} |
212 |
if (refseq->len>0) { |
213 |
refseq->endSeq(); |
214 |
if (refseq->len!=refseq->seqlen) |
215 |
GError("Error: reference sequence %s length mismatch " |
216 |
"(declared %d, found %d)\n", |
217 |
refseq->id, refseq->seqlen, refseq->len); |
218 |
} |
219 |
goto NEXT_LINE_NOCHANGE; |
220 |
} //reference line |
221 |
//-- component/read line -- |
222 |
if (skipRefContig) goto NEXT_LINE_NOCHANGE; |
223 |
else { |
224 |
//else: |
225 |
//------------------------------------------- |
226 |
aln=new RefAlign(line, linebuf->length(), rlineno+1); |
227 |
if (strcmp(aln->seqname, refseq->id)==0) |
228 |
goto NEXT_LINE_NOCHANGE; //skip redundant inclusion of reference as its own child |
229 |
if (seqs.Find(aln->seqname)) |
230 |
GError("Error (line %d): component sequence %s must be unique in the input data\n", |
231 |
rlineno+1,aln->seqname); |
232 |
if (clipmax>0) { |
233 |
//check if any of the clipping involved is larger than clipmax |
234 |
int maxovh=(clipmax<1.00)? |
235 |
iround(clipmax * (float)aln->seqlen) : |
236 |
(int)clipmax; |
237 |
if (aln->clip3>maxovh || aln->clip5>maxovh) { |
238 |
if (verbose) |
239 |
fprintf(stderr, LOG_MSG_CLIPMAX, |
240 |
aln->seqname, refseq->id, clipmax); |
241 |
goto NEXT_LINE_NOCHANGE; |
242 |
}// bad mismatching overhangs |
243 |
} |
244 |
GASeq* aseq=new GASeq(aln->seqname,aln->offset,aln->seqlen, |
245 |
aln->clip5,aln->clip3,aln->reverse); |
246 |
GASeq* rseq; |
247 |
if (refseq->msa==NULL) { |
248 |
rseq=refseq; |
249 |
} |
250 |
else rseq=new GASeq(refseq->id,0,refseq->seqlen, |
251 |
refseq->clp5,refseq->clp3,0); |
252 |
gappos=0; |
253 |
while ((gaplen=aln->nextRefGap(gappos))>0) |
254 |
rseq->setGap(gappos-1,gaplen); |
255 |
gappos=0; |
256 |
while ((gaplen=aln->nextSeqGap(gappos))>0) |
257 |
aseq->setGap(gappos-1,gaplen); |
258 |
//for mgblast alignment, only the query can be reversed |
259 |
if (aseq->revcompl==1) |
260 |
aseq->reverseGaps(); //don't update offset & reverse flags |
261 |
|
262 |
GSeqAlign *newaln=new GSeqAlign(rseq, aseq); |
263 |
if (rseq==refseq) {//first alignment with refseq |
264 |
newaln->incOrd(); |
265 |
alns.Add(newaln); |
266 |
goto NEXT_LINE; |
267 |
} |
268 |
refseq->msa->addAlign(refseq,newaln,rseq); |
269 |
delete newaln; |
270 |
seqs.Add(aseq->name(),aseq); |
271 |
}// parsing new alignment |
272 |
//--------------------- |
273 |
NEXT_LINE: |
274 |
/* debug print the progressive alignment */ |
275 |
if (debugMode) { |
276 |
for (int a=0;a<alns.Count();a++) { |
277 |
printDebugAln(outf,alns.Get(a),a+1,cdbyank, refcdb); |
278 |
} |
279 |
} |
280 |
NEXT_LINE_NOCHANGE: |
281 |
//------------ |
282 |
delete aln; |
283 |
if (linebuf->isEof()) break; |
284 |
rlineno++; |
285 |
//------------- |
286 |
if (verbose) { |
287 |
if (rlineno%1000==0) { |
288 |
fprintf(stderr, "..%d", rlineno); |
289 |
if (rlineno%8000==0) fprintf(stderr, "\n"); |
290 |
fflush(stderr); |
291 |
} |
292 |
} |
293 |
} //----> the big loop of parsing input lines from the .lyt files |
294 |
//*************** now build MSAs and write them |
295 |
delete linebuf; |
296 |
if (verbose) { |
297 |
fprintf(stderr, "\n%d input lines processed.\n",rlineno); |
298 |
fprintf(stderr, "Refining and printing %d MSA(s)..\n", alns.Count()); |
299 |
fflush(stderr); |
300 |
} |
301 |
|
302 |
//print all the alignments |
303 |
for (int i=0;i<alns.Count();i++) { |
304 |
GSeqAlign* a=alns.Get(i); |
305 |
loadAlnSeqs(a,cdbyank, refcdb); //loading actual sequences for this cluster |
306 |
if (debugMode) { //write plain text alignment file |
307 |
fprintf(outf,">Alignment%d (%d)\n",i+1, a->Count()); |
308 |
a->print(outf, 'v'); |
309 |
} |
310 |
else {//write a real ACE file |
311 |
//a->buildMSA(); |
312 |
GStr ctgname; |
313 |
ctgname.format("AorContig%d",i+1); |
314 |
a->writeACE(outf, ctgname.chars()); |
315 |
a->freeMSA(); //free MSA and seq memory |
316 |
} |
317 |
} // for each PMSA cluster |
318 |
// oooooooooo D O N E oooooooooooo |
319 |
alns.Clear(); |
320 |
seqs.Clear(); |
321 |
fflush(outf); |
322 |
if (outf!=stdout) fclose(outf); |
323 |
if (inf!=stdin) fclose(inf); |
324 |
delete cdbyank; |
325 |
delete refcdb; |
326 |
//GFREE(ref_prefix); |
327 |
//GMessage("*** all done ***\n"); |
328 |
#ifdef __WIN32__ |
329 |
//getc(stdin); |
330 |
#endif |
331 |
} |
332 |
|
333 |
//---------------------- RefAlign class |
334 |
void RefAlign::parseErr(int fldno) { |
335 |
fprintf(stderr, "Error parsing input line #%d (field %d):\n%s\n", |
336 |
lineno, fldno, linecpy); |
337 |
exit(3); |
338 |
} |
339 |
|
340 |
RefAlign::RefAlign(char* line, int len, int lno) { |
341 |
lineno=lno; |
342 |
linecpy=Gstrdup(line); |
343 |
seqname=line; |
344 |
char* p=endSpToken(seqname); |
345 |
if (p==NULL) parseErr(0); |
346 |
*p=0; |
347 |
p++; |
348 |
skipSp(p); |
349 |
if (*p!='-' && *p!='+') parseErr(1); |
350 |
reverse=0; |
351 |
if (*p=='-') reverse=1; |
352 |
p++; |
353 |
clip5=0; |
354 |
clip3=0; |
355 |
if (!parseInt(p, seqlen)) parseErr(2); |
356 |
if (!parseInt(p, offset)) parseErr(3); |
357 |
offset--; |
358 |
skipSp(p); |
359 |
if (isdigit(*p)) {//clipping follows |
360 |
if (!parseInt(p, clip5)) parseErr(4); |
361 |
if (!parseInt(p, clip3)) parseErr(5); |
362 |
} |
363 |
if (reverse!=0) Gswap(clip5, clip3); |
364 |
skipSp(p); |
365 |
gaps=NULL; |
366 |
gaps_onref=NULL; |
367 |
gpos=NULL; |
368 |
gpos_onref=NULL; |
369 |
if (*p==0) return; |
370 |
while (*p!=0 && *p!='R') { |
371 |
while (*p!=' ' && *p!='\t' && *p!=0) p++; |
372 |
skipSp(p); |
373 |
} |
374 |
if (*p==0) return; |
375 |
// it is R |
376 |
p++; |
377 |
if (*p!=':') parseErr(6); //invalid field |
378 |
p++; |
379 |
if (isdigit(*p)) {//should be a number here |
380 |
gaps=p;gpos=p; |
381 |
} |
382 |
else if (*p!='/') parseErr(6); |
383 |
while (*p!=0 && *p!=' ' && *p!='/' && *p!='\t') p++; |
384 |
if (*p==0) return; |
385 |
if (*p=='/') { |
386 |
*p=0;//split here |
387 |
p++; |
388 |
gaps_onref=p; |
389 |
gpos_onref=p; |
390 |
} |
391 |
} |
392 |
RefAlign::~RefAlign() { |
393 |
GFREE(linecpy); //that was just for error messages |
394 |
} |
395 |
|
396 |
int RefAlign::nextSeqGap(int& pos) { |
397 |
int r=1; |
398 |
if (gpos==NULL || *gpos==0) return 0; |
399 |
if (!parseInt(gpos,pos)) parseErr(6); |
400 |
if (pos<=0) parseErr(6); |
401 |
if (*gpos=='+') { //gap length following |
402 |
gpos++; |
403 |
if (!parseInt(gpos,r)) parseErr(6); |
404 |
if (r<=0) parseErr(6); |
405 |
} |
406 |
if (*gpos==',') gpos++; |
407 |
return r; |
408 |
} |
409 |
|
410 |
int RefAlign::nextRefGap(int& pos) { |
411 |
int r=1; |
412 |
if (gpos_onref==NULL || *gpos_onref==0) return 0; |
413 |
if (!parseInt(gpos_onref,pos)) parseErr(6); |
414 |
if (pos<=0) parseErr(6); |
415 |
if (*gpos_onref=='+') { //gap length following |
416 |
gpos_onref++; |
417 |
if (!parseInt(gpos_onref,r)) parseErr(6); |
418 |
if (r<=0) parseErr(6); |
419 |
} |
420 |
if (*gpos_onref==',') gpos_onref++; |
421 |
return r; |
422 |
} |
423 |
|
424 |
void printDebugAln(FILE* f, GSeqAlign* aln, int num, GCdbYank* cdbynk, GCdbYank* refcdb) { |
425 |
fprintf(f,">[%d]DebugAlign%d (%d)\n",rlineno+1, num, aln->Count()); |
426 |
loadAlnSeqs(aln,cdbynk, refcdb); |
427 |
aln->print(f,'='); |
428 |
} |
429 |
|
430 |
void loadAlnSeqs(GSeqAlign* aln, GCdbYank* cdbynk, GCdbYank* refcdb) { |
431 |
//FastaSeq seq; |
432 |
for (int i=0;i<aln->Count();i++) { |
433 |
GASeq* s=aln->Get(i); |
434 |
if (s->len==0) {//not loaded yet |
435 |
GCdbYank* yankdb=(s->hasFlag(flag_IS_REF) && refcdb!=NULL) ? refcdb : cdbynk; |
436 |
if (yankdb->getRecord(s->id, *s) <= 0) |
437 |
GError("Error retrieving sequence %s from database %s!\n", |
438 |
s->id, yankdb->getDbName()); |
439 |
if (s->seqlen!=s->len) |
440 |
GError("Error: sequence %s length mismatch! Declared %d, retrieved %d\n", |
441 |
s->id, s->seqlen, s->len); |
442 |
s->allupper(); |
443 |
s->loadProcessing(); |
444 |
} |
445 |
} |
446 |
} |