1 |
gpertea |
26 |
#include "gff_utils.h" |
2 |
|
|
|
3 |
|
|
extern bool verbose; |
4 |
|
|
extern bool debugMode; |
5 |
|
|
|
6 |
gpertea |
59 |
//bool debugState=false; |
7 |
|
|
|
8 |
gpertea |
26 |
void printFasta(FILE* f, GStr& defline, char* seq, int seqlen) { |
9 |
|
|
if (seq==NULL) return; |
10 |
|
|
int len=(seqlen>0)?seqlen:strlen(seq); |
11 |
|
|
if (len<=0) return; |
12 |
|
|
if (!defline.is_empty()) |
13 |
|
|
fprintf(f, ">%s\n",defline.chars()); |
14 |
|
|
int ilen=0; |
15 |
|
|
for (int i=0; i < len; i++, ilen++) { |
16 |
|
|
if (ilen == 70) { |
17 |
|
|
fputc('\n', f); |
18 |
|
|
ilen = 0; |
19 |
|
|
} |
20 |
|
|
putc(seq[i], f); |
21 |
|
|
} //for |
22 |
|
|
fputc('\n', f); |
23 |
|
|
} |
24 |
|
|
|
25 |
|
|
int qsearch_gloci(uint x, GList<GffLocus>& loci) { |
26 |
|
|
//binary search |
27 |
|
|
//do the simplest tests first: |
28 |
|
|
if (loci[0]->start>x) return 0; |
29 |
|
|
if (loci.Last()->start<x) return -1; |
30 |
|
|
uint istart=0; |
31 |
|
|
int i=0; |
32 |
|
|
int idx=-1; |
33 |
|
|
int maxh=loci.Count()-1; |
34 |
|
|
int l=0; |
35 |
|
|
int h = maxh; |
36 |
|
|
while (l <= h) { |
37 |
|
|
i = (l+h)>>1; |
38 |
|
|
istart=loci[i]->start; |
39 |
|
|
if (istart < x) l = i + 1; |
40 |
|
|
else { |
41 |
|
|
if (istart == x) { //found matching coordinate here |
42 |
|
|
idx=i; |
43 |
|
|
while (idx<=maxh && loci[idx]->start==x) { |
44 |
|
|
idx++; |
45 |
|
|
} |
46 |
|
|
return (idx>maxh) ? -1 : idx; |
47 |
|
|
} |
48 |
|
|
h = i - 1; |
49 |
|
|
} |
50 |
|
|
} //while |
51 |
|
|
idx = l; |
52 |
|
|
while (idx<=maxh && loci[idx]->start<=x) { |
53 |
|
|
idx++; |
54 |
|
|
} |
55 |
|
|
return (idx>maxh) ? -1 : idx; |
56 |
|
|
} |
57 |
|
|
|
58 |
|
|
int qsearch_rnas(uint x, GList<GffObj>& rnas) { |
59 |
|
|
//binary search |
60 |
|
|
//do the simplest tests first: |
61 |
|
|
if (rnas[0]->start>x) return 0; |
62 |
|
|
if (rnas.Last()->start<x) return -1; |
63 |
|
|
uint istart=0; |
64 |
|
|
int i=0; |
65 |
|
|
int idx=-1; |
66 |
|
|
int maxh=rnas.Count()-1; |
67 |
|
|
int l=0; |
68 |
|
|
int h = maxh; |
69 |
|
|
while (l <= h) { |
70 |
|
|
i = (l+h)>>1; |
71 |
|
|
istart=rnas[i]->start; |
72 |
|
|
if (istart < x) l = i + 1; |
73 |
|
|
else { |
74 |
|
|
if (istart == x) { //found matching coordinate here |
75 |
|
|
idx=i; |
76 |
|
|
while (idx<=maxh && rnas[idx]->start==x) { |
77 |
|
|
idx++; |
78 |
|
|
} |
79 |
|
|
return (idx>maxh) ? -1 : idx; |
80 |
|
|
} |
81 |
|
|
h = i - 1; |
82 |
|
|
} |
83 |
|
|
} //while |
84 |
|
|
idx = l; |
85 |
|
|
while (idx<=maxh && rnas[idx]->start<=x) { |
86 |
|
|
idx++; |
87 |
|
|
} |
88 |
|
|
return (idx>maxh) ? -1 : idx; |
89 |
|
|
} |
90 |
|
|
|
91 |
|
|
int cmpRedundant(GffObj& a, GffObj& b) { |
92 |
|
|
if (a.exons.Count()==b.exons.Count()) { |
93 |
|
|
if (a.covlen==b.covlen) { |
94 |
|
|
return strcmp(a.getID(), b.getID()); |
95 |
|
|
} |
96 |
|
|
else return (a.covlen>b.covlen)? 1 : -1; |
97 |
|
|
} |
98 |
|
|
else return (a.exons.Count()>b.exons.Count())? 1: -1; |
99 |
|
|
} |
100 |
|
|
|
101 |
|
|
|
102 |
|
|
bool tMatch(GffObj& a, GffObj& b) { |
103 |
|
|
//strict intron chain match, or single-exon perfect match |
104 |
|
|
int imax=a.exons.Count()-1; |
105 |
|
|
int jmax=b.exons.Count()-1; |
106 |
|
|
int ovlen=0; |
107 |
|
|
if (imax!=jmax) return false; //different number of introns |
108 |
|
|
|
109 |
|
|
if (imax==0) { //single-exon mRNAs |
110 |
|
|
//if (equnspl) { |
111 |
|
|
//fuzz match for single-exon transfrags: |
112 |
|
|
// it's a match if they overlap at least 80% of max len |
113 |
|
|
ovlen=a.exons[0]->overlapLen(b.exons[0]); |
114 |
|
|
int maxlen=GMAX(a.covlen,b.covlen); |
115 |
|
|
return (ovlen>=maxlen*0.8); |
116 |
|
|
/*} |
117 |
|
|
else { |
118 |
|
|
//only exact match |
119 |
|
|
ovlen=a.covlen; |
120 |
|
|
return (a.exons[0]->start==b.exons[0]->start && |
121 |
|
|
a.exons[0]->end==b.exons[0]->end); |
122 |
|
|
|
123 |
|
|
}*/ |
124 |
|
|
} |
125 |
|
|
//check intron overlaps |
126 |
|
|
ovlen=a.exons[0]->end-(GMAX(a.start,b.start))+1; |
127 |
|
|
ovlen+=(GMIN(a.end,b.end))-a.exons.Last()->start; |
128 |
|
|
for (int i=1;i<=imax;i++) { |
129 |
|
|
if (i<imax) ovlen+=a.exons[i]->len(); |
130 |
|
|
if ((a.exons[i-1]->end!=b.exons[i-1]->end) || |
131 |
|
|
(a.exons[i]->start!=b.exons[i]->start)) { |
132 |
|
|
return false; //intron mismatch |
133 |
|
|
} |
134 |
|
|
} |
135 |
|
|
return true; |
136 |
|
|
} |
137 |
|
|
|
138 |
|
|
|
139 |
|
|
bool unsplContained(GffObj& ti, GffObj& tj, bool fuzzSpan) { |
140 |
|
|
//returns true only if ti (which MUST be single-exon) is "almost" contained in any of tj's exons |
141 |
|
|
//but it does not cross any intron-exon boundary of tj |
142 |
|
|
int imax=ti.exons.Count()-1; |
143 |
|
|
int jmax=tj.exons.Count()-1; |
144 |
|
|
if (imax>0) GError("Error: bad unsplContained() call, 1st param must be single-exon transcript!\n"); |
145 |
|
|
int minovl = (int)(0.8 * ti.len()); //minimum overlap for fuzzSpan |
146 |
|
|
if (fuzzSpan) { |
147 |
|
|
for (int j=0;j<=jmax;j++) { |
148 |
|
|
//must NOT overlap the introns |
149 |
|
|
if ((j>0 && ti.start<tj.exons[j]->start) |
150 |
|
|
|| (j<jmax && ti.end>tj.exons[j]->end)) |
151 |
|
|
return false; |
152 |
|
|
if (ti.exons[0]->overlapLen(tj.exons[j])>=minovl) |
153 |
|
|
return true; |
154 |
|
|
} |
155 |
|
|
} else { |
156 |
|
|
for (int j=0;j<=jmax;j++) { |
157 |
|
|
//must NOT overlap the introns |
158 |
|
|
if ((j>0 && ti.start<tj.exons[j]->start) |
159 |
|
|
|| (j<jmax && ti.end>tj.exons[j]->end)) |
160 |
|
|
return false; |
161 |
|
|
//strict containment |
162 |
|
|
if (ti.end<=tj.exons[j]->end && ti.start>=tj.exons[j]->start) |
163 |
|
|
return true; |
164 |
|
|
} |
165 |
|
|
} |
166 |
|
|
return false; |
167 |
|
|
} |
168 |
|
|
|
169 |
|
|
GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons, bool fuzzSpan) { |
170 |
|
|
// matchAllIntrons==true: transcripts are considered "redundant" only if |
171 |
|
|
// they have the exact same number of introns and same splice sites (or none) |
172 |
|
|
// (single-exon transcripts can be also fully contained to be considered matching) |
173 |
|
|
// matchAllIntrons==false: an intron chain could be a subset of a "container" chain, |
174 |
|
|
// as long as no intron-exon boundaries are violated; also, a single-exon |
175 |
|
|
// transcript will be collapsed if it's contained in one of the exons of the other |
176 |
|
|
// fuzzSpan==false: the genomic span of one transcript must be contained in or equal with the genomic |
177 |
|
|
// span of the other |
178 |
|
|
// |
179 |
|
|
// fuzzSpan==true: then genomic spans of transcripts are no longer required to be fully contained |
180 |
|
|
// (i.e. they may extend each-other in opposite directions) |
181 |
|
|
|
182 |
|
|
//if redundancy is found, the "bigger" transcript is returned (otherwise NULL is returned) |
183 |
|
|
if (ti.start>=tj.end || tj.start>=ti.end || tj.strand!=ti.strand) return NULL; //no span overlap at all |
184 |
|
|
int imax=ti.exons.Count()-1; |
185 |
|
|
int jmax=tj.exons.Count()-1; |
186 |
|
|
GffObj* bigger=NULL; |
187 |
|
|
GffObj* smaller=NULL; |
188 |
|
|
if (matchAllIntrons) { |
189 |
|
|
if (imax!=jmax) return false; |
190 |
|
|
if (ti.covlen>tj.covlen) { |
191 |
|
|
bigger=&ti; |
192 |
|
|
if (!fuzzSpan && (ti.start>tj.start || ti.end<tj.end)) return NULL; |
193 |
|
|
} |
194 |
|
|
else { //ti.covlen<=tj.covlen |
195 |
|
|
bigger=&tj; |
196 |
|
|
if (!fuzzSpan && (tj.start>ti.start || tj.end<ti.end)) return NULL; |
197 |
|
|
} |
198 |
|
|
//check that all introns really match |
199 |
|
|
for (int i=0;i<imax;i++) { |
200 |
|
|
if (ti.exons[i]->end!=tj.exons[i]->end || |
201 |
|
|
ti.exons[i+1]->start!=tj.exons[i+1]->start) return NULL; |
202 |
|
|
} |
203 |
|
|
return bigger; |
204 |
|
|
} |
205 |
|
|
//--- matchAllIntrons==false: intron-chain containment is also considered redundancy |
206 |
|
|
int maxlen=0; |
207 |
|
|
int minlen=0; |
208 |
|
|
if (ti.covlen>tj.covlen) { |
209 |
|
|
if (tj.exons.Count()>ti.exons.Count()) { |
210 |
|
|
//exon count override |
211 |
|
|
bigger=&tj; |
212 |
|
|
smaller=&ti; |
213 |
|
|
} |
214 |
|
|
else { |
215 |
|
|
bigger=&ti; |
216 |
|
|
smaller=&tj; |
217 |
|
|
} |
218 |
|
|
maxlen=ti.covlen; |
219 |
|
|
minlen=tj.covlen; |
220 |
|
|
} |
221 |
|
|
else { //tj has more bases |
222 |
|
|
if (ti.exons.Count()>tj.exons.Count()) { |
223 |
|
|
//exon count override |
224 |
|
|
bigger=&ti; |
225 |
|
|
smaller=&tj; |
226 |
|
|
} |
227 |
|
|
else { |
228 |
|
|
bigger=&tj; |
229 |
|
|
smaller=&ti; |
230 |
|
|
} |
231 |
|
|
maxlen=tj.covlen; |
232 |
|
|
minlen=ti.covlen; |
233 |
|
|
} |
234 |
|
|
if (imax==0 && jmax==0) { |
235 |
|
|
//single-exon transcripts: if fuzzSpan, at least 80% of the shortest one must be overlapped by the other |
236 |
|
|
if (fuzzSpan) { |
237 |
|
|
return (ti.exons[0]->overlapLen(tj.exons[0])>=minlen*0.8) ? bigger : NULL; |
238 |
|
|
} |
239 |
|
|
else { |
240 |
|
|
return (smaller->start>=bigger->start && smaller->end<=bigger->end) ? bigger : NULL; |
241 |
|
|
} |
242 |
|
|
} |
243 |
|
|
//containment is also considered redundancy |
244 |
|
|
if (smaller->exons.Count()==1) { |
245 |
|
|
//check if this single exon is contained in any of tj exons |
246 |
|
|
//without violating any intron-exon boundaries |
247 |
|
|
return (unsplContained(*smaller, *bigger, fuzzSpan) ? bigger : NULL); |
248 |
|
|
} |
249 |
|
|
|
250 |
|
|
//--from here on: both are multi-exon transcripts, imax>0 && jmax>0 |
251 |
|
|
if (ti.exons[imax]->start<tj.exons[0]->end || |
252 |
|
|
tj.exons[jmax]->start<ti.exons[0]->end ) |
253 |
|
|
return NULL; //intron chains do not overlap at all |
254 |
|
|
|
255 |
|
|
|
256 |
|
|
//checking full intron chain containment |
257 |
|
|
uint eistart=0, eiend=0, ejstart=0, ejend=0; //exon boundaries |
258 |
|
|
int i=1; //exon idx to the right of the current intron of ti |
259 |
|
|
int j=1; //exon idx to the right of the current intron of tj |
260 |
|
|
//find the first intron overlap: |
261 |
|
|
while (i<=imax && j<=jmax) { |
262 |
|
|
eistart=ti.exons[i-1]->end; |
263 |
|
|
eiend=ti.exons[i]->start; |
264 |
|
|
ejstart=tj.exons[j-1]->end; |
265 |
|
|
ejend=tj.exons[j]->start; |
266 |
|
|
if (ejend<eistart) { j++; continue; } |
267 |
|
|
if (eiend<ejstart) { i++; continue; } |
268 |
|
|
//we found an intron overlap |
269 |
|
|
break; |
270 |
|
|
} |
271 |
|
|
if (!fuzzSpan && (bigger->start>smaller->start || bigger->end < smaller->end)) return NULL; |
272 |
|
|
if ((i>1 && j>1) || i>imax || j>jmax) { |
273 |
|
|
return NULL; //either no intron overlaps found at all |
274 |
|
|
//or it's not the first intron for at least one of the transcripts |
275 |
|
|
} |
276 |
|
|
if (eistart!=ejstart || eiend!=ejend) return NULL; //not an exact intron match |
277 |
|
|
if (j>i) { |
278 |
|
|
//i==1, ti's start must not conflict with the previous intron of tj |
279 |
|
|
if (ti.start<tj.exons[j-1]->start) return NULL; |
280 |
|
|
//so i's first intron starts AFTER j's first intron |
281 |
|
|
// then j must contain i, so i's last intron must end with or before j's last intron |
282 |
|
|
if (ti.exons[imax]->start>tj.exons[jmax]->start) return NULL; |
283 |
|
|
//comment out the line above if you just want "intron compatibility" (i.e. extension of intron chains ) |
284 |
|
|
} |
285 |
|
|
else if (i>j) { |
286 |
|
|
//j==1, tj's start must not conflict with the previous intron of ti |
287 |
|
|
if (tj.start<ti.exons[i-1]->start) return NULL; |
288 |
|
|
//so j's intron chain starts AFTER i's |
289 |
|
|
// then i must contain j, so j's last intron must end with or before j's last intron |
290 |
|
|
if (tj.exons[jmax]->start>ti.exons[imax]->start) return NULL; |
291 |
|
|
//comment out the line above for just "intronCompatible()" check (allowing extension of intron chain) |
292 |
|
|
} |
293 |
|
|
//now check if the rest of the introns overlap, in the same sequence |
294 |
|
|
i++; |
295 |
|
|
j++; |
296 |
|
|
while (i<=imax && j<=jmax) { |
297 |
|
|
if (ti.exons[i-1]->end!=tj.exons[j-1]->end || |
298 |
|
|
ti.exons[i]->start!=tj.exons[j]->start) return NULL; |
299 |
|
|
i++; |
300 |
|
|
j++; |
301 |
|
|
} |
302 |
|
|
i--; |
303 |
|
|
j--; |
304 |
|
|
if (i==imax && j<jmax) { |
305 |
|
|
// tj has more introns to the right, check if ti's end doesn't conflict with the current tj exon boundary |
306 |
|
|
if (ti.end>tj.exons[j]->end) return NULL; |
307 |
|
|
} |
308 |
|
|
else if (j==jmax && i<imax) { |
309 |
|
|
if (tj.end>ti.exons[i]->end) return NULL; |
310 |
|
|
} |
311 |
|
|
return bigger; |
312 |
|
|
} |
313 |
|
|
|
314 |
|
|
|
315 |
|
|
int gseqCmpName(const pointer p1, const pointer p2) { |
316 |
|
|
return strcmp(((GenomicSeqData*)p1)->gseq_name, ((GenomicSeqData*)p2)->gseq_name); |
317 |
|
|
} |
318 |
|
|
|
319 |
|
|
|
320 |
gpertea |
59 |
void printLocus(GffLocus* loc, const char* pre) { |
321 |
gpertea |
26 |
if (pre!=NULL) fprintf(stderr, "%s", pre); |
322 |
|
|
GMessage(" [%d-%d] : ", loc->start, loc->end); |
323 |
|
|
GMessage("%s",loc->rnas[0]->getID()); |
324 |
|
|
for (int i=1;i<loc->rnas.Count();i++) { |
325 |
|
|
GMessage(",%s",loc->rnas[i]->getID()); |
326 |
|
|
} |
327 |
|
|
GMessage("\n"); |
328 |
|
|
} |
329 |
|
|
|
330 |
gpertea |
48 |
void preserveContainedCDS(GffObj* t, GffObj* tfrom) { |
331 |
|
|
//transfer CDS info to the container t if it's a larger protein |
332 |
|
|
if (tfrom->CDstart==0) return; |
333 |
|
|
if (t->CDstart) { |
334 |
|
|
if (tfrom->CDstart<t->CDstart && tfrom->CDstart>=t->start) |
335 |
|
|
t->CDstart=tfrom->CDstart; |
336 |
|
|
if (tfrom->CDend>t->CDend && tfrom->CDend<=t->end) |
337 |
|
|
t->CDend=tfrom->CDend; |
338 |
|
|
} |
339 |
|
|
else { //no CDS info on container, just copy it from the contained |
340 |
gpertea |
56 |
t->addCDS(tfrom->CDstart, tfrom->CDend, tfrom->CDphase); |
341 |
gpertea |
48 |
} |
342 |
gpertea |
47 |
} |
343 |
|
|
|
344 |
gpertea |
59 |
void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster, bool collapseRedundant, |
345 |
|
|
bool matchAllIntrons, bool fuzzSpan) { |
346 |
gpertea |
26 |
GTData* tdata=new GTData(t); |
347 |
|
|
gdata->tdata.Add(tdata); |
348 |
|
|
int tidx=-1; |
349 |
gpertea |
59 |
/* |
350 |
|
|
if (debug) { |
351 |
|
|
GMessage(">>Placing transcript %s\n", t->getID()); |
352 |
|
|
debugState=true; |
353 |
|
|
} |
354 |
|
|
else debugState=false; |
355 |
|
|
*/ |
356 |
gpertea |
26 |
if (t->exons.Count()>0) |
357 |
|
|
tidx=gdata->rnas.Add(t); //added it in sorted order |
358 |
|
|
else { |
359 |
|
|
gdata->gfs.Add(t); |
360 |
|
|
return; //nothing to do with these non-transcript objects |
361 |
|
|
} |
362 |
|
|
if (!doCluster) return; |
363 |
|
|
if (gdata->loci.Count()==0) { |
364 |
|
|
gdata->loci.Add(new GffLocus(t)); |
365 |
|
|
//GMessage(" <<make it first locus %d-%d \n",t->start, t->end); |
366 |
|
|
return; |
367 |
|
|
} |
368 |
gpertea |
59 |
/* |
369 |
gpertea |
26 |
//DEBUG: show available loci: |
370 |
gpertea |
59 |
if (debug) { |
371 |
|
|
GMessage(" [%d loci already:\n", gdata->loci.Count()); |
372 |
|
|
for (int l=0;l<gdata->loci.Count();l++) { |
373 |
|
|
printLocus(gdata->loci[l]); |
374 |
|
|
} |
375 |
|
|
} |
376 |
|
|
*/ |
377 |
gpertea |
26 |
int nidx=qsearch_gloci(t->end, gdata->loci); //get index of nearest locus starting just ABOVE t->end |
378 |
|
|
//GMessage("\tlooking up end coord %d in gdata->loci.. (qsearch got nidx=%d)\n", t->end, nidx); |
379 |
|
|
if (nidx==0) { |
380 |
|
|
//cannot have any overlapping loci |
381 |
gpertea |
59 |
//if (debug) GMessage(" <<no ovls possible, create locus %d-%d \n",t->start, t->end); |
382 |
gpertea |
26 |
gdata->loci.Add(new GffLocus(t)); |
383 |
|
|
return; |
384 |
|
|
} |
385 |
|
|
if (nidx==-1) nidx=gdata->loci.Count();//all loci start below t->end |
386 |
|
|
int lfound=0; //count of parent loci |
387 |
|
|
GArray<int> mrgloci(false); |
388 |
|
|
GList<GffLocus> tloci(true); //candidate parent loci to adopt this |
389 |
gpertea |
59 |
//if (debug) GMessage("\tchecking all loci from %d to 0\n",nidx-1); |
390 |
gpertea |
26 |
for (int l=nidx-1;l>=0;l--) { |
391 |
|
|
GffLocus& loc=*(gdata->loci[l]); |
392 |
|
|
if (loc.strand!='.' && t->strand!='.'&& loc.strand!=t->strand) continue; |
393 |
|
|
if (t->start>loc.end) { |
394 |
|
|
if (t->start-loc.start>GFF_MAX_LOCUS) break; //give up already |
395 |
|
|
continue; |
396 |
|
|
} |
397 |
gpertea |
59 |
if (loc.start>t->end) { |
398 |
|
|
//this should never be the case if nidx was found correctly |
399 |
|
|
GMessage("Warning: qsearch_gloci found loc.start>t.end!(t=%s)\n", t->getID()); |
400 |
|
|
continue; |
401 |
|
|
} |
402 |
|
|
/* |
403 |
|
|
if (debug) { |
404 |
|
|
GMessage(" !range overlap found with locus "); |
405 |
|
|
printLocus(&loc); |
406 |
|
|
} |
407 |
|
|
*/ |
408 |
gpertea |
26 |
if (loc.add_RNA(t)) { |
409 |
|
|
//will add this transcript to loc |
410 |
|
|
lfound++; |
411 |
|
|
mrgloci.Add(l); |
412 |
|
|
if (collapseRedundant) { |
413 |
|
|
//compare to every single transcript in this locus |
414 |
|
|
for (int ti=0;ti<loc.rnas.Count();ti++) { |
415 |
|
|
if (loc.rnas[ti]==t) continue; |
416 |
|
|
GTData* odata=(GTData*)(loc.rnas[ti]->uptr); |
417 |
|
|
//GMessage(" ..redundant check vs overlapping transcript %s\n",loc.rnas[ti]->getID()); |
418 |
|
|
GffObj* container=NULL; |
419 |
|
|
if (odata->replaced_by==NULL && |
420 |
|
|
(container=redundantTranscripts(*t, *(loc.rnas[ti]), matchAllIntrons, fuzzSpan))!=NULL) { |
421 |
|
|
if (container==t) { |
422 |
|
|
odata->replaced_by=t; |
423 |
gpertea |
48 |
preserveContainedCDS(t, loc.rnas[ti]); |
424 |
gpertea |
26 |
} |
425 |
|
|
else { |
426 |
|
|
tdata->replaced_by=loc.rnas[ti]; |
427 |
gpertea |
48 |
preserveContainedCDS(loc.rnas[ti], t); |
428 |
gpertea |
26 |
} |
429 |
|
|
} |
430 |
|
|
}//for each transcript in the exon-overlapping locus |
431 |
|
|
} //if doCollapseRedundant |
432 |
|
|
} //overlapping locus |
433 |
gpertea |
59 |
} //for each existing locus |
434 |
gpertea |
26 |
if (lfound==0) { |
435 |
|
|
//overlapping loci not found, create a locus with only this mRNA |
436 |
gpertea |
59 |
/* if (debug) { |
437 |
|
|
GMessage(" overlapping locus not found, create locus %d-%d \n",t->start, t->end); |
438 |
|
|
} |
439 |
|
|
*/ |
440 |
gpertea |
26 |
int addidx=gdata->loci.Add(new GffLocus(t)); |
441 |
|
|
if (addidx<0) { |
442 |
gpertea |
59 |
//should never be the case! |
443 |
gpertea |
26 |
GMessage(" WARNING: new GffLocus(%s:%d-%d) not added!\n",t->getID(), t->start, t->end); |
444 |
|
|
} |
445 |
|
|
} |
446 |
gpertea |
59 |
else { //found at least one overlapping locus |
447 |
|
|
lfound--; |
448 |
|
|
int locidx=mrgloci[lfound]; |
449 |
|
|
GffLocus& loc=*(gdata->loci[locidx]); |
450 |
|
|
//last locus index found is also the smallest index |
451 |
|
|
if (lfound>0) { |
452 |
|
|
//more than one loci found parenting this mRNA, merge loci |
453 |
|
|
/* if (debug) |
454 |
|
|
GMessage(" merging %d loci \n",lfound); |
455 |
|
|
*/ |
456 |
gpertea |
26 |
for (int l=0;l<lfound;l++) { |
457 |
gpertea |
59 |
int mlidx=mrgloci[l]; |
458 |
|
|
loc.addMerge(*(gdata->loci[mlidx]), t); |
459 |
|
|
gdata->loci.Delete(mlidx); //highest indices first, so it's safe to remove |
460 |
gpertea |
26 |
} |
461 |
gpertea |
59 |
} |
462 |
|
|
int i=locidx; |
463 |
|
|
while (i>0 && loc.start<gdata->loci[i-1]->start) { |
464 |
|
|
//bubble down until it's in the proper order |
465 |
|
|
i--; |
466 |
|
|
gdata->loci.Swap(i,i+1); |
467 |
|
|
} |
468 |
|
|
}//found at least one overlapping locus |
469 |
gpertea |
26 |
} |
470 |
|
|
|
471 |
|
|
void collectLocusData(GList<GenomicSeqData>& ref_data) { |
472 |
|
|
int locus_num=0; |
473 |
|
|
for (int g=0;g<ref_data.Count();g++) { |
474 |
|
|
GenomicSeqData* gdata=ref_data[g]; |
475 |
|
|
for (int l=0;l<gdata->loci.Count();l++) { |
476 |
|
|
GffLocus& loc=*(gdata->loci[l]); |
477 |
|
|
GHash<int> gnames(true); //gene names in this locus |
478 |
|
|
GHash<int> geneids(true); //Entrez GeneID: numbers |
479 |
|
|
for (int i=0;i<loc.rnas.Count();i++) { |
480 |
|
|
GffObj& t=*(loc.rnas[i]); |
481 |
|
|
GStr gname(t.getGeneName()); |
482 |
|
|
if (!gname.is_empty()) { |
483 |
|
|
gname.upper(); |
484 |
|
|
int* prevg=gnames.Find(gname.chars()); |
485 |
|
|
if (prevg!=NULL) (*prevg)++; |
486 |
|
|
else gnames.Add(gname, new int(1)); |
487 |
|
|
} |
488 |
|
|
//parse GeneID xrefs, if any: |
489 |
|
|
GStr xrefs(t.getAttr("xrefs")); |
490 |
|
|
if (!xrefs.is_empty()) { |
491 |
|
|
xrefs.startTokenize(","); |
492 |
|
|
GStr token; |
493 |
|
|
while (xrefs.nextToken(token)) { |
494 |
|
|
token.upper(); |
495 |
|
|
if (token.startsWith("GENEID:")) { |
496 |
|
|
token.cut(0,token.index(':')+1); |
497 |
|
|
int* prevg=geneids.Find(token.chars()); |
498 |
|
|
if (prevg!=NULL) (*prevg)++; |
499 |
|
|
else geneids.Add(token, new int(1)); |
500 |
|
|
} |
501 |
|
|
} //for each xref |
502 |
|
|
} //xrefs parsing |
503 |
|
|
}//for each transcript |
504 |
|
|
locus_num++; |
505 |
|
|
loc.locus_num=locus_num; |
506 |
|
|
if (gnames.Count()>0) { //collect all gene names associated to this locus |
507 |
|
|
gnames.startIterate(); |
508 |
|
|
int* gfreq=NULL; |
509 |
|
|
char* key=NULL; |
510 |
|
|
while ((gfreq=gnames.NextData(key))!=NULL) { |
511 |
|
|
loc.gene_names.AddIfNew(new CGeneSym(key,*gfreq)); |
512 |
|
|
} |
513 |
|
|
} //added collected gene_names |
514 |
|
|
if (loc.gene_ids.Count()>0) { //collect all GeneIDs names associated to this locus |
515 |
|
|
geneids.startIterate(); |
516 |
|
|
int* gfreq=NULL; |
517 |
|
|
char* key=NULL; |
518 |
|
|
while ((gfreq=geneids.NextData(key))!=NULL) { |
519 |
|
|
loc.gene_ids.AddIfNew(new CGeneSym(key,*gfreq)); |
520 |
|
|
} |
521 |
|
|
} |
522 |
|
|
} //for each locus |
523 |
|
|
}//for each genomic sequence |
524 |
|
|
} |
525 |
|
|
|
526 |
|
|
|
527 |
|
|
void GffLoader::load(GList<GenomicSeqData>& seqdata, GFValidateFunc* gf_validate, |
528 |
|
|
bool doCluster, bool doCollapseRedundant, bool matchAllIntrons, bool fuzzSpan) { |
529 |
|
|
GffReader* gffr=new GffReader(f, this->transcriptsOnly, false); //not only mRNA features, not sorted |
530 |
|
|
gffr->showWarnings(this->showWarnings); |
531 |
|
|
// keepAttrs mergeCloseExons noExonAttr |
532 |
|
|
gffr->readAll(this->fullAttributes, this->mergeCloseExons, this->noExonAttrs); |
533 |
|
|
//int redundant=0; //redundant annotation discarded |
534 |
|
|
if (verbose) GMessage(" .. loaded %d genomic features from %s\n", gffr->gflst.Count(), fname.chars()); |
535 |
|
|
//int rna_deleted=0; |
536 |
|
|
//add to GenomicSeqData, adding to existing loci and identifying intron-chain duplicates |
537 |
|
|
for (int k=0;k<gffr->gflst.Count();k++) { |
538 |
|
|
GffObj* m=gffr->gflst[k]; |
539 |
|
|
if (strcmp(m->getFeatureName(), "locus")==0 && |
540 |
|
|
m->getAttr("transcripts")!=NULL) { |
541 |
|
|
continue; |
542 |
|
|
} |
543 |
|
|
|
544 |
|
|
char* rloc=m->getAttr("locus"); |
545 |
|
|
if (rloc!=NULL && startsWith(rloc, "RLOC_")) { |
546 |
|
|
m->removeAttr("locus", rloc); |
547 |
|
|
} |
548 |
|
|
if (m->exons.Count()==0 && m->children.Count()==0) { |
549 |
|
|
//a non-mRNA feature with no subfeatures |
550 |
|
|
//add a dummy exon just to have the generic exon checking work |
551 |
|
|
m->addExon(m->start,m->end); |
552 |
|
|
} |
553 |
|
|
GList<GffObj> gfadd(false,false); |
554 |
|
|
if (gf_validate!=NULL && !(*gf_validate)(m, &gfadd)) { |
555 |
|
|
continue; |
556 |
|
|
} |
557 |
|
|
m->isUsed(true); //so the gffreader won't destroy it |
558 |
|
|
int i=-1; |
559 |
|
|
GenomicSeqData f(m->gseq_id); |
560 |
|
|
GenomicSeqData* gdata=NULL; |
561 |
|
|
|
562 |
|
|
if (seqdata.Found(&f,i)) gdata=seqdata[i]; |
563 |
|
|
else { //entry not created yet for this genomic seq |
564 |
|
|
gdata=new GenomicSeqData(m->gseq_id); |
565 |
|
|
seqdata.Add(gdata); |
566 |
|
|
} |
567 |
|
|
for (int k=0;k<gfadd.Count();k++) { |
568 |
|
|
placeGf(gfadd[k], gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan); |
569 |
|
|
} |
570 |
|
|
placeGf(m, gdata, doCluster, doCollapseRedundant, matchAllIntrons, fuzzSpan); |
571 |
|
|
} //for each read gffObj |
572 |
|
|
//if (verbose) GMessage(" .. %d records from %s clustered into loci.\n", gffr->gflst.Count(), fname.chars()); |
573 |
|
|
if (f!=stdin) { fclose(f); f=NULL; } |
574 |
|
|
delete gffr; |
575 |
|
|
} |