1 |
package proteinstructure; |
2 |
|
3 |
import java.io.FileReader; |
4 |
import java.io.BufferedReader; |
5 |
import java.io.FileNotFoundException; |
6 |
import java.io.IOException; |
7 |
import java.io.OutputStream; |
8 |
import java.util.Collection; |
9 |
import java.util.Iterator; |
10 |
import java.util.Set; |
11 |
import java.util.TreeMap; |
12 |
import java.util.regex.Matcher; |
13 |
import java.util.regex.Pattern; |
14 |
|
15 |
/** |
16 |
* Package: proteinstructure |
17 |
* Class: Alignment |
18 |
* Author: Henning Stehr, Jose Duarte, Lars Petzold |
19 |
* |
20 |
* A multiple protein sequence alignment. This class represents a set of |
21 |
* protein sequences which are globally aligned. |
22 |
* |
23 |
*/ |
24 |
public class Alignment { |
25 |
|
26 |
/*------------------------------ constants ------------------------------*/ |
27 |
|
28 |
private static final char GAPCHARACTER = '-'; |
29 |
private static final String PIRFORMAT = "PIR"; |
30 |
private static final String FASTAFORMAT = "FASTA"; |
31 |
private static final String FASTAHEADER_REGEX = "^>\\s*([a-zA-Z0-9_|\\-]+)"; |
32 |
private static final String FASTAHEADER_CHAR = ">"; |
33 |
|
34 |
/*--------------------------- member variables --------------------------*/ |
35 |
|
36 |
private TreeMap<String,String> sequences; |
37 |
|
38 |
private TreeMap<String,TreeMap<Integer,Integer>> mapAlign2Seq; // key is sequence tag, the values are maps of alignment serials to sequence serials |
39 |
private TreeMap<String,TreeMap<Integer,Integer>> mapSeq2Align; // key is sequence tag, the values are maps of sequence serials to alignment serials |
40 |
|
41 |
/*----------------------------- constructors ----------------------------*/ |
42 |
|
43 |
public Alignment() {} |
44 |
|
45 |
/** |
46 |
* Reads an alignment from a file in either FASTA or PIR format |
47 |
*/ |
48 |
public Alignment(String fileName, String format) throws IOException, FileNotFoundException, PirFileFormatError, FastaFileFormatError { |
49 |
if (format.equals(PIRFORMAT)){ |
50 |
readFilePIRFormat(fileName); |
51 |
} else if (format.equals(FASTAFORMAT)){ |
52 |
readFileFastaFormat(fileName); |
53 |
} |
54 |
|
55 |
// checking lenghts, i.e. checking we read correctly from file |
56 |
checkLengths(); |
57 |
// map sequence serials (starting at 1, no gaps) to alignment serials (starting at 0, possibly gaps) |
58 |
doMapping(); |
59 |
} |
60 |
|
61 |
/** |
62 |
* Creates a trivial alignment (i.e. without gaps) for the given sequences. The sequences have to have the same lengths. |
63 |
* @param sequence |
64 |
* @param numberOfCopies |
65 |
*/ |
66 |
public Alignment(TreeMap<String, String> sequences) throws AlignmentConstructionError { |
67 |
|
68 |
// check that sequences have the same length |
69 |
int length = sequences.get(sequences.firstKey()).length(); |
70 |
for(String seqTag: sequences.keySet()) { |
71 |
if(sequences.get(seqTag).length() != length) { |
72 |
throw new AlignmentConstructionError("Cannot create trivial alignment. Sequence lenghts are not the same."); |
73 |
} |
74 |
} |
75 |
|
76 |
this.sequences = sequences; |
77 |
doMapping(); |
78 |
|
79 |
} |
80 |
|
81 |
/** |
82 |
* Writes alignment to the given output stream. The output format |
83 |
* conforms to the FASTA format. |
84 |
* @param out the output stream to be printed to |
85 |
* @param lineLength the maximal line length, setting this to null |
86 |
* always results in 80 characters per line |
87 |
* @param alignedSeqs toggles the output of the aligned or ungapped |
88 |
* sequences |
89 |
* @throws IOException |
90 |
*/ |
91 |
public void writeFasta(OutputStream out, Integer lineLength, boolean alignedSeqs) throws IOException { |
92 |
int len = 80; |
93 |
String seq = ""; |
94 |
|
95 |
if( lineLength != null ) { |
96 |
len = lineLength; |
97 |
} |
98 |
|
99 |
for( String name : getTags() ) { |
100 |
seq = alignedSeqs ? getAlignedSequence(name) : getSequenceNoGaps(name); |
101 |
out.write('>'); |
102 |
out.write(name.getBytes()); |
103 |
out.write(System.getProperty("line.separator").getBytes()); |
104 |
for(int i=0; i<seq.length(); i+=len) { |
105 |
out.write(seq.substring(i, Math.min(i+len,seq.length())).getBytes()); |
106 |
out.write(System.getProperty("line.separator").getBytes()); |
107 |
} |
108 |
} |
109 |
} |
110 |
|
111 |
/*---------------------------- private methods --------------------------*/ |
112 |
|
113 |
private void doMapping() { |
114 |
this.mapAlign2Seq = new TreeMap<String, TreeMap<Integer,Integer>>(); |
115 |
this.mapSeq2Align = new TreeMap<String, TreeMap<Integer,Integer>>(); |
116 |
for (String seqTag:sequences.keySet()){ |
117 |
this.mapAlign2Seq.put(seqTag, new TreeMap<Integer,Integer>()); |
118 |
this.mapSeq2Align.put(seqTag, new TreeMap<Integer,Integer>()); |
119 |
} |
120 |
for (String seqTag:sequences.keySet()){ |
121 |
String seq = sequences.get(seqTag); |
122 |
int seqIndex = 1; |
123 |
for (int alignIndex=0;alignIndex<seq.length();alignIndex++){ |
124 |
if (seq.charAt(alignIndex)!=GAPCHARACTER) { |
125 |
mapAlign2Seq.get(seqTag).put(alignIndex,seqIndex); |
126 |
seqIndex++; |
127 |
} else { // for gaps we assing a -1 |
128 |
mapAlign2Seq.get(seqTag).put(alignIndex,-1); |
129 |
} |
130 |
} |
131 |
} |
132 |
for (String seqTag:sequences.keySet()){ |
133 |
for (int alignIndex:mapAlign2Seq.get(seqTag).keySet()){ |
134 |
int seqIndex = mapAlign2Seq.get(seqTag).get(alignIndex); |
135 |
if (seqIndex!=-1){ |
136 |
mapSeq2Align.get(seqTag).put(seqIndex,alignIndex); |
137 |
} |
138 |
} |
139 |
} |
140 |
|
141 |
} |
142 |
|
143 |
private void checkLengths() { |
144 |
// checking all read sequences have same length |
145 |
TreeMap<String,Integer> seqLengths = new TreeMap<String, Integer>(); |
146 |
for (String seqTag:sequences.keySet()){ |
147 |
seqLengths.put(seqTag,sequences.get(seqTag).length()); |
148 |
} |
149 |
int lastLength = 0; |
150 |
for (String seqTag:seqLengths.keySet()){ |
151 |
if (lastLength!=0 && seqLengths.get(seqTag)!=lastLength) { |
152 |
System.err.println("Warning: Some sequences in alignment have different lengths."); |
153 |
} |
154 |
lastLength = seqLengths.get(seqTag); |
155 |
} |
156 |
} |
157 |
|
158 |
private void readFilePIRFormat(String fileName) throws IOException, FileNotFoundException, PirFileFormatError { |
159 |
String nextLine = "", |
160 |
currentSeq = "", |
161 |
currentSeqTag = ""; |
162 |
boolean foundFastaHeader = false; |
163 |
int lineNum = 0; |
164 |
|
165 |
// open file |
166 |
|
167 |
BufferedReader fileIn = new BufferedReader(new FileReader(fileName)); |
168 |
|
169 |
// read file |
170 |
|
171 |
|
172 |
// otherwise initalize TreeMap of sequences and rewind file |
173 |
sequences = new TreeMap<String,String>(); |
174 |
fileIn.reset(); |
175 |
|
176 |
// read sequences |
177 |
while((nextLine = fileIn.readLine()) != null) { |
178 |
++lineNum; |
179 |
nextLine = nextLine.trim(); // remove whitespace |
180 |
if(nextLine.length() > 0) { // ignore empty lines |
181 |
if(nextLine.charAt(0) == '*') { // finish last sequence |
182 |
sequences.put(currentSeqTag, currentSeq); |
183 |
} else { |
184 |
Pattern p = Pattern.compile(FASTAHEADER_REGEX); |
185 |
Matcher m = p.matcher(nextLine); |
186 |
if (m.find()){ // start new sequence |
187 |
currentSeq = ""; |
188 |
currentSeqTag=m.group(1); |
189 |
foundFastaHeader = true; |
190 |
} else { |
191 |
currentSeq = currentSeq + nextLine; // read sequence |
192 |
} |
193 |
} |
194 |
} |
195 |
} // end while |
196 |
|
197 |
|
198 |
fileIn.close(); |
199 |
|
200 |
// if no fasta headers found, file format is wrong |
201 |
if(!foundFastaHeader) { |
202 |
throw new PirFileFormatError("File does not conform with Pir file format (could not detect any fasta header in the file).",fileName,(long)lineNum); |
203 |
} |
204 |
|
205 |
} |
206 |
|
207 |
private void readFileFastaFormat(String fileName) throws IOException, FileNotFoundException, FastaFileFormatError { |
208 |
String nextLine = "", |
209 |
currentSeq = "", |
210 |
lastSeqTag = ""; |
211 |
boolean foundFastaHeader = false; |
212 |
long lineNum = 0; |
213 |
|
214 |
// open file |
215 |
|
216 |
BufferedReader fileIn = new BufferedReader(new FileReader(fileName)); |
217 |
|
218 |
// read file |
219 |
|
220 |
// initalize TreeMap of sequences |
221 |
sequences = new TreeMap<String,String>(); |
222 |
|
223 |
// read sequences |
224 |
while((nextLine = fileIn.readLine()) != null) { |
225 |
++lineNum; |
226 |
nextLine = nextLine.trim(); // remove whitespace |
227 |
if(nextLine.length() > 0) { // ignore empty lines |
228 |
Pattern p = Pattern.compile(FASTAHEADER_REGEX); |
229 |
Matcher m = p.matcher(nextLine); |
230 |
if (m.find()){ |
231 |
if (!lastSeqTag.equals("")) { |
232 |
sequences.put(lastSeqTag,currentSeq); |
233 |
currentSeq = ""; |
234 |
} |
235 |
lastSeqTag=m.group(1); |
236 |
foundFastaHeader = true; |
237 |
} else { |
238 |
currentSeq += nextLine; |
239 |
} |
240 |
} |
241 |
} // end while |
242 |
// inserting last sequence |
243 |
sequences.put(lastSeqTag,currentSeq); |
244 |
|
245 |
fileIn.close(); |
246 |
|
247 |
// if no fasta headers found, file format is wrong |
248 |
if(!foundFastaHeader) { |
249 |
throw new FastaFileFormatError("File does not conform with Fasta file format (could not find any fasta header in the file).",fileName,lineNum); |
250 |
} |
251 |
|
252 |
} |
253 |
|
254 |
|
255 |
/*---------------------------- public methods ---------------------------*/ |
256 |
|
257 |
/** |
258 |
* Returns the gap character |
259 |
* @return The gap character |
260 |
*/ |
261 |
public static char getGapCharacter() { return GAPCHARACTER; } |
262 |
|
263 |
/** |
264 |
* Returns a TreeMap (keys: tags, values: sequences) with all sequences |
265 |
* @return |
266 |
*/ |
267 |
public TreeMap<String,String> getSequences() { return sequences; } |
268 |
|
269 |
/** |
270 |
* Returns the sequence (with gaps) given a sequence tag |
271 |
* @param seqTag |
272 |
* @return |
273 |
*/ |
274 |
public String getAlignedSequence(String seqTag) { return sequences.get(seqTag); } |
275 |
|
276 |
/** |
277 |
* Returns the length of the alignment (including gaps) |
278 |
* @return |
279 |
*/ |
280 |
public int getAlignmentLength() { return sequences.get(sequences.firstKey()).length(); } |
281 |
|
282 |
/** |
283 |
* Returns the total number of sequences in the alignment |
284 |
* @return |
285 |
*/ |
286 |
public int getNumberOfSequences() { return sequences.size(); } |
287 |
|
288 |
/** |
289 |
* Returns true if alignment contains the sequence identified by seqTag |
290 |
* @param seqTag |
291 |
* @return |
292 |
*/ |
293 |
public boolean hasTag(String seqTag){ |
294 |
return sequences.containsKey(seqTag); |
295 |
} |
296 |
|
297 |
/** |
298 |
* Returns all sequence tags in a Set<String> |
299 |
* @return |
300 |
*/ |
301 |
public Set<String> getTags(){ |
302 |
return sequences.keySet(); |
303 |
} |
304 |
|
305 |
/** |
306 |
* Returns sequence seqTag with no gaps |
307 |
* @param seqNumber |
308 |
* @return |
309 |
*/ |
310 |
public String getSequenceNoGaps(String seqTag){ |
311 |
String seq = ""; |
312 |
for (int i=0;i<getAlignmentLength();i++){ |
313 |
char letter = sequences.get(seqTag).charAt(i); |
314 |
if (letter!=GAPCHARACTER){ |
315 |
seq+=letter; |
316 |
} |
317 |
} |
318 |
return seq; |
319 |
} |
320 |
|
321 |
/** |
322 |
* Given the alignment index (starting at 0, with gaps), |
323 |
* returns the sequence index (starting at 1, no gaps) of sequence seqTag |
324 |
* Returns -1 if sequence at that position is a gap |
325 |
* @param seqTag |
326 |
* @param alignIndex |
327 |
* @return |
328 |
*/ |
329 |
public int al2seq(String seqTag, int alignIndex){ |
330 |
return mapAlign2Seq.get(seqTag).get(alignIndex); |
331 |
} |
332 |
|
333 |
/** |
334 |
* Given sequence index (starting at 1, no gaps) of sequence seqTag, |
335 |
* returns the alignment index (starting at 0, with gaps) |
336 |
* @param seqTag |
337 |
* @param seqIndex |
338 |
* @return the alignment index or -1 on error |
339 |
*/ |
340 |
public int seq2al(String seqTag, int seqIndex) { |
341 |
int ret = -1; |
342 |
TreeMap<Integer,Integer> map = mapSeq2Align.get(seqTag); |
343 |
if(map.containsKey(seqIndex)) { |
344 |
ret = map.get(seqIndex); |
345 |
} else { |
346 |
System.err.println("Sever error in seq2al: Index " + seqIndex + " out of bounds for sequence " + seqTag + " (max index=" + map.lastKey() + ")"); |
347 |
} |
348 |
return ret; |
349 |
} |
350 |
|
351 |
/** |
352 |
* Gets column alignIndex of the alignment as a String |
353 |
* @param alignIndex |
354 |
* @return |
355 |
*/ |
356 |
public String getColumn(int alignIndex){ |
357 |
String col=""; |
358 |
for (String seq:sequences.values()){ |
359 |
col+=seq.charAt(alignIndex); |
360 |
} |
361 |
return col; |
362 |
} |
363 |
|
364 |
/** |
365 |
* Writes alignment by columns in tab delimited format, |
366 |
* useful to import to MySQL |
367 |
* |
368 |
*/ |
369 |
public void writeTabDelimited(){ |
370 |
for (int alignIndex=0;alignIndex<getAlignmentLength();alignIndex++){ |
371 |
for (String seq:sequences.values()){ |
372 |
System.out.print(seq.charAt(alignIndex)+"\t"); |
373 |
} |
374 |
System.out.print((alignIndex+1)+"\t"); |
375 |
for (String seqTag:sequences.keySet()){ |
376 |
int seqIndex = al2seq(seqTag, alignIndex); |
377 |
if (seqIndex!=-1){ // everything not gaps |
378 |
System.out.print(seqIndex+"\t"); |
379 |
} else { // gaps |
380 |
System.out.print("\\N\t"); |
381 |
} |
382 |
} |
383 |
System.out.println(); |
384 |
} |
385 |
} |
386 |
|
387 |
/** |
388 |
* Prints the alignment in simple text format (without sequence tags) to stdout |
389 |
*/ |
390 |
public void printSimple() { |
391 |
for(String seqTag:sequences.keySet()) { |
392 |
System.out.println(getAlignedSequence(seqTag)); |
393 |
} |
394 |
} |
395 |
|
396 |
/** |
397 |
* Prints the alignment in fasta format to stdout |
398 |
*/ |
399 |
public void printFasta() { |
400 |
for(String seqTag:sequences.keySet()) { |
401 |
System.out.println(FASTAHEADER_CHAR + seqTag); |
402 |
System.out.println(getAlignedSequence(seqTag)); |
403 |
} |
404 |
} |
405 |
|
406 |
/** |
407 |
* Gets list of consecutive non-gapped sub-sequences (by means of an edge set). |
408 |
* Example (X-denote any valid atomic sequence component): |
409 |
* <p> |
410 |
* |
411 |
* The data:<br> |
412 |
* <code>s1: XXX---XXX-X--X</code><br> |
413 |
* <code>s2: XXX---XXXX-XXX</code><br> |
414 |
* <code>s3: --XXXX--XX-XXX</code><br> |
415 |
* <p> |
416 |
* |
417 |
* The function calls:<br> |
418 |
* <code>TreeMap m = new TreeMap();</code><br> |
419 |
* <code>m.put("s1","XXX---XXX-X--X");</code><br> |
420 |
* <code>m.put("s2","XXX---XXXX-XXX");</code><br> |
421 |
* <code>m.put("s3","--XXXX--XX-XXX");</code><br> |
422 |
* <code>Alignment ali = new Alignment(m);</code><br> |
423 |
* <code>String[] tagSet1 = new String[1];</code><br> |
424 |
* <code>String[] tagSet2 = new String[2];</code><br> |
425 |
* <code>tagSet1[0] = "s1";</code><br> |
426 |
* <code>tagSet2[0] = "s2";</code><br> |
427 |
* <code>tagSet2[1] = "s3";</code><br> |
428 |
* <code>System.out.println(ali.getConsecutiveChunks("s2",tagSet1));</code><br> |
429 |
* <code>System.out.println(ali.getConsecutiveChunks("s1",tagSet2));</code><br> |
430 |
* <p> |
431 |
* The output:<br> |
432 |
* <code>[0 6, 7 9]</code><br> |
433 |
* <code>[0 2, 3 5, 6 6, 7 7]</code><br> |
434 |
* |
435 |
* @param tag tag of the sequence of which the chunk list is be determined |
436 |
* @param projectionTags list of tags of sequences in the alignment whose |
437 |
* projection along with sequence named tag is to be used as projection |
438 |
* from the whole alignment. Note, that invoking this function with |
439 |
* {@link #getTags()} set to this parameter, considers the whole alignment |
440 |
* matrix. |
441 |
* |
442 |
* @return edge set representing the sequence of non-gapped sequence chunks, |
443 |
* where for each edge in the set Edge.i denotes the starting position and |
444 |
* Edge.j the ending position of a chunk, i.e., for a separated atomic |
445 |
* sequence component (e.g., between two gaps) this notion yields i == j. |
446 |
* The indexing respects the sequence indexing for this class, i.e., index |
447 |
* 1 corresponds to the first position in the sequence. |
448 |
* |
449 |
* @throws IndexOutOfBoundsException |
450 |
*/ |
451 |
public EdgeSet getMatchingBlocks(String tag, Collection<String> projectionTags, int begin, int end, int degOfConservation) |
452 |
throws IndexOutOfBoundsException { |
453 |
|
454 |
if( end > getAlignmentLength() ) { |
455 |
throw new IndexOutOfBoundsException("end position exceeds alignment length"); |
456 |
} |
457 |
|
458 |
/* |
459 |
* col - current alignment column |
460 |
* start - start column for the next chunk to be added |
461 |
* foundStart - flag set whenever a start position for the next chunk |
462 |
* to be added has been encountered |
463 |
* c - observed character in sequence 'tag' in column 'col' |
464 |
* limit - maximal number of tolerated gap characters at a certain |
465 |
* alignment column with respect to the sequences |
466 |
* referencened in 'projectionTags' |
467 |
* chunks - the list of consecutive chunks to be returned |
468 |
*/ |
469 |
int col = begin; |
470 |
int start = 0; |
471 |
char c = '-'; |
472 |
boolean foundStart = false; |
473 |
int limit = Math.max(projectionTags.size() - degOfConservation,0); |
474 |
EdgeSet chunks = new EdgeSet(); |
475 |
|
476 |
while( col<end ) { |
477 |
c = getAlignedSequence(tag).charAt(col); |
478 |
|
479 |
if( c == getGapCharacter() ) { |
480 |
if( foundStart ) { |
481 |
foundStart = false; |
482 |
chunks.add(new Edge(al2seq(tag,start),al2seq(tag,Math.max(start, col-1)))); |
483 |
} |
484 |
} else { |
485 |
if( limit >= count(projectionTags,col,getGapCharacter()) ) { |
486 |
if( !foundStart ) { |
487 |
foundStart = true; |
488 |
start = col; |
489 |
} |
490 |
} else { |
491 |
if( foundStart ) { |
492 |
foundStart = false; |
493 |
chunks.add(new Edge(al2seq(tag,start),al2seq(tag,col-1))); |
494 |
} |
495 |
} |
496 |
} |
497 |
++col; |
498 |
} |
499 |
|
500 |
if( foundStart ) { |
501 |
chunks.add(new Edge(al2seq(tag,start),al2seq(tag,Math.max(start,col-1)))); |
502 |
} |
503 |
|
504 |
return chunks; |
505 |
} |
506 |
|
507 |
/** |
508 |
* |
509 |
* @param tag |
510 |
* @param projectionTags |
511 |
* @param positions |
512 |
* @param degOfConservation |
513 |
* @return The indexing respects the sequence indexing for this class, i.e., index 1 corresponds to the first position in the sequence. |
514 |
* @throws IndexOutOfBoundsException |
515 |
*/ |
516 |
public EdgeSet getMatchingBlocks(String tag, Collection<String> projectionTags, NodeSet positions, int degOfConservation) |
517 |
throws IndexOutOfBoundsException { |
518 |
|
519 |
/* |
520 |
* col - current alignment column |
521 |
* prevCol - previous alignment column |
522 |
* start - start column for the next chunk to be added |
523 |
* foundStart - flag set whenever a start position for the next chunk |
524 |
* to be added has been encountered |
525 |
* c - observed character in sequence 'tag' in column 'col' |
526 |
* limit - maximal number of tolerated gap characters at a certain |
527 |
* alignment column with respect to the sequences |
528 |
* referencened in 'projectionTags' |
529 |
* chunks - the list of consecutive chunks to be returned |
530 |
*/ |
531 |
int col = positions.iterator().next().num; |
532 |
int prevCol = 0; |
533 |
int start = 0; |
534 |
boolean foundStart = false; |
535 |
char c = '-'; |
536 |
int limit = Math.max(projectionTags.size() - degOfConservation,0); |
537 |
EdgeSet chunks = new EdgeSet(); |
538 |
|
539 |
for(Iterator<Node> it = positions.iterator(); it.hasNext(); ) { |
540 |
prevCol = col; |
541 |
col = it.next().num; |
542 |
c = getAlignedSequence(tag).charAt(col); |
543 |
|
544 |
if( c == getGapCharacter() ) { |
545 |
if( foundStart ) { |
546 |
// complete chunk |
547 |
chunks.add(new Edge(al2seq(tag,start),al2seq(tag,prevCol))); |
548 |
foundStart = false; |
549 |
} |
550 |
} else if ( limit >= count(projectionTags,col,getGapCharacter()) ) { |
551 |
if( foundStart ) { |
552 |
if( col - prevCol > 1 ) { |
553 |
// we allow the in between positions only to consist |
554 |
// of gap characters. otherwise we have to complete the |
555 |
// current chunk as the in-between non-gap positions |
556 |
// are not contained in 'positions' |
557 |
if( isBlockOf(tag,prevCol,col,getGapCharacter()) ) { |
558 |
for( String t : projectionTags) { |
559 |
if( !isBlockOf(t,prevCol,col,getGapCharacter()) ) { |
560 |
foundStart = false; |
561 |
break; |
562 |
} |
563 |
} |
564 |
} else { |
565 |
foundStart = false; |
566 |
} |
567 |
|
568 |
// please note that the 'foundStart' variable is |
569 |
// abused in the preceding if-clause to avoid the |
570 |
// allocation of an additional boolean |
571 |
if( !foundStart ) { |
572 |
// complete chunk |
573 |
chunks.add(new Edge(al2seq(tag,start),al2seq(tag,prevCol))); |
574 |
foundStart = true; |
575 |
start = col; |
576 |
} |
577 |
} // else: current chunk can easily be extended |
578 |
} else { |
579 |
foundStart = true; |
580 |
start = col; |
581 |
} |
582 |
} else { |
583 |
if( foundStart ) { |
584 |
foundStart = false; |
585 |
chunks.add(new Edge(al2seq(tag,start),al2seq(tag,prevCol))); |
586 |
} |
587 |
} |
588 |
} |
589 |
|
590 |
if( foundStart ) { |
591 |
// complete last chunk |
592 |
chunks.add(new Edge(al2seq(tag,start),al2seq(tag,col))); |
593 |
} |
594 |
|
595 |
return chunks; |
596 |
} |
597 |
|
598 |
/** |
599 |
* Extracts from the set of given alignment position those without gaps. |
600 |
* @param projectionTags tags of the sequences to be considered |
601 |
* @param positions alignment positions, i.e., indices of some columns |
602 |
* @param extractInPlace enable this flag to directly delete all nodes |
603 |
* pointing to "non-gapless" columns positions, set this parameter to |
604 |
* false to return a new node set, i.e., 'positions' remains unchanged! |
605 |
* @return a node set corresponding to indices of alignment columns out of |
606 |
* the set of considered columns ('positions'). Please note, that |
607 |
* parameter 'extractInPlace' has an immense impact on the output |
608 |
* generating. |
609 |
*/ |
610 |
public NodeSet getGaplessColumns(Collection<String> projectionTags, NodeSet positions, boolean extractInPlace) { |
611 |
|
612 |
// this node set will be filled and returned if the in place editing of |
613 |
// parameter 'positions' is disabled |
614 |
NodeSet output = null; |
615 |
if( !extractInPlace ) { |
616 |
output = new NodeSet(); |
617 |
} |
618 |
|
619 |
int numProjTags = projectionTags.size(); |
620 |
int col = 0; |
621 |
|
622 |
for( Iterator<Node> it = positions.iterator(); it.hasNext(); ) { |
623 |
col = it.next().num; |
624 |
if( numProjTags != count(projectionTags, col, getGapCharacter()) ) { |
625 |
// this column contains at least one gap |
626 |
if( extractInPlace ) { |
627 |
// remove corresponding item in 'positions' |
628 |
it.remove(); |
629 |
} |
630 |
} else if( !extractInPlace ) { |
631 |
// gapless column found -> record this event in 'output' (as |
632 |
// 'positions' is not editable) |
633 |
output.add(new Node(col)); |
634 |
} |
635 |
} |
636 |
|
637 |
// return the correct node set |
638 |
if( extractInPlace ) { |
639 |
return positions; |
640 |
} else { |
641 |
return output; |
642 |
} |
643 |
} |
644 |
|
645 |
/** |
646 |
* Counts the number of occurrences of the given character at the given |
647 |
* alignment column. The sequences to be considered is limited to the |
648 |
* given collection of alignment tags. |
649 |
* @param tags tags of the sequences to be considered |
650 |
* @param col |
651 |
* @param c |
652 |
* @return |
653 |
*/ |
654 |
public int count(Collection<String> tags, int col, char c) throws IndexOutOfBoundsException { |
655 |
int i=0; |
656 |
for( String t : tags ) { |
657 |
if( getAlignedSequence(t).charAt(col) == c ) { |
658 |
++i; |
659 |
} |
660 |
} |
661 |
return i; |
662 |
} |
663 |
|
664 |
public boolean isBlockOf( String tag, int begin, int end, char c ) throws IndexOutOfBoundsException { |
665 |
for(int i=begin; i<end; ++i) { |
666 |
if( getAlignedSequence(tag).charAt(i) != c ) { |
667 |
return false; |
668 |
} |
669 |
} |
670 |
return true; |
671 |
} |
672 |
|
673 |
/** to test the class */ |
674 |
public static void main(String[] args) throws FileNotFoundException, IOException { |
675 |
if (args.length<1){ |
676 |
System.err.println("Must provide FASTA file name as argument"); |
677 |
System.exit(1); |
678 |
} |
679 |
String fileName=args[0]; |
680 |
|
681 |
try { |
682 |
Alignment al = new Alignment(fileName,"FASTA"); |
683 |
|
684 |
|
685 |
// print columns |
686 |
//for (int i=0;i<al.getSequenceLength();i++){ |
687 |
// System.out.println(al.getColumn(i)); |
688 |
//} |
689 |
// print all sequences tags and sequences |
690 |
for (String seqTag:al.getSequences().keySet()){ |
691 |
System.out.println(seqTag); |
692 |
System.out.println(al.getAlignedSequence(seqTag)); |
693 |
} |
694 |
// test of al2seq |
695 |
//for (int i=0;i<al.getSequenceLength();i++) { |
696 |
// System.out.println("alignment serial: "+i+", seq serial: "+al.al2seq(al.sequences.firstKey(),i)); |
697 |
//} |
698 |
// test of seq2al |
699 |
//for (int serial=1;serial<=al.getSequenceNoGaps(al.sequences.firstKey()).length();serial++){ |
700 |
// System.out.println("seq serial: "+serial+", alignment serial: "+al.seq2al(al.sequences.firstKey(), serial)); |
701 |
//} |
702 |
// print alignment by columns tab delimited |
703 |
//al.writeTabDelimited(); |
704 |
} catch(Exception e) { |
705 |
System.err.println(e.getMessage()); |
706 |
System.exit(-1); |
707 |
} |
708 |
} |
709 |
|
710 |
} |