ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Alignment.java
Revision: 399
Committed: Mon Nov 12 15:56:00 2007 UTC (17 years, 3 months ago) by stehr
File size: 22867 byte(s)
Log Message:
added class PairwiseSequenceAlignment
Line File contents
1 package proteinstructure;
2
3 import java.io.FileReader;
4 import java.io.BufferedReader;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.io.OutputStream;
8 import java.util.Collection;
9 import java.util.Iterator;
10 import java.util.Set;
11 import java.util.TreeMap;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14
15 /**
16 * Package: proteinstructure
17 * Class: Alignment
18 * Author: Henning Stehr, Jose Duarte, Lars Petzold
19 *
20 * A multiple protein sequence alignment. This class represents a set of
21 * protein sequences which are globally aligned.
22 *
23 */
24 public class Alignment {
25
26 /*------------------------------ constants ------------------------------*/
27
28 private static final char GAPCHARACTER = '-';
29 private static final String PIRFORMAT = "PIR";
30 private static final String FASTAFORMAT = "FASTA";
31 private static final String FASTAHEADER_REGEX = "^>\\s*([a-zA-Z0-9_|\\-]+)";
32 private static final String FASTAHEADER_CHAR = ">";
33
34 /*--------------------------- member variables --------------------------*/
35
36 private TreeMap<String,String> sequences;
37
38 private TreeMap<String,TreeMap<Integer,Integer>> mapAlign2Seq; // key is sequence tag, the values are maps of alignment serials to sequence serials
39 private TreeMap<String,TreeMap<Integer,Integer>> mapSeq2Align; // key is sequence tag, the values are maps of sequence serials to alignment serials
40
41 /*----------------------------- constructors ----------------------------*/
42
43 public Alignment() {}
44
45 /**
46 * Reads an alignment from a file in either FASTA or PIR format
47 */
48 public Alignment(String fileName, String format) throws IOException, FileNotFoundException, PirFileFormatError, FastaFileFormatError {
49 if (format.equals(PIRFORMAT)){
50 readFilePIRFormat(fileName);
51 } else if (format.equals(FASTAFORMAT)){
52 readFileFastaFormat(fileName);
53 }
54
55 // checking lenghts, i.e. checking we read correctly from file
56 checkLengths();
57 // map sequence serials (starting at 1, no gaps) to alignment serials (starting at 0, possibly gaps)
58 doMapping();
59 }
60
61 /**
62 * Creates a trivial alignment (i.e. without gaps) for the given sequences. The sequences have to have the same lengths.
63 * @param sequence
64 * @param numberOfCopies
65 */
66 public Alignment(TreeMap<String, String> sequences) throws AlignmentConstructionError {
67
68 // check that sequences have the same length
69 int length = sequences.get(sequences.firstKey()).length();
70 for(String seqTag: sequences.keySet()) {
71 if(sequences.get(seqTag).length() != length) {
72 throw new AlignmentConstructionError("Cannot create trivial alignment. Sequence lenghts are not the same.");
73 }
74 }
75
76 this.sequences = sequences;
77 doMapping();
78
79 }
80
81 /**
82 * Writes alignment to the given output stream. The output format
83 * conforms to the FASTA format.
84 * @param out the output stream to be printed to
85 * @param lineLength the maximal line length, setting this to null
86 * always results in 80 characters per line
87 * @param alignedSeqs toggles the output of the aligned or ungapped
88 * sequences
89 * @throws IOException
90 */
91 public void writeFasta(OutputStream out, Integer lineLength, boolean alignedSeqs) throws IOException {
92 int len = 80;
93 String seq = "";
94
95 if( lineLength != null ) {
96 len = lineLength;
97 }
98
99 for( String name : getTags() ) {
100 seq = alignedSeqs ? getAlignedSequence(name) : getSequenceNoGaps(name);
101 out.write('>');
102 out.write(name.getBytes());
103 out.write(System.getProperty("line.separator").getBytes());
104 for(int i=0; i<seq.length(); i+=len) {
105 out.write(seq.substring(i, Math.min(i+len,seq.length())).getBytes());
106 out.write(System.getProperty("line.separator").getBytes());
107 }
108 }
109 }
110
111 /*---------------------------- private methods --------------------------*/
112
113 private void doMapping() {
114 this.mapAlign2Seq = new TreeMap<String, TreeMap<Integer,Integer>>();
115 this.mapSeq2Align = new TreeMap<String, TreeMap<Integer,Integer>>();
116 for (String seqTag:sequences.keySet()){
117 this.mapAlign2Seq.put(seqTag, new TreeMap<Integer,Integer>());
118 this.mapSeq2Align.put(seqTag, new TreeMap<Integer,Integer>());
119 }
120 for (String seqTag:sequences.keySet()){
121 String seq = sequences.get(seqTag);
122 int seqIndex = 1;
123 for (int alignIndex=0;alignIndex<seq.length();alignIndex++){
124 if (seq.charAt(alignIndex)!=GAPCHARACTER) {
125 mapAlign2Seq.get(seqTag).put(alignIndex,seqIndex);
126 seqIndex++;
127 } else { // for gaps we assing a -1
128 mapAlign2Seq.get(seqTag).put(alignIndex,-1);
129 }
130 }
131 }
132 for (String seqTag:sequences.keySet()){
133 for (int alignIndex:mapAlign2Seq.get(seqTag).keySet()){
134 int seqIndex = mapAlign2Seq.get(seqTag).get(alignIndex);
135 if (seqIndex!=-1){
136 mapSeq2Align.get(seqTag).put(seqIndex,alignIndex);
137 }
138 }
139 }
140
141 }
142
143 private void checkLengths() {
144 // checking all read sequences have same length
145 TreeMap<String,Integer> seqLengths = new TreeMap<String, Integer>();
146 for (String seqTag:sequences.keySet()){
147 seqLengths.put(seqTag,sequences.get(seqTag).length());
148 }
149 int lastLength = 0;
150 for (String seqTag:seqLengths.keySet()){
151 if (lastLength!=0 && seqLengths.get(seqTag)!=lastLength) {
152 System.err.println("Warning: Some sequences in alignment have different lengths.");
153 }
154 lastLength = seqLengths.get(seqTag);
155 }
156 }
157
158 private void readFilePIRFormat(String fileName) throws IOException, FileNotFoundException, PirFileFormatError {
159 String nextLine = "",
160 currentSeq = "",
161 currentSeqTag = "";
162 boolean foundFastaHeader = false;
163 int lineNum = 0;
164
165 // open file
166
167 BufferedReader fileIn = new BufferedReader(new FileReader(fileName));
168
169 // read file
170
171
172 // otherwise initalize TreeMap of sequences and rewind file
173 sequences = new TreeMap<String,String>();
174 fileIn.reset();
175
176 // read sequences
177 while((nextLine = fileIn.readLine()) != null) {
178 ++lineNum;
179 nextLine = nextLine.trim(); // remove whitespace
180 if(nextLine.length() > 0) { // ignore empty lines
181 if(nextLine.charAt(0) == '*') { // finish last sequence
182 sequences.put(currentSeqTag, currentSeq);
183 } else {
184 Pattern p = Pattern.compile(FASTAHEADER_REGEX);
185 Matcher m = p.matcher(nextLine);
186 if (m.find()){ // start new sequence
187 currentSeq = "";
188 currentSeqTag=m.group(1);
189 foundFastaHeader = true;
190 } else {
191 currentSeq = currentSeq + nextLine; // read sequence
192 }
193 }
194 }
195 } // end while
196
197
198 fileIn.close();
199
200 // if no fasta headers found, file format is wrong
201 if(!foundFastaHeader) {
202 throw new PirFileFormatError("File does not conform with Pir file format (could not detect any fasta header in the file).",fileName,(long)lineNum);
203 }
204
205 }
206
207 private void readFileFastaFormat(String fileName) throws IOException, FileNotFoundException, FastaFileFormatError {
208 String nextLine = "",
209 currentSeq = "",
210 lastSeqTag = "";
211 boolean foundFastaHeader = false;
212 long lineNum = 0;
213
214 // open file
215
216 BufferedReader fileIn = new BufferedReader(new FileReader(fileName));
217
218 // read file
219
220 // initalize TreeMap of sequences
221 sequences = new TreeMap<String,String>();
222
223 // read sequences
224 while((nextLine = fileIn.readLine()) != null) {
225 ++lineNum;
226 nextLine = nextLine.trim(); // remove whitespace
227 if(nextLine.length() > 0) { // ignore empty lines
228 Pattern p = Pattern.compile(FASTAHEADER_REGEX);
229 Matcher m = p.matcher(nextLine);
230 if (m.find()){
231 if (!lastSeqTag.equals("")) {
232 sequences.put(lastSeqTag,currentSeq);
233 currentSeq = "";
234 }
235 lastSeqTag=m.group(1);
236 foundFastaHeader = true;
237 } else {
238 currentSeq += nextLine;
239 }
240 }
241 } // end while
242 // inserting last sequence
243 sequences.put(lastSeqTag,currentSeq);
244
245 fileIn.close();
246
247 // if no fasta headers found, file format is wrong
248 if(!foundFastaHeader) {
249 throw new FastaFileFormatError("File does not conform with Fasta file format (could not find any fasta header in the file).",fileName,lineNum);
250 }
251
252 }
253
254
255 /*---------------------------- public methods ---------------------------*/
256
257 /**
258 * Returns the gap character
259 * @return The gap character
260 */
261 public static char getGapCharacter() { return GAPCHARACTER; }
262
263 /**
264 * Returns a TreeMap (keys: tags, values: sequences) with all sequences
265 * @return
266 */
267 public TreeMap<String,String> getSequences() { return sequences; }
268
269 /**
270 * Returns the sequence (with gaps) given a sequence tag
271 * @param seqTag
272 * @return
273 */
274 public String getAlignedSequence(String seqTag) { return sequences.get(seqTag); }
275
276 /**
277 * Returns the length of the alignment (including gaps)
278 * @return
279 */
280 public int getAlignmentLength() { return sequences.get(sequences.firstKey()).length(); }
281
282 /**
283 * Returns the total number of sequences in the alignment
284 * @return
285 */
286 public int getNumberOfSequences() { return sequences.size(); }
287
288 /**
289 * Returns true if alignment contains the sequence identified by seqTag
290 * @param seqTag
291 * @return
292 */
293 public boolean hasTag(String seqTag){
294 return sequences.containsKey(seqTag);
295 }
296
297 /**
298 * Returns all sequence tags in a Set<String>
299 * @return
300 */
301 public Set<String> getTags(){
302 return sequences.keySet();
303 }
304
305 /**
306 * Returns sequence seqTag with no gaps
307 * @param seqNumber
308 * @return
309 */
310 public String getSequenceNoGaps(String seqTag){
311 String seq = "";
312 for (int i=0;i<getAlignmentLength();i++){
313 char letter = sequences.get(seqTag).charAt(i);
314 if (letter!=GAPCHARACTER){
315 seq+=letter;
316 }
317 }
318 return seq;
319 }
320
321 /**
322 * Given the alignment index (starting at 0, with gaps),
323 * returns the sequence index (starting at 1, no gaps) of sequence seqTag
324 * Returns -1 if sequence at that position is a gap
325 * @param seqTag
326 * @param alignIndex
327 * @return
328 */
329 public int al2seq(String seqTag, int alignIndex){
330 return mapAlign2Seq.get(seqTag).get(alignIndex);
331 }
332
333 /**
334 * Given sequence index (starting at 1, no gaps) of sequence seqTag,
335 * returns the alignment index (starting at 0, with gaps)
336 * @param seqTag
337 * @param seqIndex
338 * @return the alignment index or -1 on error
339 */
340 public int seq2al(String seqTag, int seqIndex) {
341 int ret = -1;
342 TreeMap<Integer,Integer> map = mapSeq2Align.get(seqTag);
343 if(map.containsKey(seqIndex)) {
344 ret = map.get(seqIndex);
345 } else {
346 System.err.println("Sever error in seq2al: Index " + seqIndex + " out of bounds for sequence " + seqTag + " (max index=" + map.lastKey() + ")");
347 }
348 return ret;
349 }
350
351 /**
352 * Gets column alignIndex of the alignment as a String
353 * @param alignIndex
354 * @return
355 */
356 public String getColumn(int alignIndex){
357 String col="";
358 for (String seq:sequences.values()){
359 col+=seq.charAt(alignIndex);
360 }
361 return col;
362 }
363
364 /**
365 * Writes alignment by columns in tab delimited format,
366 * useful to import to MySQL
367 *
368 */
369 public void writeTabDelimited(){
370 for (int alignIndex=0;alignIndex<getAlignmentLength();alignIndex++){
371 for (String seq:sequences.values()){
372 System.out.print(seq.charAt(alignIndex)+"\t");
373 }
374 System.out.print((alignIndex+1)+"\t");
375 for (String seqTag:sequences.keySet()){
376 int seqIndex = al2seq(seqTag, alignIndex);
377 if (seqIndex!=-1){ // everything not gaps
378 System.out.print(seqIndex+"\t");
379 } else { // gaps
380 System.out.print("\\N\t");
381 }
382 }
383 System.out.println();
384 }
385 }
386
387 /**
388 * Prints the alignment in simple text format (without sequence tags) to stdout
389 */
390 public void printSimple() {
391 for(String seqTag:sequences.keySet()) {
392 System.out.println(getAlignedSequence(seqTag));
393 }
394 }
395
396 /**
397 * Prints the alignment in fasta format to stdout
398 */
399 public void printFasta() {
400 for(String seqTag:sequences.keySet()) {
401 System.out.println(FASTAHEADER_CHAR + seqTag);
402 System.out.println(getAlignedSequence(seqTag));
403 }
404 }
405
406 /**
407 * Gets list of consecutive non-gapped sub-sequences (by means of an edge set).
408 * Example (X-denote any valid atomic sequence component):
409 * <p>
410 *
411 * The data:<br>
412 * <code>s1: XXX---XXX-X--X</code><br>
413 * <code>s2: XXX---XXXX-XXX</code><br>
414 * <code>s3: --XXXX--XX-XXX</code><br>
415 * <p>
416 *
417 * The function calls:<br>
418 * <code>TreeMap m = new TreeMap();</code><br>
419 * <code>m.put("s1","XXX---XXX-X--X");</code><br>
420 * <code>m.put("s2","XXX---XXXX-XXX");</code><br>
421 * <code>m.put("s3","--XXXX--XX-XXX");</code><br>
422 * <code>Alignment ali = new Alignment(m);</code><br>
423 * <code>String[] tagSet1 = new String[1];</code><br>
424 * <code>String[] tagSet2 = new String[2];</code><br>
425 * <code>tagSet1[0] = "s1";</code><br>
426 * <code>tagSet2[0] = "s2";</code><br>
427 * <code>tagSet2[1] = "s3";</code><br>
428 * <code>System.out.println(ali.getConsecutiveChunks("s2",tagSet1));</code><br>
429 * <code>System.out.println(ali.getConsecutiveChunks("s1",tagSet2));</code><br>
430 * <p>
431 * The output:<br>
432 * <code>[0 6, 7 9]</code><br>
433 * <code>[0 2, 3 5, 6 6, 7 7]</code><br>
434 *
435 * @param tag tag of the sequence of which the chunk list is be determined
436 * @param projectionTags list of tags of sequences in the alignment whose
437 * projection along with sequence named tag is to be used as projection
438 * from the whole alignment. Note, that invoking this function with
439 * {@link #getTags()} set to this parameter, considers the whole alignment
440 * matrix.
441 *
442 * @return edge set representing the sequence of non-gapped sequence chunks,
443 * where for each edge in the set Edge.i denotes the starting position and
444 * Edge.j the ending position of a chunk, i.e., for a separated atomic
445 * sequence component (e.g., between two gaps) this notion yields i == j.
446 * The indexing respects the sequence indexing for this class, i.e., index
447 * 1 corresponds to the first position in the sequence.
448 *
449 * @throws IndexOutOfBoundsException
450 */
451 public EdgeSet getMatchingBlocks(String tag, Collection<String> projectionTags, int begin, int end, int degOfConservation)
452 throws IndexOutOfBoundsException {
453
454 if( end > getAlignmentLength() ) {
455 throw new IndexOutOfBoundsException("end position exceeds alignment length");
456 }
457
458 /*
459 * col - current alignment column
460 * start - start column for the next chunk to be added
461 * foundStart - flag set whenever a start position for the next chunk
462 * to be added has been encountered
463 * c - observed character in sequence 'tag' in column 'col'
464 * limit - maximal number of tolerated gap characters at a certain
465 * alignment column with respect to the sequences
466 * referencened in 'projectionTags'
467 * chunks - the list of consecutive chunks to be returned
468 */
469 int col = begin;
470 int start = 0;
471 char c = '-';
472 boolean foundStart = false;
473 int limit = Math.max(projectionTags.size() - degOfConservation,0);
474 EdgeSet chunks = new EdgeSet();
475
476 while( col<end ) {
477 c = getAlignedSequence(tag).charAt(col);
478
479 if( c == getGapCharacter() ) {
480 if( foundStart ) {
481 foundStart = false;
482 chunks.add(new Edge(al2seq(tag,start),al2seq(tag,Math.max(start, col-1))));
483 }
484 } else {
485 if( limit >= count(projectionTags,col,getGapCharacter()) ) {
486 if( !foundStart ) {
487 foundStart = true;
488 start = col;
489 }
490 } else {
491 if( foundStart ) {
492 foundStart = false;
493 chunks.add(new Edge(al2seq(tag,start),al2seq(tag,col-1)));
494 }
495 }
496 }
497 ++col;
498 }
499
500 if( foundStart ) {
501 chunks.add(new Edge(al2seq(tag,start),al2seq(tag,Math.max(start,col-1))));
502 }
503
504 return chunks;
505 }
506
507 /**
508 *
509 * @param tag
510 * @param projectionTags
511 * @param positions
512 * @param degOfConservation
513 * @return The indexing respects the sequence indexing for this class, i.e., index 1 corresponds to the first position in the sequence.
514 * @throws IndexOutOfBoundsException
515 */
516 public EdgeSet getMatchingBlocks(String tag, Collection<String> projectionTags, NodeSet positions, int degOfConservation)
517 throws IndexOutOfBoundsException {
518
519 /*
520 * col - current alignment column
521 * prevCol - previous alignment column
522 * start - start column for the next chunk to be added
523 * foundStart - flag set whenever a start position for the next chunk
524 * to be added has been encountered
525 * c - observed character in sequence 'tag' in column 'col'
526 * limit - maximal number of tolerated gap characters at a certain
527 * alignment column with respect to the sequences
528 * referencened in 'projectionTags'
529 * chunks - the list of consecutive chunks to be returned
530 */
531 int col = positions.iterator().next().num;
532 int prevCol = 0;
533 int start = 0;
534 boolean foundStart = false;
535 char c = '-';
536 int limit = Math.max(projectionTags.size() - degOfConservation,0);
537 EdgeSet chunks = new EdgeSet();
538
539 for(Iterator<Node> it = positions.iterator(); it.hasNext(); ) {
540 prevCol = col;
541 col = it.next().num;
542 c = getAlignedSequence(tag).charAt(col);
543
544 if( c == getGapCharacter() ) {
545 if( foundStart ) {
546 // complete chunk
547 chunks.add(new Edge(al2seq(tag,start),al2seq(tag,prevCol)));
548 foundStart = false;
549 }
550 } else if ( limit >= count(projectionTags,col,getGapCharacter()) ) {
551 if( foundStart ) {
552 if( col - prevCol > 1 ) {
553 // we allow the in between positions only to consist
554 // of gap characters. otherwise we have to complete the
555 // current chunk as the in-between non-gap positions
556 // are not contained in 'positions'
557 if( isBlockOf(tag,prevCol,col,getGapCharacter()) ) {
558 for( String t : projectionTags) {
559 if( !isBlockOf(t,prevCol,col,getGapCharacter()) ) {
560 foundStart = false;
561 break;
562 }
563 }
564 } else {
565 foundStart = false;
566 }
567
568 // please note that the 'foundStart' variable is
569 // abused in the preceding if-clause to avoid the
570 // allocation of an additional boolean
571 if( !foundStart ) {
572 // complete chunk
573 chunks.add(new Edge(al2seq(tag,start),al2seq(tag,prevCol)));
574 foundStart = true;
575 start = col;
576 }
577 } // else: current chunk can easily be extended
578 } else {
579 foundStart = true;
580 start = col;
581 }
582 } else {
583 if( foundStart ) {
584 foundStart = false;
585 chunks.add(new Edge(al2seq(tag,start),al2seq(tag,prevCol)));
586 }
587 }
588 }
589
590 if( foundStart ) {
591 // complete last chunk
592 chunks.add(new Edge(al2seq(tag,start),al2seq(tag,col)));
593 }
594
595 return chunks;
596 }
597
598 /**
599 * Extracts from the set of given alignment position those without gaps.
600 * @param projectionTags tags of the sequences to be considered
601 * @param positions alignment positions, i.e., indices of some columns
602 * @param extractInPlace enable this flag to directly delete all nodes
603 * pointing to "non-gapless" columns positions, set this parameter to
604 * false to return a new node set, i.e., 'positions' remains unchanged!
605 * @return a node set corresponding to indices of alignment columns out of
606 * the set of considered columns ('positions'). Please note, that
607 * parameter 'extractInPlace' has an immense impact on the output
608 * generating.
609 */
610 public NodeSet getGaplessColumns(Collection<String> projectionTags, NodeSet positions, boolean extractInPlace) {
611
612 // this node set will be filled and returned if the in place editing of
613 // parameter 'positions' is disabled
614 NodeSet output = null;
615 if( !extractInPlace ) {
616 output = new NodeSet();
617 }
618
619 int numProjTags = projectionTags.size();
620 int col = 0;
621
622 for( Iterator<Node> it = positions.iterator(); it.hasNext(); ) {
623 col = it.next().num;
624 if( numProjTags != count(projectionTags, col, getGapCharacter()) ) {
625 // this column contains at least one gap
626 if( extractInPlace ) {
627 // remove corresponding item in 'positions'
628 it.remove();
629 }
630 } else if( !extractInPlace ) {
631 // gapless column found -> record this event in 'output' (as
632 // 'positions' is not editable)
633 output.add(new Node(col));
634 }
635 }
636
637 // return the correct node set
638 if( extractInPlace ) {
639 return positions;
640 } else {
641 return output;
642 }
643 }
644
645 /**
646 * Counts the number of occurrences of the given character at the given
647 * alignment column. The sequences to be considered is limited to the
648 * given collection of alignment tags.
649 * @param tags tags of the sequences to be considered
650 * @param col
651 * @param c
652 * @return
653 */
654 public int count(Collection<String> tags, int col, char c) throws IndexOutOfBoundsException {
655 int i=0;
656 for( String t : tags ) {
657 if( getAlignedSequence(t).charAt(col) == c ) {
658 ++i;
659 }
660 }
661 return i;
662 }
663
664 public boolean isBlockOf( String tag, int begin, int end, char c ) throws IndexOutOfBoundsException {
665 for(int i=begin; i<end; ++i) {
666 if( getAlignedSequence(tag).charAt(i) != c ) {
667 return false;
668 }
669 }
670 return true;
671 }
672
673 /** to test the class */
674 public static void main(String[] args) throws FileNotFoundException, IOException {
675 if (args.length<1){
676 System.err.println("Must provide FASTA file name as argument");
677 System.exit(1);
678 }
679 String fileName=args[0];
680
681 try {
682 Alignment al = new Alignment(fileName,"FASTA");
683
684
685 // print columns
686 //for (int i=0;i<al.getSequenceLength();i++){
687 // System.out.println(al.getColumn(i));
688 //}
689 // print all sequences tags and sequences
690 for (String seqTag:al.getSequences().keySet()){
691 System.out.println(seqTag);
692 System.out.println(al.getAlignedSequence(seqTag));
693 }
694 // test of al2seq
695 //for (int i=0;i<al.getSequenceLength();i++) {
696 // System.out.println("alignment serial: "+i+", seq serial: "+al.al2seq(al.sequences.firstKey(),i));
697 //}
698 // test of seq2al
699 //for (int serial=1;serial<=al.getSequenceNoGaps(al.sequences.firstKey()).length();serial++){
700 // System.out.println("seq serial: "+serial+", alignment serial: "+al.seq2al(al.sequences.firstKey(), serial));
701 //}
702 // print alignment by columns tab delimited
703 //al.writeTabDelimited();
704 } catch(Exception e) {
705 System.err.println(e.getMessage());
706 System.exit(-1);
707 }
708 }
709
710 }