ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Alignment.java
Revision: 245
Committed: Sat Aug 4 15:40:20 2007 UTC (17 years, 1 month ago) by duarte
File size: 10110 byte(s)
Log Message:
Added reverse map (from sequences serials to alignment serials) and did a bit of refactoring
Line User Rev File contents
1 duarte 244 package proteinstructure;
2    
3     import java.io.FileReader;
4     import java.io.BufferedReader;
5     import java.io.FileNotFoundException;
6     import java.io.IOException;
7     import java.util.regex.Matcher;
8     import java.util.regex.Pattern;
9    
10     /**
11     * Package: proteinstructure
12     * Class: Alignment
13     * Author: Henning Stehr, Jose Duarte
14     *
15     * A multiple protein sequence alignment. This class represents a set of
16     * protein sequences which are globally aligned.
17     *
18     */
19     public class Alignment {
20    
21     /*------------------------------ constants ------------------------------*/
22    
23     private static final char GAPCHARACTER = '-';
24     private static final String PIRFORMAT = "PIR";
25     private static final String FASTAFORMAT = "FASTA";
26    
27     /*--------------------------- member variables --------------------------*/
28    
29     String[] sequences;
30    
31     String[] sequenceTags;
32    
33 duarte 245 int[][] mapAlign2Seq; // first index is the sequence number, second is the alignment serials (indices)
34     int[][] mapSeq2Align;
35 duarte 244
36     /*----------------------------- constructors ----------------------------*/
37    
38     public Alignment(String fileName, String format) throws IOException, FileNotFoundException {
39     if (format.equals(PIRFORMAT)){
40     readFilePIRFormat(fileName);
41     } else if (format.equals(FASTAFORMAT)){
42     readFileFastaFormat(fileName);
43     }
44    
45     // map sequence serials (starting at 1, no gaps) to alignment serials (starting at 0, possibly gaps)
46     doMapping();
47     }
48    
49     /*---------------------------- private methods --------------------------*/
50    
51     private void doMapping() {
52 duarte 245 this.mapAlign2Seq = new int[getNumberOfSequences()][getSequenceLength()];
53     this.mapSeq2Align = new int[getNumberOfSequences()][getSequenceLength()];
54 duarte 244 for (int i=0;i<getNumberOfSequences();i++){
55     String seq = getSequence(i);
56     int serial = 1;
57     for (int j=0;j<seq.length();j++){
58     if (seq.charAt(j)!=GAPCHARACTER) {
59 duarte 245 mapAlign2Seq[i][j]=serial;
60 duarte 244 serial++;
61 duarte 245 } else { // for gaps we assing a -1
62     mapAlign2Seq[i][j]=-1;
63 duarte 244 }
64     }
65     }
66 duarte 245 for (int i=0;i<getNumberOfSequences();i++){
67     for (int j=0;j<getSequenceLength();j++){
68     int serial = mapAlign2Seq[i][j];
69     if (serial!=-1){
70     mapSeq2Align[i][serial]=j;
71     }
72     }
73     }
74 duarte 244
75     }
76    
77     private void readFilePIRFormat(String fileName) throws IOException, FileNotFoundException{
78     String nextLine = "",
79     currentSeq = "";
80     int numSeqs = 0,
81     currentSeqNum = 0;
82    
83     // open file
84    
85     BufferedReader fileIn = new BufferedReader(new FileReader(fileName));
86    
87     // read file
88    
89    
90     // count number of sequences in file, assume PIR format (sequences end with '\n*')
91     fileIn.mark(100000);
92     while((nextLine = fileIn.readLine()) != null) {
93     if(nextLine.length() > 0 && nextLine.charAt(0) == '>') {
94     numSeqs++;
95     }
96     }
97    
98     // if no fasta headers found, file format is wrong
99     if(numSeqs == 0) {
100     System.err.println("Error: " + fileName + " is not a "+PIRFORMAT+" file.");
101     System.exit(1);
102     //TODO throw exception
103     }
104    
105     // otherwise initalize array of sequences and rewind file
106     sequences = new String[numSeqs];
107     sequenceTags = new String[numSeqs];
108     fileIn.reset();
109    
110     // read sequences
111     while((nextLine = fileIn.readLine()) != null) {
112     nextLine = nextLine.trim(); // remove whitespace
113     if(nextLine.length() > 0) { // ignore empty lines
114     if(nextLine.charAt(0) == '*') { // finish last sequence
115     sequences[currentSeqNum] = currentSeq;
116     currentSeqNum++;
117     } else
118     if(nextLine.charAt(0) == '>') { // start new sequence
119     currentSeq = "";
120     Pattern p = Pattern.compile(">\\s*([a-zA-Z0-9_-]+)");
121     Matcher m = p.matcher(nextLine);
122     if (m.find()){
123     sequenceTags[currentSeqNum]=m.group(1);
124     }
125     } else {
126     currentSeq = currentSeq + nextLine; // read sequence
127     }
128     }
129     } // end while
130    
131     // verify number and lengths of sequences
132     if(currentSeqNum != numSeqs) {
133     System.err.println("Error: Could only read " + currentSeqNum + " out of "
134     + numSeqs + " sequences in file " + fileName);
135     System.exit(1);
136     //TODO throw exception
137     }
138    
139     for(currentSeqNum = 1; currentSeqNum < numSeqs; currentSeqNum++) {
140     if(sequences[currentSeqNum].length() != sequences[currentSeqNum-1].length()) {
141     System.err.println("Warning: Sequences " + (currentSeqNum-1) + " and " + currentSeqNum + " in alignment have different lengths.");
142     }
143     }
144    
145     // clean up
146     fileIn.close();
147     }
148    
149     private void readFileFastaFormat(String fileName) throws IOException, FileNotFoundException{
150     String nextLine = "",
151     currentSeq = "";
152     int numSeqs = 0,
153     currentSeqNum = 0;
154    
155     // open file
156    
157     BufferedReader fileIn = new BufferedReader(new FileReader(fileName));
158    
159     // read file
160    
161    
162     // count number of sequences in file
163     fileIn.mark(100000);
164     while((nextLine = fileIn.readLine()) != null) {
165     if(nextLine.length() > 0 && nextLine.charAt(0) == '>') {
166     numSeqs++;
167     }
168     }
169    
170     // if no fasta headers found, file format is wrong
171     if(numSeqs == 0) {
172     System.err.println("Error: " + fileName + " is not a "+FASTAFORMAT+" file.");
173     System.exit(1);
174     //TODO throw exception
175     }
176    
177     // otherwise initalize array of sequences and rewind file
178     sequences = new String[numSeqs];
179     sequenceTags = new String[numSeqs];
180     fileIn.reset();
181    
182     // read sequences
183     while((nextLine = fileIn.readLine()) != null) {
184     nextLine = nextLine.trim(); // remove whitespace
185     if(nextLine.length() > 0) { // ignore empty lines
186     if(nextLine.charAt(0) == '>') {
187     Pattern p = Pattern.compile(">\\s*([a-zA-Z0-9_-]+)");
188     Matcher m = p.matcher(nextLine);
189     if (m.find()){
190     sequenceTags[currentSeqNum]=m.group(1);
191     }
192     if (currentSeqNum!=0) {
193     sequences[currentSeqNum-1]=currentSeq;
194     currentSeq = "";
195     }
196     currentSeqNum++;
197     } else {
198     currentSeq += nextLine;
199     }
200     }
201     } // end while
202     // inserting last sequence
203     sequences[currentSeqNum-1]=currentSeq;
204    
205     // verify number and lengths of sequences
206     if(currentSeqNum != numSeqs) {
207     System.err.println("Error: Could only read " + currentSeqNum + " out of "
208     + numSeqs + " sequences in file " + fileName);
209     System.exit(1);
210     //TODO throw exception
211     }
212    
213     for(currentSeqNum = 1; currentSeqNum < numSeqs; currentSeqNum++) {
214     if(sequences[currentSeqNum].length() != sequences[currentSeqNum-1].length()) {
215     System.err.println("Warning: Sequences " + (currentSeqNum-1) + " and " + currentSeqNum + " in alignment have different lengths.");
216     }
217     }
218    
219     // clean up
220     fileIn.close();
221     }
222    
223    
224     /*---------------------------- public methods ---------------------------*/
225    
226     public char getGapCharacter() { return GAPCHARACTER; }
227     public String[] getSequences() { return sequences; }
228     public String getSequence(int i) { return sequences[i]; }
229     public int getSequenceLength() { return sequences[0].length(); }
230     //public int getSequenceLength(int i) { return sequences[i].length(); }
231     public int getNumberOfSequences() { return sequences.length; }
232    
233     /**
234 duarte 245 * Returns all sequence tags in a String[]
235     * @return
236     */
237     public String[] getTags(){
238     return sequenceTags;
239     }
240    
241     /**
242 duarte 244 * Get sequence's tag as present in fasta file
243     * @param i
244     * @return
245     */
246     public String getSequenceTag(int i){
247     return sequenceTags[i];
248     }
249    
250     /**
251 duarte 245 * Returns sequence seqNumber with no gaps
252     * @param seqNumber
253     * @return
254     */
255     public String getSequenceNoGaps(int seqNumber){
256     String seq = "";
257     for (int i=0;i<getSequenceLength();i++){
258     char letter = sequences[seqNumber].charAt(i);
259     if (letter!=GAPCHARACTER){
260     seq+=letter;
261     }
262     }
263     return seq;
264     }
265    
266     /**
267 duarte 244 * Given the alignment serial (starting at 0, with gaps),
268     * returns the sequence serial (starting at 1, no gaps) of sequence seqNumber
269 duarte 245 * Returns -1 if sequence at that position is a gap
270 duarte 244 * @param seqNumber
271     * @param alignmentSerial
272     * @return
273     */
274 duarte 245 public int al2seq(int seqNumber, int alignmentSerial){
275     return mapAlign2Seq[seqNumber][alignmentSerial];
276 duarte 244 }
277    
278     /**
279 duarte 245 * Given sequence serial of sequence seqNumber,
280     * returns the alignment serial
281     * @param seqNumber
282     * @param seqSerial
283     * @return
284     */
285     public int seq2al(int seqNumber, int seqSerial) {
286     return mapSeq2Align[seqNumber][seqSerial];
287     }
288    
289     /**
290 duarte 244 * Gets column i of the alignment as a String
291     * @param i
292     * @return
293     */
294     public String getColumn(int i){
295     String col="";
296     for (String seq:sequences){
297     col+=seq.charAt(i);
298     }
299     return col;
300     }
301    
302     /**
303     * Writes alignment by columns in tab delimited format,
304     * useful to import to MySQL
305     *
306     */
307     public void writeTabDelimited(){
308     for (int i=0;i<getSequenceLength();i++){
309     for (String seq:sequences){
310     System.out.print(seq.charAt(i)+"\t");
311     }
312     System.out.print((i+1)+"\t");
313     for (int seqNumber=0;seqNumber<getNumberOfSequences();seqNumber++){
314 duarte 245 int serial = al2seq(seqNumber, i);
315     if (serial!=-1){ // everything not gaps
316 duarte 244 System.out.print(serial+"\t");
317 duarte 245 } else { // gaps
318 duarte 244 System.out.print("\\N\t");
319     }
320     }
321     System.out.println();
322     }
323     }
324    
325     /** to test the class */
326     public static void main(String[] args) throws FileNotFoundException, IOException {
327     if (args.length<1){
328     System.err.println("Must provide FASTA file name as argument");
329     System.exit(1);
330     }
331     String fileName=args[0];
332     Alignment al = new Alignment(fileName,"FASTA");
333     // for (int i=0;i<al.getSequenceLength();i++){
334     // System.out.println(al.getColumn(i));
335     // }
336    
337 duarte 245 // for (int i=0;i<al.getNumberOfSequences();i++){
338     // System.out.println(al.getSequenceTag(i));
339     // System.out.println(al.getSequence(i));
340     // }
341     //for (int i=0;i<al.getSequenceLength();i++) {
342     //System.out.println("alignment serial: "+i+", seq serial: "+al.getSerialInSequence(0,i));
343     //}
344     for (int serial=1;serial<=al.getSequenceNoGaps(0).length();serial++){
345     System.out.println("seq serial: "+serial+", alignment serial: "+al.seq2al(0, serial));
346 duarte 244 }
347     //al.writeTabDelimited();
348     }
349    
350     }