1 |
duarte |
244 |
package proteinstructure; |
2 |
|
|
|
3 |
|
|
import java.io.FileReader; |
4 |
|
|
import java.io.BufferedReader; |
5 |
|
|
import java.io.FileNotFoundException; |
6 |
|
|
import java.io.IOException; |
7 |
|
|
import java.util.regex.Matcher; |
8 |
|
|
import java.util.regex.Pattern; |
9 |
|
|
|
10 |
|
|
/** |
11 |
|
|
* Package: proteinstructure |
12 |
|
|
* Class: Alignment |
13 |
|
|
* Author: Henning Stehr, Jose Duarte |
14 |
|
|
* |
15 |
|
|
* A multiple protein sequence alignment. This class represents a set of |
16 |
|
|
* protein sequences which are globally aligned. |
17 |
|
|
* |
18 |
|
|
*/ |
19 |
|
|
public class Alignment { |
20 |
|
|
|
21 |
|
|
/*------------------------------ constants ------------------------------*/ |
22 |
|
|
|
23 |
|
|
private static final char GAPCHARACTER = '-'; |
24 |
|
|
private static final String PIRFORMAT = "PIR"; |
25 |
|
|
private static final String FASTAFORMAT = "FASTA"; |
26 |
|
|
|
27 |
|
|
/*--------------------------- member variables --------------------------*/ |
28 |
|
|
|
29 |
|
|
String[] sequences; |
30 |
|
|
|
31 |
|
|
String[] sequenceTags; |
32 |
|
|
|
33 |
duarte |
245 |
int[][] mapAlign2Seq; // first index is the sequence number, second is the alignment serials (indices) |
34 |
|
|
int[][] mapSeq2Align; |
35 |
duarte |
244 |
|
36 |
|
|
/*----------------------------- constructors ----------------------------*/ |
37 |
|
|
|
38 |
|
|
public Alignment(String fileName, String format) throws IOException, FileNotFoundException { |
39 |
|
|
if (format.equals(PIRFORMAT)){ |
40 |
|
|
readFilePIRFormat(fileName); |
41 |
|
|
} else if (format.equals(FASTAFORMAT)){ |
42 |
|
|
readFileFastaFormat(fileName); |
43 |
|
|
} |
44 |
|
|
|
45 |
|
|
// map sequence serials (starting at 1, no gaps) to alignment serials (starting at 0, possibly gaps) |
46 |
|
|
doMapping(); |
47 |
|
|
} |
48 |
|
|
|
49 |
|
|
/*---------------------------- private methods --------------------------*/ |
50 |
|
|
|
51 |
|
|
private void doMapping() { |
52 |
duarte |
245 |
this.mapAlign2Seq = new int[getNumberOfSequences()][getSequenceLength()]; |
53 |
|
|
this.mapSeq2Align = new int[getNumberOfSequences()][getSequenceLength()]; |
54 |
duarte |
244 |
for (int i=0;i<getNumberOfSequences();i++){ |
55 |
|
|
String seq = getSequence(i); |
56 |
|
|
int serial = 1; |
57 |
|
|
for (int j=0;j<seq.length();j++){ |
58 |
|
|
if (seq.charAt(j)!=GAPCHARACTER) { |
59 |
duarte |
245 |
mapAlign2Seq[i][j]=serial; |
60 |
duarte |
244 |
serial++; |
61 |
duarte |
245 |
} else { // for gaps we assing a -1 |
62 |
|
|
mapAlign2Seq[i][j]=-1; |
63 |
duarte |
244 |
} |
64 |
|
|
} |
65 |
|
|
} |
66 |
duarte |
245 |
for (int i=0;i<getNumberOfSequences();i++){ |
67 |
|
|
for (int j=0;j<getSequenceLength();j++){ |
68 |
|
|
int serial = mapAlign2Seq[i][j]; |
69 |
|
|
if (serial!=-1){ |
70 |
|
|
mapSeq2Align[i][serial]=j; |
71 |
|
|
} |
72 |
|
|
} |
73 |
|
|
} |
74 |
duarte |
244 |
|
75 |
|
|
} |
76 |
|
|
|
77 |
|
|
private void readFilePIRFormat(String fileName) throws IOException, FileNotFoundException{ |
78 |
|
|
String nextLine = "", |
79 |
|
|
currentSeq = ""; |
80 |
|
|
int numSeqs = 0, |
81 |
|
|
currentSeqNum = 0; |
82 |
|
|
|
83 |
|
|
// open file |
84 |
|
|
|
85 |
|
|
BufferedReader fileIn = new BufferedReader(new FileReader(fileName)); |
86 |
|
|
|
87 |
|
|
// read file |
88 |
|
|
|
89 |
|
|
|
90 |
|
|
// count number of sequences in file, assume PIR format (sequences end with '\n*') |
91 |
|
|
fileIn.mark(100000); |
92 |
|
|
while((nextLine = fileIn.readLine()) != null) { |
93 |
|
|
if(nextLine.length() > 0 && nextLine.charAt(0) == '>') { |
94 |
|
|
numSeqs++; |
95 |
|
|
} |
96 |
|
|
} |
97 |
|
|
|
98 |
|
|
// if no fasta headers found, file format is wrong |
99 |
|
|
if(numSeqs == 0) { |
100 |
|
|
System.err.println("Error: " + fileName + " is not a "+PIRFORMAT+" file."); |
101 |
|
|
System.exit(1); |
102 |
|
|
//TODO throw exception |
103 |
|
|
} |
104 |
|
|
|
105 |
|
|
// otherwise initalize array of sequences and rewind file |
106 |
|
|
sequences = new String[numSeqs]; |
107 |
|
|
sequenceTags = new String[numSeqs]; |
108 |
|
|
fileIn.reset(); |
109 |
|
|
|
110 |
|
|
// read sequences |
111 |
|
|
while((nextLine = fileIn.readLine()) != null) { |
112 |
|
|
nextLine = nextLine.trim(); // remove whitespace |
113 |
|
|
if(nextLine.length() > 0) { // ignore empty lines |
114 |
|
|
if(nextLine.charAt(0) == '*') { // finish last sequence |
115 |
|
|
sequences[currentSeqNum] = currentSeq; |
116 |
|
|
currentSeqNum++; |
117 |
|
|
} else |
118 |
|
|
if(nextLine.charAt(0) == '>') { // start new sequence |
119 |
|
|
currentSeq = ""; |
120 |
|
|
Pattern p = Pattern.compile(">\\s*([a-zA-Z0-9_-]+)"); |
121 |
|
|
Matcher m = p.matcher(nextLine); |
122 |
|
|
if (m.find()){ |
123 |
|
|
sequenceTags[currentSeqNum]=m.group(1); |
124 |
|
|
} |
125 |
|
|
} else { |
126 |
|
|
currentSeq = currentSeq + nextLine; // read sequence |
127 |
|
|
} |
128 |
|
|
} |
129 |
|
|
} // end while |
130 |
|
|
|
131 |
|
|
// verify number and lengths of sequences |
132 |
|
|
if(currentSeqNum != numSeqs) { |
133 |
|
|
System.err.println("Error: Could only read " + currentSeqNum + " out of " |
134 |
|
|
+ numSeqs + " sequences in file " + fileName); |
135 |
|
|
System.exit(1); |
136 |
|
|
//TODO throw exception |
137 |
|
|
} |
138 |
|
|
|
139 |
|
|
for(currentSeqNum = 1; currentSeqNum < numSeqs; currentSeqNum++) { |
140 |
|
|
if(sequences[currentSeqNum].length() != sequences[currentSeqNum-1].length()) { |
141 |
|
|
System.err.println("Warning: Sequences " + (currentSeqNum-1) + " and " + currentSeqNum + " in alignment have different lengths."); |
142 |
|
|
} |
143 |
|
|
} |
144 |
|
|
|
145 |
|
|
// clean up |
146 |
|
|
fileIn.close(); |
147 |
|
|
} |
148 |
|
|
|
149 |
|
|
private void readFileFastaFormat(String fileName) throws IOException, FileNotFoundException{ |
150 |
|
|
String nextLine = "", |
151 |
|
|
currentSeq = ""; |
152 |
|
|
int numSeqs = 0, |
153 |
|
|
currentSeqNum = 0; |
154 |
|
|
|
155 |
|
|
// open file |
156 |
|
|
|
157 |
|
|
BufferedReader fileIn = new BufferedReader(new FileReader(fileName)); |
158 |
|
|
|
159 |
|
|
// read file |
160 |
|
|
|
161 |
|
|
|
162 |
|
|
// count number of sequences in file |
163 |
|
|
fileIn.mark(100000); |
164 |
|
|
while((nextLine = fileIn.readLine()) != null) { |
165 |
|
|
if(nextLine.length() > 0 && nextLine.charAt(0) == '>') { |
166 |
|
|
numSeqs++; |
167 |
|
|
} |
168 |
|
|
} |
169 |
|
|
|
170 |
|
|
// if no fasta headers found, file format is wrong |
171 |
|
|
if(numSeqs == 0) { |
172 |
|
|
System.err.println("Error: " + fileName + " is not a "+FASTAFORMAT+" file."); |
173 |
|
|
System.exit(1); |
174 |
|
|
//TODO throw exception |
175 |
|
|
} |
176 |
|
|
|
177 |
|
|
// otherwise initalize array of sequences and rewind file |
178 |
|
|
sequences = new String[numSeqs]; |
179 |
|
|
sequenceTags = new String[numSeqs]; |
180 |
|
|
fileIn.reset(); |
181 |
|
|
|
182 |
|
|
// read sequences |
183 |
|
|
while((nextLine = fileIn.readLine()) != null) { |
184 |
|
|
nextLine = nextLine.trim(); // remove whitespace |
185 |
|
|
if(nextLine.length() > 0) { // ignore empty lines |
186 |
|
|
if(nextLine.charAt(0) == '>') { |
187 |
|
|
Pattern p = Pattern.compile(">\\s*([a-zA-Z0-9_-]+)"); |
188 |
|
|
Matcher m = p.matcher(nextLine); |
189 |
|
|
if (m.find()){ |
190 |
|
|
sequenceTags[currentSeqNum]=m.group(1); |
191 |
|
|
} |
192 |
|
|
if (currentSeqNum!=0) { |
193 |
|
|
sequences[currentSeqNum-1]=currentSeq; |
194 |
|
|
currentSeq = ""; |
195 |
|
|
} |
196 |
|
|
currentSeqNum++; |
197 |
|
|
} else { |
198 |
|
|
currentSeq += nextLine; |
199 |
|
|
} |
200 |
|
|
} |
201 |
|
|
} // end while |
202 |
|
|
// inserting last sequence |
203 |
|
|
sequences[currentSeqNum-1]=currentSeq; |
204 |
|
|
|
205 |
|
|
// verify number and lengths of sequences |
206 |
|
|
if(currentSeqNum != numSeqs) { |
207 |
|
|
System.err.println("Error: Could only read " + currentSeqNum + " out of " |
208 |
|
|
+ numSeqs + " sequences in file " + fileName); |
209 |
|
|
System.exit(1); |
210 |
|
|
//TODO throw exception |
211 |
|
|
} |
212 |
|
|
|
213 |
|
|
for(currentSeqNum = 1; currentSeqNum < numSeqs; currentSeqNum++) { |
214 |
|
|
if(sequences[currentSeqNum].length() != sequences[currentSeqNum-1].length()) { |
215 |
|
|
System.err.println("Warning: Sequences " + (currentSeqNum-1) + " and " + currentSeqNum + " in alignment have different lengths."); |
216 |
|
|
} |
217 |
|
|
} |
218 |
|
|
|
219 |
|
|
// clean up |
220 |
|
|
fileIn.close(); |
221 |
|
|
} |
222 |
|
|
|
223 |
|
|
|
224 |
|
|
/*---------------------------- public methods ---------------------------*/ |
225 |
|
|
|
226 |
|
|
public char getGapCharacter() { return GAPCHARACTER; } |
227 |
|
|
public String[] getSequences() { return sequences; } |
228 |
|
|
public String getSequence(int i) { return sequences[i]; } |
229 |
|
|
public int getSequenceLength() { return sequences[0].length(); } |
230 |
|
|
//public int getSequenceLength(int i) { return sequences[i].length(); } |
231 |
|
|
public int getNumberOfSequences() { return sequences.length; } |
232 |
|
|
|
233 |
|
|
/** |
234 |
duarte |
245 |
* Returns all sequence tags in a String[] |
235 |
|
|
* @return |
236 |
|
|
*/ |
237 |
|
|
public String[] getTags(){ |
238 |
|
|
return sequenceTags; |
239 |
|
|
} |
240 |
|
|
|
241 |
|
|
/** |
242 |
duarte |
244 |
* Get sequence's tag as present in fasta file |
243 |
|
|
* @param i |
244 |
|
|
* @return |
245 |
|
|
*/ |
246 |
|
|
public String getSequenceTag(int i){ |
247 |
|
|
return sequenceTags[i]; |
248 |
|
|
} |
249 |
|
|
|
250 |
|
|
/** |
251 |
duarte |
245 |
* Returns sequence seqNumber with no gaps |
252 |
|
|
* @param seqNumber |
253 |
|
|
* @return |
254 |
|
|
*/ |
255 |
|
|
public String getSequenceNoGaps(int seqNumber){ |
256 |
|
|
String seq = ""; |
257 |
|
|
for (int i=0;i<getSequenceLength();i++){ |
258 |
|
|
char letter = sequences[seqNumber].charAt(i); |
259 |
|
|
if (letter!=GAPCHARACTER){ |
260 |
|
|
seq+=letter; |
261 |
|
|
} |
262 |
|
|
} |
263 |
|
|
return seq; |
264 |
|
|
} |
265 |
|
|
|
266 |
|
|
/** |
267 |
duarte |
244 |
* Given the alignment serial (starting at 0, with gaps), |
268 |
|
|
* returns the sequence serial (starting at 1, no gaps) of sequence seqNumber |
269 |
duarte |
245 |
* Returns -1 if sequence at that position is a gap |
270 |
duarte |
244 |
* @param seqNumber |
271 |
|
|
* @param alignmentSerial |
272 |
|
|
* @return |
273 |
|
|
*/ |
274 |
duarte |
245 |
public int al2seq(int seqNumber, int alignmentSerial){ |
275 |
|
|
return mapAlign2Seq[seqNumber][alignmentSerial]; |
276 |
duarte |
244 |
} |
277 |
|
|
|
278 |
|
|
/** |
279 |
duarte |
245 |
* Given sequence serial of sequence seqNumber, |
280 |
|
|
* returns the alignment serial |
281 |
|
|
* @param seqNumber |
282 |
|
|
* @param seqSerial |
283 |
|
|
* @return |
284 |
|
|
*/ |
285 |
|
|
public int seq2al(int seqNumber, int seqSerial) { |
286 |
|
|
return mapSeq2Align[seqNumber][seqSerial]; |
287 |
|
|
} |
288 |
|
|
|
289 |
|
|
/** |
290 |
duarte |
244 |
* Gets column i of the alignment as a String |
291 |
|
|
* @param i |
292 |
|
|
* @return |
293 |
|
|
*/ |
294 |
|
|
public String getColumn(int i){ |
295 |
|
|
String col=""; |
296 |
|
|
for (String seq:sequences){ |
297 |
|
|
col+=seq.charAt(i); |
298 |
|
|
} |
299 |
|
|
return col; |
300 |
|
|
} |
301 |
|
|
|
302 |
|
|
/** |
303 |
|
|
* Writes alignment by columns in tab delimited format, |
304 |
|
|
* useful to import to MySQL |
305 |
|
|
* |
306 |
|
|
*/ |
307 |
|
|
public void writeTabDelimited(){ |
308 |
|
|
for (int i=0;i<getSequenceLength();i++){ |
309 |
|
|
for (String seq:sequences){ |
310 |
|
|
System.out.print(seq.charAt(i)+"\t"); |
311 |
|
|
} |
312 |
|
|
System.out.print((i+1)+"\t"); |
313 |
|
|
for (int seqNumber=0;seqNumber<getNumberOfSequences();seqNumber++){ |
314 |
duarte |
245 |
int serial = al2seq(seqNumber, i); |
315 |
|
|
if (serial!=-1){ // everything not gaps |
316 |
duarte |
244 |
System.out.print(serial+"\t"); |
317 |
duarte |
245 |
} else { // gaps |
318 |
duarte |
244 |
System.out.print("\\N\t"); |
319 |
|
|
} |
320 |
|
|
} |
321 |
|
|
System.out.println(); |
322 |
|
|
} |
323 |
|
|
} |
324 |
|
|
|
325 |
|
|
/** to test the class */ |
326 |
|
|
public static void main(String[] args) throws FileNotFoundException, IOException { |
327 |
|
|
if (args.length<1){ |
328 |
|
|
System.err.println("Must provide FASTA file name as argument"); |
329 |
|
|
System.exit(1); |
330 |
|
|
} |
331 |
|
|
String fileName=args[0]; |
332 |
|
|
Alignment al = new Alignment(fileName,"FASTA"); |
333 |
|
|
// for (int i=0;i<al.getSequenceLength();i++){ |
334 |
|
|
// System.out.println(al.getColumn(i)); |
335 |
|
|
// } |
336 |
|
|
|
337 |
duarte |
245 |
// for (int i=0;i<al.getNumberOfSequences();i++){ |
338 |
|
|
// System.out.println(al.getSequenceTag(i)); |
339 |
|
|
// System.out.println(al.getSequence(i)); |
340 |
|
|
// } |
341 |
|
|
//for (int i=0;i<al.getSequenceLength();i++) { |
342 |
|
|
//System.out.println("alignment serial: "+i+", seq serial: "+al.getSerialInSequence(0,i)); |
343 |
|
|
//} |
344 |
|
|
for (int serial=1;serial<=al.getSequenceNoGaps(0).length();serial++){ |
345 |
|
|
System.out.println("seq serial: "+serial+", alignment serial: "+al.seq2al(0, serial)); |
346 |
duarte |
244 |
} |
347 |
|
|
//al.writeTabDelimited(); |
348 |
|
|
} |
349 |
|
|
|
350 |
|
|
} |