1 |
package proteinstructure; |
2 |
|
3 |
import java.io.BufferedReader; |
4 |
import java.io.File; |
5 |
import java.io.FileOutputStream; |
6 |
import java.io.FileReader; |
7 |
import java.io.PrintStream; |
8 |
import java.io.FileNotFoundException; |
9 |
import java.io.IOException; |
10 |
import java.util.HashMap; |
11 |
import java.util.TreeMap; |
12 |
import java.util.ArrayList; |
13 |
import java.util.Collections; |
14 |
import java.util.regex.Matcher; |
15 |
import java.util.regex.Pattern; |
16 |
|
17 |
public class Pdb { |
18 |
|
19 |
public final static int DEFAULT_MODEL=1; |
20 |
|
21 |
HashMap<String,Integer> resser_atom2atomserial; |
22 |
HashMap<Integer,String> resser2restype; |
23 |
HashMap<Integer,Double[]> atomser2coord; |
24 |
HashMap<Integer,Integer> atomser2resser; |
25 |
HashMap<String,Integer> pdbresser2resser; // only used when reading from database |
26 |
HashMap<Integer,String> resser2pdbresser; // only used when reading from database |
27 |
|
28 |
HashMap<String,ArrayList<String>> aas2atoms = AA.getaas2atoms(); |
29 |
public String sequence=""; |
30 |
public String accode=""; |
31 |
// given "external" pdb chain code, i.e. the classic pdb code ("NULL" if it is blank in original pdb file) |
32 |
public String chaincode=""; |
33 |
public int model=DEFAULT_MODEL; |
34 |
String db; |
35 |
|
36 |
// Our internal chain identifier (taken from dbs or pdb): |
37 |
// - in reading from pdbase or from msdsd it will be set to the internal chain id (asym_id field for pdbase, pchain_id for msdsd) |
38 |
// - in reading from pdb file it gets set to whatever parsed from the pdb file (i.e. can be also ' ') |
39 |
public String chain; |
40 |
|
41 |
/** |
42 |
* Constructs Pdb object given accession code and pdb chain code. Model will be DEFAULT_MODEL |
43 |
* Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB |
44 |
* @param accode |
45 |
* @param chaincode |
46 |
* @throws PdbaseInconsistencyError |
47 |
* @throws PdbaseAcCodeNotFoundError |
48 |
* @throws MsdsdAcCodeNotFoundError |
49 |
* @throws MsdsdInconsistentResidueNumbersError |
50 |
*/ |
51 |
public Pdb (String accode, String chaincode) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError { |
52 |
this(accode,chaincode,DEFAULT_MODEL,PdbaseInfo.pdbaseDB); |
53 |
} |
54 |
|
55 |
/** |
56 |
* Constructs Pdb object given accession code, pdb chain code and a model serial |
57 |
* Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB |
58 |
* @param accode |
59 |
* @param chaincode |
60 |
* @param model_serial |
61 |
* @throws PdbaseInconsistencyError |
62 |
* @throws PdbaseAcCodeNotFoundError |
63 |
* @throws MsdsdAcCodeNotFoundError |
64 |
* @throws MsdsdInconsistentResidueNumbersError |
65 |
*/ |
66 |
public Pdb (String accode, String chaincode, int model_serial) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError { |
67 |
this(accode,chaincode,model_serial,PdbaseInfo.pdbaseDB); |
68 |
} |
69 |
|
70 |
/** |
71 |
* Constructs Pdb object given accession code, pdb chain code and a source db for the data. Model will be DEFAULT_MODEL |
72 |
* The db can be pdbase-like or msdsd-like |
73 |
* @param accode |
74 |
* @param chaincode |
75 |
* @param db |
76 |
* @throws PdbaseInconsistencyError |
77 |
* @throws PdbaseAcCodeNotFoundError |
78 |
* @throws MsdsdAcCodeNotFoundError |
79 |
* @throws MsdsdInconsistentResidueNumbersError |
80 |
*/ |
81 |
public Pdb (String accode, String chaincode, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError { |
82 |
this(accode,chaincode,DEFAULT_MODEL,db); |
83 |
} |
84 |
|
85 |
/** |
86 |
* Constructs Pdb object given accession code, pdb chain code, model serial and a source db for the data. |
87 |
* The db can be pdbase-like or msdsd-like |
88 |
* @param accode |
89 |
* @param chaincode |
90 |
* @param model_serial |
91 |
* @param db |
92 |
* @throws PdbaseInconsistencyError |
93 |
* @throws PdbaseAcCodeNotFoundError |
94 |
* @throws MsdsdAcCodeNotFoundError |
95 |
* @throws MsdsdInconsistentResidueNumbersError |
96 |
*/ |
97 |
public Pdb (String accode, String chaincode, int model_serial, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError { |
98 |
this.accode=accode; |
99 |
this.chaincode=chaincode; |
100 |
this.model=model_serial; |
101 |
this.db=db; |
102 |
// we initialise it to chaincode, in read_pdb_data_from_pdbase gets reset to the right internal chain id. If reading msdsd it stays as chaincode |
103 |
this.chain=chaincode; |
104 |
if (db.contains("msdsd")){ |
105 |
read_pdb_data_from_msdsd(db); |
106 |
} else if (db.contains("pdbase")){ |
107 |
read_pdb_data_from_pdbase(db); |
108 |
} |
109 |
// we initialise resser2pdbresser from the pdbresser2resser HashMap |
110 |
resser2pdbresser = new HashMap<Integer, String>(); |
111 |
for (String pdbresser:pdbresser2resser.keySet()){ |
112 |
resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser); |
113 |
} |
114 |
} |
115 |
|
116 |
/** |
117 |
* Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL |
118 |
* @param pdbfile |
119 |
* @param chaincode |
120 |
* @param dummy dummy parameter to distinguish the method from other with the same signature |
121 |
* TODO dummy parameter is a dirty hack, must solve it in other way: subclassing |
122 |
* @throws FileNotFoundException |
123 |
* @throws IOException |
124 |
*/ |
125 |
public Pdb (String pdbfile, String chaincode, boolean dummy) throws FileNotFoundException, IOException { |
126 |
this(pdbfile,chaincode,DEFAULT_MODEL,dummy); |
127 |
} |
128 |
|
129 |
/** |
130 |
* Constructs Pdb object reading from pdb file given pdb chain code and model serial |
131 |
* @param pdbfile |
132 |
* @param chaincode |
133 |
* @param model_serial |
134 |
* @param dummy dummy parameter to distinguish the method from other with the same signature |
135 |
* TODO dummy parameter is a dirty hack, must solve it in other way: subclassing |
136 |
* @throws FileNotFoundException |
137 |
* @throws IOException |
138 |
*/ |
139 |
public Pdb (String pdbfile, String chaincode, int model_serial, boolean dummy) throws FileNotFoundException, IOException { |
140 |
this.model=model_serial; |
141 |
this.chaincode=chaincode; |
142 |
read_pdb_data_from_file(pdbfile); |
143 |
} |
144 |
|
145 |
public void read_pdb_data_from_pdbase(String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError{ |
146 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
147 |
resser2restype = new HashMap<Integer,String>(); |
148 |
atomser2coord = new HashMap<Integer,Double[]>(); |
149 |
atomser2resser = new HashMap<Integer,Integer>(); |
150 |
|
151 |
PdbaseInfo mypdbaseinfo = new PdbaseInfo(accode,chaincode,model,db); |
152 |
ArrayList<ArrayList> resultset = mypdbaseinfo.read_atomData(); |
153 |
chain = mypdbaseinfo.chain; |
154 |
sequence = mypdbaseinfo.read_seq(); |
155 |
pdbresser2resser = mypdbaseinfo.get_ressers_mapping(); |
156 |
mypdbaseinfo.close(); |
157 |
|
158 |
for (ArrayList result:resultset){ |
159 |
int atomserial = (Integer) result.get(0); |
160 |
String atom = (String) result.get(1); |
161 |
String res_type = (String) result.get(2); |
162 |
int res_serial = (Integer) result.get(3); |
163 |
double x = (Double) result.get(4); |
164 |
double y = (Double) result.get(5); |
165 |
double z = (Double) result.get(6); |
166 |
Double[] coords = {x, y, z}; |
167 |
ArrayList<String> aalist=AA.aas(); |
168 |
if (aalist.contains(res_type)) { |
169 |
atomser2coord.put(atomserial, coords); |
170 |
atomser2resser.put(atomserial, res_serial); |
171 |
resser2restype.put(res_serial, res_type); |
172 |
ArrayList<String> atomlist = aas2atoms.get(res_type); |
173 |
if (atomlist.contains(atom)){ |
174 |
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
175 |
} |
176 |
} |
177 |
} |
178 |
} |
179 |
|
180 |
public void read_pdb_data_from_msdsd(String db) throws MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError { |
181 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
182 |
resser2restype = new HashMap<Integer,String>(); |
183 |
atomser2coord = new HashMap<Integer,Double[]>(); |
184 |
atomser2resser = new HashMap<Integer,Integer>(); |
185 |
|
186 |
MsdsdInfo mymsdsdinfo = new MsdsdInfo(accode,chaincode,model,db); |
187 |
ArrayList<ArrayList> resultset = mymsdsdinfo.read_atomData(); |
188 |
sequence = mymsdsdinfo.read_seq(); |
189 |
chain = mymsdsdinfo.chain; |
190 |
pdbresser2resser = mymsdsdinfo.get_ressers_mapping(); |
191 |
mymsdsdinfo.close(); |
192 |
|
193 |
for (ArrayList result:resultset){ |
194 |
int atomserial = (Integer) result.get(0); |
195 |
String atom = (String) result.get(1); |
196 |
String res_type = (String) result.get(2); |
197 |
int res_serial = (Integer) result.get(3); |
198 |
double x = (Double) result.get(4); |
199 |
double y = (Double) result.get(5); |
200 |
double z = (Double) result.get(6); |
201 |
Double[] coords = {x, y, z}; |
202 |
ArrayList<String> aalist=AA.aas(); |
203 |
if (aalist.contains(res_type)) { |
204 |
atomser2coord.put(atomserial, coords); |
205 |
atomser2resser.put(atomserial, res_serial); |
206 |
resser2restype.put(res_serial, res_type); |
207 |
ArrayList<String> atomlist = aas2atoms.get(res_type); |
208 |
if (atomlist.contains(atom)){ |
209 |
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
210 |
} |
211 |
} |
212 |
} |
213 |
} |
214 |
|
215 |
/** |
216 |
* To read the pdb data (atom coordinates, residue serials, atom serials) from file |
217 |
* chain gets set to internal identifier: if input chain code NULL then chain will be ' ' |
218 |
* accode gets set to the one parsed in HEADER or to 'Unknown' if not found |
219 |
* sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only) |
220 |
* @param pdbfile |
221 |
* @throws FileNotFoundException |
222 |
* @throws IOException |
223 |
*/ |
224 |
public void read_pdb_data_from_file(String pdbfile) throws FileNotFoundException, IOException{ |
225 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
226 |
resser2restype = new HashMap<Integer,String>(); |
227 |
atomser2coord = new HashMap<Integer,Double[]>(); |
228 |
atomser2resser = new HashMap<Integer,Integer>(); |
229 |
boolean empty = true; // controls whether we don't find any atom line for given chaincode and model |
230 |
// we set chain to chaincode except for case NULL where we use " " (NULL is a blank chain code in pdb files) |
231 |
chain=chaincode; |
232 |
if (chaincode.equals("NULL")) chain=" "; |
233 |
int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines |
234 |
BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile))); |
235 |
String line; |
236 |
while ((line = fpdb.readLine() ) != null ) { |
237 |
Pattern p = Pattern.compile("^HEADER"); |
238 |
Matcher m = p.matcher(line); |
239 |
if (m.find()){ |
240 |
Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})"); |
241 |
Matcher mh = ph.matcher(line); |
242 |
if (mh.find()) { |
243 |
accode=mh.group(1).toLowerCase(); |
244 |
} else { |
245 |
accode="Unknown"; |
246 |
} |
247 |
} |
248 |
p = Pattern.compile("^MODEL\\s+(\\d+)"); |
249 |
m = p.matcher(line); |
250 |
if (m.find()){ |
251 |
thismodel=Integer.parseInt(m.group(1)); |
252 |
} |
253 |
if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model |
254 |
p = Pattern.compile("^ATOM"); |
255 |
m = p.matcher(line); |
256 |
if (m.find()){ |
257 |
// serial atom res_type chain res_ser x y z |
258 |
Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chain+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE); |
259 |
Matcher ml = pl.matcher(line); |
260 |
if (ml.find()) { |
261 |
empty=false; |
262 |
int atomserial=Integer.parseInt(ml.group(1).trim()); |
263 |
String atom = ml.group(2).trim(); |
264 |
String res_type = ml.group(3).trim(); |
265 |
int res_serial = Integer.parseInt(ml.group(4).trim()); |
266 |
double x = Double.parseDouble(ml.group(5).trim()); |
267 |
double y = Double.parseDouble(ml.group(6).trim()); |
268 |
double z = Double.parseDouble(ml.group(7).trim()); |
269 |
Double[] coords = {x, y, z}; |
270 |
ArrayList<String> aalist=AA.aas(); |
271 |
if (aalist.contains(res_type)) { |
272 |
atomser2coord.put(atomserial, coords); |
273 |
atomser2resser.put(atomserial, res_serial); |
274 |
resser2restype.put(res_serial, res_type); |
275 |
ArrayList<String> atomlist = aas2atoms.get(res_type); |
276 |
if (atomlist.contains(atom)){ |
277 |
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
278 |
} |
279 |
} |
280 |
} |
281 |
} |
282 |
} |
283 |
fpdb.close(); |
284 |
if (empty) System.err.println("Couldn't find any atom line for given chaincode: "+chaincode+", model: "+model); |
285 |
// now we read the sequence from the resser2restype HashMap |
286 |
// NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here! |
287 |
ArrayList<Integer> ressers = new ArrayList<Integer>(); |
288 |
for (int resser:resser2restype.keySet()) { |
289 |
ressers.add(resser); |
290 |
} |
291 |
Collections.sort(ressers); |
292 |
sequence=""; |
293 |
for (int resser:ressers){ |
294 |
String oneletter = AA.threeletter2oneletter(resser2restype.get(resser)); |
295 |
sequence += oneletter; |
296 |
} |
297 |
} |
298 |
|
299 |
/** |
300 |
* Dumps coordinate data into a file in pdb format (ATOM lines only) |
301 |
* The chain dumped is the value of the chain field, i.e. our internal chain identifier for Pdb objects |
302 |
* @param outfile |
303 |
* @throws IOException |
304 |
*/ |
305 |
public void dump2pdbfile(String outfile) throws IOException { |
306 |
TreeMap<Integer,Object[]> lines = new TreeMap<Integer,Object[]>(); |
307 |
PrintStream Out = new PrintStream(new FileOutputStream(outfile)); |
308 |
Out.println("HEADER Dumped from "+db+". pdb accession code="+accode+", chain='"+chain+"'"); |
309 |
for (String resser_atom:resser_atom2atomserial.keySet()){ |
310 |
int atomserial = resser_atom2atomserial.get(resser_atom); |
311 |
int res_serial = Integer.parseInt(resser_atom.split("_")[0]); |
312 |
String atom = resser_atom.split("_")[1]; |
313 |
String res_type = resser2restype.get(res_serial); |
314 |
Double[] coords = atomser2coord.get(atomserial); |
315 |
Object[] fields = {atomserial, atom, res_type, chain, res_serial, coords[0], coords[1], coords[2]}; |
316 |
lines.put(atomserial, fields); |
317 |
} |
318 |
for (int atomserial:lines.keySet()){ |
319 |
Out.printf("ATOM %5d %3s %3s %1s%4d %8.3f%8.3f%8.3f\n",lines.get(atomserial)); |
320 |
} |
321 |
Out.println("END"); |
322 |
Out.close(); |
323 |
} |
324 |
|
325 |
public void dumpseq(String seqfile) throws IOException { |
326 |
PrintStream Out = new PrintStream(new FileOutputStream(seqfile)); |
327 |
Out.println(">"+accode+"_"+chaincode); |
328 |
Out.println(sequence); |
329 |
Out.close(); |
330 |
} |
331 |
|
332 |
public int get_length(){ |
333 |
return resser2restype.size(); |
334 |
} |
335 |
|
336 |
public HashMap<Integer,Double[]> get_coords_for_ct(String ct) { |
337 |
HashMap<Integer,Double[]> coords = new HashMap<Integer,Double[]>(); |
338 |
HashMap<String,String[]> restype2atoms = AA.ct2atoms(ct); |
339 |
for (int resser:resser2restype.keySet()){ |
340 |
String[] atoms = restype2atoms.get(resser2restype.get(resser)); |
341 |
for (String atom:atoms){ |
342 |
if (resser_atom2atomserial.containsKey(resser+"_"+atom)){ |
343 |
int atomser = resser_atom2atomserial.get(resser+"_"+atom); |
344 |
Double[] coord = atomser2coord.get(atomser); |
345 |
coords.put(atomser, coord); |
346 |
} |
347 |
else if (atom.equals("O") && resser_atom2atomserial.containsKey(resser+"_"+"OXT")){ |
348 |
int atomser = resser_atom2atomserial.get(resser+"_"+"OXT"); |
349 |
Double[] coord = atomser2coord.get(atomser); |
350 |
coords.put(atomser, coord); |
351 |
} |
352 |
else { |
353 |
System.err.println("Couldn't find "+atom+" atom for resser="+resser+". Continuing without that atom for this resser."); |
354 |
} |
355 |
} |
356 |
} |
357 |
return coords; |
358 |
} |
359 |
|
360 |
public TreeMap<Contact, Double> calculate_dist_matrix(String ct){ |
361 |
TreeMap<Contact,Double> dist_matrix = new TreeMap<Contact,Double>(); |
362 |
if (!ct.contains("/")){ |
363 |
HashMap<Integer,Double[]> coords = get_coords_for_ct(ct); |
364 |
for (int i_atomser:coords.keySet()){ |
365 |
for (int j_atomser:coords.keySet()){ |
366 |
if (j_atomser>i_atomser) { |
367 |
Contact pair = new Contact(i_atomser,j_atomser); |
368 |
dist_matrix.put(pair, distance(coords.get(i_atomser),coords.get(j_atomser))); |
369 |
} |
370 |
} |
371 |
} |
372 |
} else { |
373 |
String i_ct = ct.split("/")[0]; |
374 |
String j_ct = ct.split("/")[1]; |
375 |
HashMap<Integer,Double[]> i_coords = get_coords_for_ct(i_ct); |
376 |
HashMap<Integer,Double[]> j_coords = get_coords_for_ct(j_ct); |
377 |
for (int i_atomser:i_coords.keySet()){ |
378 |
for (int j_atomser:j_coords.keySet()){ |
379 |
if (j_atomser!=i_atomser){ |
380 |
Contact pair = new Contact(i_atomser,j_atomser); |
381 |
dist_matrix.put(pair, distance(i_coords.get(i_atomser),j_coords.get(j_atomser))); |
382 |
} |
383 |
} |
384 |
} |
385 |
} |
386 |
return dist_matrix; |
387 |
} |
388 |
|
389 |
/** |
390 |
* Get the contacts for given contact type and cutoff for this Pdb object. |
391 |
* Returns a Graph object with the contacts |
392 |
* The graph is always directional, i.e. in crossed cases xx/yy: i corresponds to xx and j to yy |
393 |
* ct here can be single contact type (e.g. Ca, BB) or crossed (e.g. BB/SC or Ca/Cb) |
394 |
* @param ct |
395 |
* @param cutoff |
396 |
* @return |
397 |
*/ |
398 |
@SuppressWarnings("unchecked") |
399 |
public Graph get_graph(String ct, double cutoff){ |
400 |
TreeMap<Contact,Double> dist_matrix = calculate_dist_matrix(ct); |
401 |
ContactList contacts = new ContactList(); |
402 |
// we loop here over all indices of dist_matrix, |
403 |
// we took care already that in symmetric cases (i.e. single contact type, not crossed) we have only one side of the matrix and |
404 |
// in asymmetrical cases (i.e. crossed cases) we have both sides of the matrix |
405 |
for (Contact pair:dist_matrix.keySet()){ |
406 |
int i_atomser=pair.i; |
407 |
int j_atomser=pair.j; |
408 |
if (dist_matrix.get(pair)<=cutoff){ |
409 |
int i_resser = atomser2resser.get(i_atomser); |
410 |
int j_resser = atomser2resser.get(j_atomser); |
411 |
Contact resser_pair = new Contact(i_resser,j_resser); |
412 |
// for multi-atom models (BB, SC, ALL or BB/SC) we need to make sure that we don't have contacts from residue to itself or that we don't have duplicates |
413 |
if (i_resser!=j_resser && (! contacts.contains(resser_pair))){ |
414 |
contacts.add(resser_pair); |
415 |
} |
416 |
} |
417 |
} |
418 |
Collections.sort(contacts); |
419 |
TreeMap<Integer,String> nodes = new TreeMap<Integer,String>(); |
420 |
for (int resser:resser2restype.keySet()){ |
421 |
nodes.put(resser,resser2restype.get(resser)); |
422 |
} |
423 |
Graph graph = new Graph (contacts,nodes,sequence,cutoff,ct,accode,chain,chaincode); |
424 |
return graph; |
425 |
} |
426 |
|
427 |
public Double distance(Double[] coords1, Double[] coords2){ |
428 |
return Math.sqrt(Math.pow((coords1[0]-coords2[0]),2)+Math.pow((coords1[1]-coords2[1]),2)+Math.pow((coords1[2]-coords2[2]),2)); |
429 |
} |
430 |
|
431 |
public int get_resser_from_pdbresser (String pdbresser){ |
432 |
return pdbresser2resser.get(pdbresser); |
433 |
} |
434 |
|
435 |
public String get_pdbresser_from_resser (int resser){ |
436 |
return resser2pdbresser.get(resser); |
437 |
} |
438 |
|
439 |
} |