ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Pdb.java
Revision: 159
Committed: Thu May 24 09:37:25 2007 UTC (17 years, 5 months ago) by duarte
File size: 16832 byte(s)
Log Message:
Moved ContactMap and EdgeState out of this repository. 
Extra functionality of ContactMap now in Graph:
- new member variables fullLength, obsLength, numContacts. New method getIntMatrix
- removed getCM method
Line File contents
1 package proteinstructure;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileOutputStream;
6 import java.io.FileReader;
7 import java.io.PrintStream;
8 import java.io.FileNotFoundException;
9 import java.io.IOException;
10 import java.util.HashMap;
11 import java.util.TreeMap;
12 import java.util.ArrayList;
13 import java.util.Collections;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16
17 public class Pdb {
18
19 public final static int DEFAULT_MODEL=1;
20
21 HashMap<String,Integer> resser_atom2atomserial;
22 HashMap<Integer,String> resser2restype;
23 HashMap<Integer,Double[]> atomser2coord;
24 HashMap<Integer,Integer> atomser2resser;
25 HashMap<String,ArrayList<String>> aas2atoms = AA.getaas2atoms();
26 public String sequence="";
27 public String accode="";
28 // given "external" pdb chain code, i.e. the classic pdb code ("NULL" if it is blank in original pdb file)
29 public String chaincode="";
30 public int model=DEFAULT_MODEL;
31 String db;
32
33 // Our internal chain identifier (taken from dbs or pdb):
34 // - in reading from pdbase or from msdsd it will be set to the internal chain id (asym_id field for pdbase, pchain_id for msdsd)
35 // - in reading from pdb file it gets set to whatever parsed from the pdb file (i.e. can be also ' ')
36 public String chain;
37
38 /**
39 * Constructs Pdb object given accession code and pdb chain code. Model will be DEFAULT_MODEL
40 * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
41 * @param accode
42 * @param chaincode
43 * @throws PdbaseInconsistencyError
44 * @throws PdbaseAcCodeNotFoundError
45 * @throws MsdsdAcCodeNotFoundError
46 * @throws MsdsdInconsistentResidueNumbersError
47 */
48 public Pdb (String accode, String chaincode) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
49 this(accode,chaincode,DEFAULT_MODEL,PdbaseInfo.pdbaseDB);
50 }
51
52 /**
53 * Constructs Pdb object given accession code, pdb chain code and a model serial
54 * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
55 * @param accode
56 * @param chaincode
57 * @param model_serial
58 * @throws PdbaseInconsistencyError
59 * @throws PdbaseAcCodeNotFoundError
60 * @throws MsdsdAcCodeNotFoundError
61 * @throws MsdsdInconsistentResidueNumbersError
62 */
63 public Pdb (String accode, String chaincode, int model_serial) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
64 this(accode,chaincode,model_serial,PdbaseInfo.pdbaseDB);
65 }
66
67 /**
68 * Constructs Pdb object given accession code, pdb chain code and a source db for the data. Model will be DEFAULT_MODEL
69 * The db can be pdbase-like or msdsd-like
70 * @param accode
71 * @param chaincode
72 * @param db
73 * @throws PdbaseInconsistencyError
74 * @throws PdbaseAcCodeNotFoundError
75 * @throws MsdsdAcCodeNotFoundError
76 * @throws MsdsdInconsistentResidueNumbersError
77 */
78 public Pdb (String accode, String chaincode, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
79 this(accode,chaincode,DEFAULT_MODEL,db);
80 }
81
82 /**
83 * Constructs Pdb object given accession code, pdb chain code, model serial and a source db for the data.
84 * The db can be pdbase-like or msdsd-like
85 * @param accode
86 * @param chaincode
87 * @param model_serial
88 * @param db
89 * @throws PdbaseInconsistencyError
90 * @throws PdbaseAcCodeNotFoundError
91 * @throws MsdsdAcCodeNotFoundError
92 * @throws MsdsdInconsistentResidueNumbersError
93 */
94 public Pdb (String accode, String chaincode, int model_serial, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
95 this.accode=accode;
96 this.chaincode=chaincode;
97 this.model=model_serial;
98 this.db=db;
99 // we initialise it to chaincode, in read_pdb_data_from_pdbase gets reset to the right internal chain id. If reading msdsd it stays as chaincode
100 this.chain=chaincode;
101 if (db.contains("msdsd")){
102 read_pdb_data_from_msdsd(db);
103 } else if (db.contains("pdbase")){
104 read_pdb_data_from_pdbase(db);
105 }
106 }
107
108 /**
109 * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
110 * @param pdbfile
111 * @param chaincode
112 * @param dummy dummy parameter to distinguish the method from other with the same signature
113 * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
114 * @throws FileNotFoundException
115 * @throws IOException
116 */
117 public Pdb (String pdbfile, String chaincode, boolean dummy) throws FileNotFoundException, IOException {
118 this(pdbfile,chaincode,DEFAULT_MODEL,dummy);
119 }
120
121 /**
122 * Constructs Pdb object reading from pdb file given pdb chain code and model serial
123 * @param pdbfile
124 * @param chaincode
125 * @param model_serial
126 * @param dummy dummy parameter to distinguish the method from other with the same signature
127 * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
128 * @throws FileNotFoundException
129 * @throws IOException
130 */
131 public Pdb (String pdbfile, String chaincode, int model_serial, boolean dummy) throws FileNotFoundException, IOException {
132 this.model=model_serial;
133 this.chaincode=chaincode;
134 read_pdb_data_from_file(pdbfile);
135 }
136
137 public void read_pdb_data_from_pdbase(String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError{
138 resser_atom2atomserial = new HashMap<String,Integer>();
139 resser2restype = new HashMap<Integer,String>();
140 atomser2coord = new HashMap<Integer,Double[]>();
141 atomser2resser = new HashMap<Integer,Integer>();
142
143 PdbaseInfo mypdbaseinfo = new PdbaseInfo(accode,chaincode,model,db);
144 ArrayList<ArrayList> resultset = mypdbaseinfo.read_atomData();
145 chain = mypdbaseinfo.chain;
146 sequence = mypdbaseinfo.read_seq();
147 mypdbaseinfo.close();
148
149 for (ArrayList result:resultset){
150 int atomserial = (Integer) result.get(0);
151 String atom = (String) result.get(1);
152 String res_type = (String) result.get(2);
153 int res_serial = (Integer) result.get(3);
154 double x = (Double) result.get(4);
155 double y = (Double) result.get(5);
156 double z = (Double) result.get(6);
157 Double[] coords = {x, y, z};
158 ArrayList<String> aalist=AA.aas();
159 if (aalist.contains(res_type)) {
160 atomser2coord.put(atomserial, coords);
161 atomser2resser.put(atomserial, res_serial);
162 resser2restype.put(res_serial, res_type);
163 ArrayList<String> atomlist = aas2atoms.get(res_type);
164 if (atomlist.contains(atom)){
165 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
166 }
167 }
168 }
169 }
170
171 public void read_pdb_data_from_msdsd(String db) throws MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
172 resser_atom2atomserial = new HashMap<String,Integer>();
173 resser2restype = new HashMap<Integer,String>();
174 atomser2coord = new HashMap<Integer,Double[]>();
175 atomser2resser = new HashMap<Integer,Integer>();
176
177 MsdsdInfo mymsdsdinfo = new MsdsdInfo(accode,chaincode,model,db);
178 ArrayList<ArrayList> resultset = mymsdsdinfo.read_atomData();
179 sequence = mymsdsdinfo.read_seq();
180 chain = mymsdsdinfo.chain;
181 mymsdsdinfo.close();
182
183 for (ArrayList result:resultset){
184 int atomserial = (Integer) result.get(0);
185 String atom = (String) result.get(1);
186 String res_type = (String) result.get(2);
187 int res_serial = (Integer) result.get(3);
188 double x = (Double) result.get(4);
189 double y = (Double) result.get(5);
190 double z = (Double) result.get(6);
191 Double[] coords = {x, y, z};
192 ArrayList<String> aalist=AA.aas();
193 if (aalist.contains(res_type)) {
194 atomser2coord.put(atomserial, coords);
195 atomser2resser.put(atomserial, res_serial);
196 resser2restype.put(res_serial, res_type);
197 ArrayList<String> atomlist = aas2atoms.get(res_type);
198 if (atomlist.contains(atom)){
199 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
200 }
201 }
202 }
203 }
204
205 /**
206 * To read the pdb data (atom coordinates, residue serials, atom serials) from file
207 * chain gets set to internal identifier: if input chain code NULL then chain will be ' '
208 * accode gets set to the one parsed in HEADER or to 'Unknown' if not found
209 * sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only)
210 * @param pdbfile
211 * @throws FileNotFoundException
212 * @throws IOException
213 */
214 public void read_pdb_data_from_file(String pdbfile) throws FileNotFoundException, IOException{
215 resser_atom2atomserial = new HashMap<String,Integer>();
216 resser2restype = new HashMap<Integer,String>();
217 atomser2coord = new HashMap<Integer,Double[]>();
218 atomser2resser = new HashMap<Integer,Integer>();
219 boolean empty = true; // controls whether we don't find any atom line for given chaincode and model
220 // we set chain to chaincode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
221 chain=chaincode;
222 if (chaincode.equals("NULL")) chain=" ";
223 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
224 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
225 String line;
226 while ((line = fpdb.readLine() ) != null ) {
227 Pattern p = Pattern.compile("^HEADER");
228 Matcher m = p.matcher(line);
229 if (m.find()){
230 Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
231 Matcher mh = ph.matcher(line);
232 if (mh.find()) {
233 accode=mh.group(1).toLowerCase();
234 } else {
235 accode="Unknown";
236 }
237 }
238 p = Pattern.compile("^MODEL\\s+(\\d+)");
239 m = p.matcher(line);
240 if (m.find()){
241 thismodel=Integer.parseInt(m.group(1));
242 }
243 if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
244 p = Pattern.compile("^ATOM");
245 m = p.matcher(line);
246 if (m.find()){
247 // serial atom res_type chain res_ser x y z
248 Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chain+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
249 Matcher ml = pl.matcher(line);
250 if (ml.find()) {
251 empty=false;
252 int atomserial=Integer.parseInt(ml.group(1).trim());
253 String atom = ml.group(2).trim();
254 String res_type = ml.group(3).trim();
255 int res_serial = Integer.parseInt(ml.group(4).trim());
256 double x = Double.parseDouble(ml.group(5).trim());
257 double y = Double.parseDouble(ml.group(6).trim());
258 double z = Double.parseDouble(ml.group(7).trim());
259 Double[] coords = {x, y, z};
260 ArrayList<String> aalist=AA.aas();
261 if (aalist.contains(res_type)) {
262 atomser2coord.put(atomserial, coords);
263 atomser2resser.put(atomserial, res_serial);
264 resser2restype.put(res_serial, res_type);
265 ArrayList<String> atomlist = aas2atoms.get(res_type);
266 if (atomlist.contains(atom)){
267 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
268 }
269 }
270 }
271 }
272 }
273 fpdb.close();
274 if (empty) System.err.println("Couldn't find any atom line for given chaincode: "+chaincode+", model: "+model);
275 // now we read the sequence from the resser2restype HashMap
276 // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
277 ArrayList<Integer> ressers = new ArrayList<Integer>();
278 for (int resser:resser2restype.keySet()) {
279 ressers.add(resser);
280 }
281 Collections.sort(ressers);
282 sequence="";
283 for (int resser:ressers){
284 String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
285 sequence += oneletter;
286 }
287 }
288
289 /**
290 * Dumps coordinate data into a file in pdb format (ATOM lines only)
291 * The chain dumped is the value of the chain field, i.e. our internal chain identifier for Pdb objects
292 * @param outfile
293 * @throws IOException
294 */
295 public void dump2pdbfile(String outfile) throws IOException {
296 TreeMap<Integer,Object[]> lines = new TreeMap<Integer,Object[]>();
297 PrintStream Out = new PrintStream(new FileOutputStream(outfile));
298 Out.println("HEADER Dumped from "+db+". pdb accession code="+accode+", chain='"+chain+"'");
299 for (String resser_atom:resser_atom2atomserial.keySet()){
300 int atomserial = resser_atom2atomserial.get(resser_atom);
301 int res_serial = Integer.parseInt(resser_atom.split("_")[0]);
302 String atom = resser_atom.split("_")[1];
303 String res_type = resser2restype.get(res_serial);
304 Double[] coords = atomser2coord.get(atomserial);
305 Object[] fields = {atomserial, atom, res_type, chain, res_serial, coords[0], coords[1], coords[2]};
306 lines.put(atomserial, fields);
307 }
308 for (int atomserial:lines.keySet()){
309 Out.printf("ATOM %5d %3s %3s %1s%4d %8.3f%8.3f%8.3f\n",lines.get(atomserial));
310 }
311 Out.println("END");
312 Out.close();
313 }
314
315 public void dumpseq(String seqfile) throws IOException {
316 PrintStream Out = new PrintStream(new FileOutputStream(seqfile));
317 Out.println(">"+accode+"_"+chaincode);
318 Out.println(sequence);
319 Out.close();
320 }
321
322 public int get_length(){
323 return resser2restype.size();
324 }
325
326 public HashMap<Integer,Double[]> get_coords_for_ct(String ct) {
327 HashMap<Integer,Double[]> coords = new HashMap<Integer,Double[]>();
328 HashMap<String,String[]> restype2atoms = AA.ct2atoms(ct);
329 for (int resser:resser2restype.keySet()){
330 String[] atoms = restype2atoms.get(resser2restype.get(resser));
331 for (String atom:atoms){
332 if (resser_atom2atomserial.containsKey(resser+"_"+atom)){
333 int atomser = resser_atom2atomserial.get(resser+"_"+atom);
334 Double[] coord = atomser2coord.get(atomser);
335 coords.put(atomser, coord);
336 }
337 else if (atom.equals("O") && resser_atom2atomserial.containsKey(resser+"_"+"OXT")){
338 int atomser = resser_atom2atomserial.get(resser+"_"+"OXT");
339 Double[] coord = atomser2coord.get(atomser);
340 coords.put(atomser, coord);
341 }
342 else {
343 System.err.println("Couldn't find "+atom+" atom for resser="+resser+". Continuing without that atom for this resser.");
344 }
345 }
346 }
347 return coords;
348 }
349
350 public TreeMap<Contact, Double> calculate_dist_matrix(String ct){
351 TreeMap<Contact,Double> dist_matrix = new TreeMap<Contact,Double>();
352 if (!ct.contains("/")){
353 HashMap<Integer,Double[]> coords = get_coords_for_ct(ct);
354 for (int i_atomser:coords.keySet()){
355 for (int j_atomser:coords.keySet()){
356 if (j_atomser>i_atomser) {
357 Contact pair = new Contact(i_atomser,j_atomser);
358 dist_matrix.put(pair, distance(coords.get(i_atomser),coords.get(j_atomser)));
359 }
360 }
361 }
362 } else {
363 String i_ct = ct.split("/")[0];
364 String j_ct = ct.split("/")[1];
365 HashMap<Integer,Double[]> i_coords = get_coords_for_ct(i_ct);
366 HashMap<Integer,Double[]> j_coords = get_coords_for_ct(j_ct);
367 for (int i_atomser:i_coords.keySet()){
368 for (int j_atomser:j_coords.keySet()){
369 if (j_atomser!=i_atomser){
370 Contact pair = new Contact(i_atomser,j_atomser);
371 dist_matrix.put(pair, distance(i_coords.get(i_atomser),j_coords.get(j_atomser)));
372 }
373 }
374 }
375 }
376 return dist_matrix;
377 }
378
379 /**
380 * Get the contacts for given contact type and cutoff for this Pdb object.
381 * Returns a Graph object with the contacts
382 * The graph is always directional, i.e. in crossed cases xx/yy: i corresponds to xx and j to yy
383 * ct here can be single contact type (e.g. Ca, BB) or crossed (e.g. BB/SC or Ca/Cb)
384 * @param ct
385 * @param cutoff
386 * @return
387 */
388 public Graph get_graph(String ct, double cutoff){
389 TreeMap<Contact,Double> dist_matrix = calculate_dist_matrix(ct);
390 ArrayList<Contact> contacts = new ArrayList<Contact>();
391 // we loop here over all indices of dist_matrix,
392 // we took care already that in symmetric cases (i.e. single contact type, not crossed) we have only one side of the matrix and
393 // in asymmetrical cases (i.e. crossed cases) we have both sides of the matrix
394 for (Contact pair:dist_matrix.keySet()){
395 int i_atomser=pair.i;
396 int j_atomser=pair.j;
397 if (dist_matrix.get(pair)<=cutoff){
398 int i_resser = atomser2resser.get(i_atomser);
399 int j_resser = atomser2resser.get(j_atomser);
400 Contact resser_pair = new Contact(i_resser,j_resser);
401 // for multi-atom models (BB, SC, ALL or BB/SC) we need to make sure that we don't have contacts from residue to itself or that we don't have duplicates
402 if (i_resser!=j_resser && (! contacts.contains(resser_pair))){
403 contacts.add(resser_pair);
404 }
405 }
406 }
407 Collections.sort(contacts);
408 TreeMap<Integer,String> nodes = new TreeMap<Integer,String>();
409 for (int resser:resser2restype.keySet()){
410 nodes.put(resser,resser2restype.get(resser));
411 }
412 Graph graph = new Graph (contacts,nodes,sequence,cutoff,ct,accode,chain,chaincode);
413 return graph;
414 }
415
416 public Double distance(Double[] coords1, Double[] coords2){
417 return Math.sqrt(Math.pow((coords1[0]-coords2[0]),2)+Math.pow((coords1[1]-coords2[1]),2)+Math.pow((coords1[2]-coords2[2]),2));
418 }
419 }