ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Pdb.java
Revision: 190
Committed: Tue Jun 12 13:38:46 2007 UTC (17 years, 4 months ago) by duarte
File size: 17584 byte(s)
Log Message:
NEW FEATURE: mapping of pdb residue serials and internal serials in the Pdb object

Line File contents
1 package proteinstructure;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileOutputStream;
6 import java.io.FileReader;
7 import java.io.PrintStream;
8 import java.io.FileNotFoundException;
9 import java.io.IOException;
10 import java.util.HashMap;
11 import java.util.TreeMap;
12 import java.util.ArrayList;
13 import java.util.Collections;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16
17 public class Pdb {
18
19 public final static int DEFAULT_MODEL=1;
20
21 HashMap<String,Integer> resser_atom2atomserial;
22 HashMap<Integer,String> resser2restype;
23 HashMap<Integer,Double[]> atomser2coord;
24 HashMap<Integer,Integer> atomser2resser;
25 HashMap<String,Integer> pdbresser2resser; // only used when reading from database
26 HashMap<Integer,String> resser2pdbresser; // only used when reading from database
27
28 HashMap<String,ArrayList<String>> aas2atoms = AA.getaas2atoms();
29 public String sequence="";
30 public String accode="";
31 // given "external" pdb chain code, i.e. the classic pdb code ("NULL" if it is blank in original pdb file)
32 public String chaincode="";
33 public int model=DEFAULT_MODEL;
34 String db;
35
36 // Our internal chain identifier (taken from dbs or pdb):
37 // - in reading from pdbase or from msdsd it will be set to the internal chain id (asym_id field for pdbase, pchain_id for msdsd)
38 // - in reading from pdb file it gets set to whatever parsed from the pdb file (i.e. can be also ' ')
39 public String chain;
40
41 /**
42 * Constructs Pdb object given accession code and pdb chain code. Model will be DEFAULT_MODEL
43 * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
44 * @param accode
45 * @param chaincode
46 * @throws PdbaseInconsistencyError
47 * @throws PdbaseAcCodeNotFoundError
48 * @throws MsdsdAcCodeNotFoundError
49 * @throws MsdsdInconsistentResidueNumbersError
50 */
51 public Pdb (String accode, String chaincode) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
52 this(accode,chaincode,DEFAULT_MODEL,PdbaseInfo.pdbaseDB);
53 }
54
55 /**
56 * Constructs Pdb object given accession code, pdb chain code and a model serial
57 * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
58 * @param accode
59 * @param chaincode
60 * @param model_serial
61 * @throws PdbaseInconsistencyError
62 * @throws PdbaseAcCodeNotFoundError
63 * @throws MsdsdAcCodeNotFoundError
64 * @throws MsdsdInconsistentResidueNumbersError
65 */
66 public Pdb (String accode, String chaincode, int model_serial) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
67 this(accode,chaincode,model_serial,PdbaseInfo.pdbaseDB);
68 }
69
70 /**
71 * Constructs Pdb object given accession code, pdb chain code and a source db for the data. Model will be DEFAULT_MODEL
72 * The db can be pdbase-like or msdsd-like
73 * @param accode
74 * @param chaincode
75 * @param db
76 * @throws PdbaseInconsistencyError
77 * @throws PdbaseAcCodeNotFoundError
78 * @throws MsdsdAcCodeNotFoundError
79 * @throws MsdsdInconsistentResidueNumbersError
80 */
81 public Pdb (String accode, String chaincode, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
82 this(accode,chaincode,DEFAULT_MODEL,db);
83 }
84
85 /**
86 * Constructs Pdb object given accession code, pdb chain code, model serial and a source db for the data.
87 * The db can be pdbase-like or msdsd-like
88 * @param accode
89 * @param chaincode
90 * @param model_serial
91 * @param db
92 * @throws PdbaseInconsistencyError
93 * @throws PdbaseAcCodeNotFoundError
94 * @throws MsdsdAcCodeNotFoundError
95 * @throws MsdsdInconsistentResidueNumbersError
96 */
97 public Pdb (String accode, String chaincode, int model_serial, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
98 this.accode=accode;
99 this.chaincode=chaincode;
100 this.model=model_serial;
101 this.db=db;
102 // we initialise it to chaincode, in read_pdb_data_from_pdbase gets reset to the right internal chain id. If reading msdsd it stays as chaincode
103 this.chain=chaincode;
104 if (db.contains("msdsd")){
105 read_pdb_data_from_msdsd(db);
106 } else if (db.contains("pdbase")){
107 read_pdb_data_from_pdbase(db);
108 }
109 // we initialise resser2pdbresser from the pdbresser2resser HashMap
110 resser2pdbresser = new HashMap<Integer, String>();
111 for (String pdbresser:pdbresser2resser.keySet()){
112 resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser);
113 }
114 }
115
116 /**
117 * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
118 * @param pdbfile
119 * @param chaincode
120 * @param dummy dummy parameter to distinguish the method from other with the same signature
121 * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
122 * @throws FileNotFoundException
123 * @throws IOException
124 */
125 public Pdb (String pdbfile, String chaincode, boolean dummy) throws FileNotFoundException, IOException {
126 this(pdbfile,chaincode,DEFAULT_MODEL,dummy);
127 }
128
129 /**
130 * Constructs Pdb object reading from pdb file given pdb chain code and model serial
131 * @param pdbfile
132 * @param chaincode
133 * @param model_serial
134 * @param dummy dummy parameter to distinguish the method from other with the same signature
135 * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
136 * @throws FileNotFoundException
137 * @throws IOException
138 */
139 public Pdb (String pdbfile, String chaincode, int model_serial, boolean dummy) throws FileNotFoundException, IOException {
140 this.model=model_serial;
141 this.chaincode=chaincode;
142 read_pdb_data_from_file(pdbfile);
143 }
144
145 public void read_pdb_data_from_pdbase(String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError{
146 resser_atom2atomserial = new HashMap<String,Integer>();
147 resser2restype = new HashMap<Integer,String>();
148 atomser2coord = new HashMap<Integer,Double[]>();
149 atomser2resser = new HashMap<Integer,Integer>();
150
151 PdbaseInfo mypdbaseinfo = new PdbaseInfo(accode,chaincode,model,db);
152 ArrayList<ArrayList> resultset = mypdbaseinfo.read_atomData();
153 chain = mypdbaseinfo.chain;
154 sequence = mypdbaseinfo.read_seq();
155 pdbresser2resser = mypdbaseinfo.get_ressers_mapping();
156 mypdbaseinfo.close();
157
158 for (ArrayList result:resultset){
159 int atomserial = (Integer) result.get(0);
160 String atom = (String) result.get(1);
161 String res_type = (String) result.get(2);
162 int res_serial = (Integer) result.get(3);
163 double x = (Double) result.get(4);
164 double y = (Double) result.get(5);
165 double z = (Double) result.get(6);
166 Double[] coords = {x, y, z};
167 ArrayList<String> aalist=AA.aas();
168 if (aalist.contains(res_type)) {
169 atomser2coord.put(atomserial, coords);
170 atomser2resser.put(atomserial, res_serial);
171 resser2restype.put(res_serial, res_type);
172 ArrayList<String> atomlist = aas2atoms.get(res_type);
173 if (atomlist.contains(atom)){
174 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
175 }
176 }
177 }
178 }
179
180 public void read_pdb_data_from_msdsd(String db) throws MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
181 resser_atom2atomserial = new HashMap<String,Integer>();
182 resser2restype = new HashMap<Integer,String>();
183 atomser2coord = new HashMap<Integer,Double[]>();
184 atomser2resser = new HashMap<Integer,Integer>();
185
186 MsdsdInfo mymsdsdinfo = new MsdsdInfo(accode,chaincode,model,db);
187 ArrayList<ArrayList> resultset = mymsdsdinfo.read_atomData();
188 sequence = mymsdsdinfo.read_seq();
189 chain = mymsdsdinfo.chain;
190 pdbresser2resser = mymsdsdinfo.get_ressers_mapping();
191 mymsdsdinfo.close();
192
193 for (ArrayList result:resultset){
194 int atomserial = (Integer) result.get(0);
195 String atom = (String) result.get(1);
196 String res_type = (String) result.get(2);
197 int res_serial = (Integer) result.get(3);
198 double x = (Double) result.get(4);
199 double y = (Double) result.get(5);
200 double z = (Double) result.get(6);
201 Double[] coords = {x, y, z};
202 ArrayList<String> aalist=AA.aas();
203 if (aalist.contains(res_type)) {
204 atomser2coord.put(atomserial, coords);
205 atomser2resser.put(atomserial, res_serial);
206 resser2restype.put(res_serial, res_type);
207 ArrayList<String> atomlist = aas2atoms.get(res_type);
208 if (atomlist.contains(atom)){
209 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
210 }
211 }
212 }
213 }
214
215 /**
216 * To read the pdb data (atom coordinates, residue serials, atom serials) from file
217 * chain gets set to internal identifier: if input chain code NULL then chain will be ' '
218 * accode gets set to the one parsed in HEADER or to 'Unknown' if not found
219 * sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only)
220 * @param pdbfile
221 * @throws FileNotFoundException
222 * @throws IOException
223 */
224 public void read_pdb_data_from_file(String pdbfile) throws FileNotFoundException, IOException{
225 resser_atom2atomserial = new HashMap<String,Integer>();
226 resser2restype = new HashMap<Integer,String>();
227 atomser2coord = new HashMap<Integer,Double[]>();
228 atomser2resser = new HashMap<Integer,Integer>();
229 boolean empty = true; // controls whether we don't find any atom line for given chaincode and model
230 // we set chain to chaincode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
231 chain=chaincode;
232 if (chaincode.equals("NULL")) chain=" ";
233 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
234 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
235 String line;
236 while ((line = fpdb.readLine() ) != null ) {
237 Pattern p = Pattern.compile("^HEADER");
238 Matcher m = p.matcher(line);
239 if (m.find()){
240 Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
241 Matcher mh = ph.matcher(line);
242 if (mh.find()) {
243 accode=mh.group(1).toLowerCase();
244 } else {
245 accode="Unknown";
246 }
247 }
248 p = Pattern.compile("^MODEL\\s+(\\d+)");
249 m = p.matcher(line);
250 if (m.find()){
251 thismodel=Integer.parseInt(m.group(1));
252 }
253 if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
254 p = Pattern.compile("^ATOM");
255 m = p.matcher(line);
256 if (m.find()){
257 // serial atom res_type chain res_ser x y z
258 Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chain+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
259 Matcher ml = pl.matcher(line);
260 if (ml.find()) {
261 empty=false;
262 int atomserial=Integer.parseInt(ml.group(1).trim());
263 String atom = ml.group(2).trim();
264 String res_type = ml.group(3).trim();
265 int res_serial = Integer.parseInt(ml.group(4).trim());
266 double x = Double.parseDouble(ml.group(5).trim());
267 double y = Double.parseDouble(ml.group(6).trim());
268 double z = Double.parseDouble(ml.group(7).trim());
269 Double[] coords = {x, y, z};
270 ArrayList<String> aalist=AA.aas();
271 if (aalist.contains(res_type)) {
272 atomser2coord.put(atomserial, coords);
273 atomser2resser.put(atomserial, res_serial);
274 resser2restype.put(res_serial, res_type);
275 ArrayList<String> atomlist = aas2atoms.get(res_type);
276 if (atomlist.contains(atom)){
277 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
278 }
279 }
280 }
281 }
282 }
283 fpdb.close();
284 if (empty) System.err.println("Couldn't find any atom line for given chaincode: "+chaincode+", model: "+model);
285 // now we read the sequence from the resser2restype HashMap
286 // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
287 ArrayList<Integer> ressers = new ArrayList<Integer>();
288 for (int resser:resser2restype.keySet()) {
289 ressers.add(resser);
290 }
291 Collections.sort(ressers);
292 sequence="";
293 for (int resser:ressers){
294 String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
295 sequence += oneletter;
296 }
297 }
298
299 /**
300 * Dumps coordinate data into a file in pdb format (ATOM lines only)
301 * The chain dumped is the value of the chain field, i.e. our internal chain identifier for Pdb objects
302 * @param outfile
303 * @throws IOException
304 */
305 public void dump2pdbfile(String outfile) throws IOException {
306 TreeMap<Integer,Object[]> lines = new TreeMap<Integer,Object[]>();
307 PrintStream Out = new PrintStream(new FileOutputStream(outfile));
308 Out.println("HEADER Dumped from "+db+". pdb accession code="+accode+", chain='"+chain+"'");
309 for (String resser_atom:resser_atom2atomserial.keySet()){
310 int atomserial = resser_atom2atomserial.get(resser_atom);
311 int res_serial = Integer.parseInt(resser_atom.split("_")[0]);
312 String atom = resser_atom.split("_")[1];
313 String res_type = resser2restype.get(res_serial);
314 Double[] coords = atomser2coord.get(atomserial);
315 Object[] fields = {atomserial, atom, res_type, chain, res_serial, coords[0], coords[1], coords[2]};
316 lines.put(atomserial, fields);
317 }
318 for (int atomserial:lines.keySet()){
319 Out.printf("ATOM %5d %3s %3s %1s%4d %8.3f%8.3f%8.3f\n",lines.get(atomserial));
320 }
321 Out.println("END");
322 Out.close();
323 }
324
325 public void dumpseq(String seqfile) throws IOException {
326 PrintStream Out = new PrintStream(new FileOutputStream(seqfile));
327 Out.println(">"+accode+"_"+chaincode);
328 Out.println(sequence);
329 Out.close();
330 }
331
332 public int get_length(){
333 return resser2restype.size();
334 }
335
336 public HashMap<Integer,Double[]> get_coords_for_ct(String ct) {
337 HashMap<Integer,Double[]> coords = new HashMap<Integer,Double[]>();
338 HashMap<String,String[]> restype2atoms = AA.ct2atoms(ct);
339 for (int resser:resser2restype.keySet()){
340 String[] atoms = restype2atoms.get(resser2restype.get(resser));
341 for (String atom:atoms){
342 if (resser_atom2atomserial.containsKey(resser+"_"+atom)){
343 int atomser = resser_atom2atomserial.get(resser+"_"+atom);
344 Double[] coord = atomser2coord.get(atomser);
345 coords.put(atomser, coord);
346 }
347 else if (atom.equals("O") && resser_atom2atomserial.containsKey(resser+"_"+"OXT")){
348 int atomser = resser_atom2atomserial.get(resser+"_"+"OXT");
349 Double[] coord = atomser2coord.get(atomser);
350 coords.put(atomser, coord);
351 }
352 else {
353 System.err.println("Couldn't find "+atom+" atom for resser="+resser+". Continuing without that atom for this resser.");
354 }
355 }
356 }
357 return coords;
358 }
359
360 public TreeMap<Contact, Double> calculate_dist_matrix(String ct){
361 TreeMap<Contact,Double> dist_matrix = new TreeMap<Contact,Double>();
362 if (!ct.contains("/")){
363 HashMap<Integer,Double[]> coords = get_coords_for_ct(ct);
364 for (int i_atomser:coords.keySet()){
365 for (int j_atomser:coords.keySet()){
366 if (j_atomser>i_atomser) {
367 Contact pair = new Contact(i_atomser,j_atomser);
368 dist_matrix.put(pair, distance(coords.get(i_atomser),coords.get(j_atomser)));
369 }
370 }
371 }
372 } else {
373 String i_ct = ct.split("/")[0];
374 String j_ct = ct.split("/")[1];
375 HashMap<Integer,Double[]> i_coords = get_coords_for_ct(i_ct);
376 HashMap<Integer,Double[]> j_coords = get_coords_for_ct(j_ct);
377 for (int i_atomser:i_coords.keySet()){
378 for (int j_atomser:j_coords.keySet()){
379 if (j_atomser!=i_atomser){
380 Contact pair = new Contact(i_atomser,j_atomser);
381 dist_matrix.put(pair, distance(i_coords.get(i_atomser),j_coords.get(j_atomser)));
382 }
383 }
384 }
385 }
386 return dist_matrix;
387 }
388
389 /**
390 * Get the contacts for given contact type and cutoff for this Pdb object.
391 * Returns a Graph object with the contacts
392 * The graph is always directional, i.e. in crossed cases xx/yy: i corresponds to xx and j to yy
393 * ct here can be single contact type (e.g. Ca, BB) or crossed (e.g. BB/SC or Ca/Cb)
394 * @param ct
395 * @param cutoff
396 * @return
397 */
398 @SuppressWarnings("unchecked")
399 public Graph get_graph(String ct, double cutoff){
400 TreeMap<Contact,Double> dist_matrix = calculate_dist_matrix(ct);
401 ContactList contacts = new ContactList();
402 // we loop here over all indices of dist_matrix,
403 // we took care already that in symmetric cases (i.e. single contact type, not crossed) we have only one side of the matrix and
404 // in asymmetrical cases (i.e. crossed cases) we have both sides of the matrix
405 for (Contact pair:dist_matrix.keySet()){
406 int i_atomser=pair.i;
407 int j_atomser=pair.j;
408 if (dist_matrix.get(pair)<=cutoff){
409 int i_resser = atomser2resser.get(i_atomser);
410 int j_resser = atomser2resser.get(j_atomser);
411 Contact resser_pair = new Contact(i_resser,j_resser);
412 // for multi-atom models (BB, SC, ALL or BB/SC) we need to make sure that we don't have contacts from residue to itself or that we don't have duplicates
413 if (i_resser!=j_resser && (! contacts.contains(resser_pair))){
414 contacts.add(resser_pair);
415 }
416 }
417 }
418 Collections.sort(contacts);
419 TreeMap<Integer,String> nodes = new TreeMap<Integer,String>();
420 for (int resser:resser2restype.keySet()){
421 nodes.put(resser,resser2restype.get(resser));
422 }
423 Graph graph = new Graph (contacts,nodes,sequence,cutoff,ct,accode,chain,chaincode);
424 return graph;
425 }
426
427 public Double distance(Double[] coords1, Double[] coords2){
428 return Math.sqrt(Math.pow((coords1[0]-coords2[0]),2)+Math.pow((coords1[1]-coords2[1]),2)+Math.pow((coords1[2]-coords2[2]),2));
429 }
430
431 public int get_resser_from_pdbresser (String pdbresser){
432 return pdbresser2resser.get(pdbresser);
433 }
434
435 public String get_pdbresser_from_resser (int resser){
436 return resser2pdbresser.get(resser);
437 }
438
439 }