ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Pdb.java
Revision: 175
Committed: Fri May 25 17:31:58 2007 UTC (17 years, 4 months ago) by duarte
File size: 16818 byte(s)
Log Message:
FIXED BUG in initialisation of fullLength when reading from file. In case of no sequence and nodes provided, was not getting correctly the maximum value for contacts. Now using the method getMaxContact from new class ContactList
NEW FUNCTIONALITY in Graph:
-New member variable modified
-New methods addEdge, delEdge, restrictContactsToMaxRange, restrictContactsToMinRange, getContacts, getNodes, copy
-Improved slightly the implementation of getEdgeNbh 
FIXED BUG in initialisation of fullLenght when reading from file. In case of no sequence and nodes provided, was not getting correctly the maximum value for contacts. Now using the method getMaxContact from ContactList
New class ContactList
NEW FUNCTIONALITY in Graph:
-New member variable modified
-New methods addEdge, delEdge, restrictContactsToMaxRange, restrictContactsToMinRange, getContacts, getNodes, copy
-Improved slightly the implementation of getEdgeNbh 
New method getRange in Contact
Line File contents
1 package proteinstructure;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileOutputStream;
6 import java.io.FileReader;
7 import java.io.PrintStream;
8 import java.io.FileNotFoundException;
9 import java.io.IOException;
10 import java.util.HashMap;
11 import java.util.TreeMap;
12 import java.util.ArrayList;
13 import java.util.Collections;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16
17 public class Pdb {
18
19 public final static int DEFAULT_MODEL=1;
20
21 HashMap<String,Integer> resser_atom2atomserial;
22 HashMap<Integer,String> resser2restype;
23 HashMap<Integer,Double[]> atomser2coord;
24 HashMap<Integer,Integer> atomser2resser;
25 HashMap<String,ArrayList<String>> aas2atoms = AA.getaas2atoms();
26 public String sequence="";
27 public String accode="";
28 // given "external" pdb chain code, i.e. the classic pdb code ("NULL" if it is blank in original pdb file)
29 public String chaincode="";
30 public int model=DEFAULT_MODEL;
31 String db;
32
33 // Our internal chain identifier (taken from dbs or pdb):
34 // - in reading from pdbase or from msdsd it will be set to the internal chain id (asym_id field for pdbase, pchain_id for msdsd)
35 // - in reading from pdb file it gets set to whatever parsed from the pdb file (i.e. can be also ' ')
36 public String chain;
37
38 /**
39 * Constructs Pdb object given accession code and pdb chain code. Model will be DEFAULT_MODEL
40 * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
41 * @param accode
42 * @param chaincode
43 * @throws PdbaseInconsistencyError
44 * @throws PdbaseAcCodeNotFoundError
45 * @throws MsdsdAcCodeNotFoundError
46 * @throws MsdsdInconsistentResidueNumbersError
47 */
48 public Pdb (String accode, String chaincode) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
49 this(accode,chaincode,DEFAULT_MODEL,PdbaseInfo.pdbaseDB);
50 }
51
52 /**
53 * Constructs Pdb object given accession code, pdb chain code and a model serial
54 * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
55 * @param accode
56 * @param chaincode
57 * @param model_serial
58 * @throws PdbaseInconsistencyError
59 * @throws PdbaseAcCodeNotFoundError
60 * @throws MsdsdAcCodeNotFoundError
61 * @throws MsdsdInconsistentResidueNumbersError
62 */
63 public Pdb (String accode, String chaincode, int model_serial) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
64 this(accode,chaincode,model_serial,PdbaseInfo.pdbaseDB);
65 }
66
67 /**
68 * Constructs Pdb object given accession code, pdb chain code and a source db for the data. Model will be DEFAULT_MODEL
69 * The db can be pdbase-like or msdsd-like
70 * @param accode
71 * @param chaincode
72 * @param db
73 * @throws PdbaseInconsistencyError
74 * @throws PdbaseAcCodeNotFoundError
75 * @throws MsdsdAcCodeNotFoundError
76 * @throws MsdsdInconsistentResidueNumbersError
77 */
78 public Pdb (String accode, String chaincode, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
79 this(accode,chaincode,DEFAULT_MODEL,db);
80 }
81
82 /**
83 * Constructs Pdb object given accession code, pdb chain code, model serial and a source db for the data.
84 * The db can be pdbase-like or msdsd-like
85 * @param accode
86 * @param chaincode
87 * @param model_serial
88 * @param db
89 * @throws PdbaseInconsistencyError
90 * @throws PdbaseAcCodeNotFoundError
91 * @throws MsdsdAcCodeNotFoundError
92 * @throws MsdsdInconsistentResidueNumbersError
93 */
94 public Pdb (String accode, String chaincode, int model_serial, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
95 this.accode=accode;
96 this.chaincode=chaincode;
97 this.model=model_serial;
98 this.db=db;
99 // we initialise it to chaincode, in read_pdb_data_from_pdbase gets reset to the right internal chain id. If reading msdsd it stays as chaincode
100 this.chain=chaincode;
101 if (db.contains("msdsd")){
102 read_pdb_data_from_msdsd(db);
103 } else if (db.contains("pdbase")){
104 read_pdb_data_from_pdbase(db);
105 }
106 }
107
108 /**
109 * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
110 * @param pdbfile
111 * @param chaincode
112 * @param dummy dummy parameter to distinguish the method from other with the same signature
113 * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
114 * @throws FileNotFoundException
115 * @throws IOException
116 */
117 public Pdb (String pdbfile, String chaincode, boolean dummy) throws FileNotFoundException, IOException {
118 this(pdbfile,chaincode,DEFAULT_MODEL,dummy);
119 }
120
121 /**
122 * Constructs Pdb object reading from pdb file given pdb chain code and model serial
123 * @param pdbfile
124 * @param chaincode
125 * @param model_serial
126 * @param dummy dummy parameter to distinguish the method from other with the same signature
127 * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
128 * @throws FileNotFoundException
129 * @throws IOException
130 */
131 public Pdb (String pdbfile, String chaincode, int model_serial, boolean dummy) throws FileNotFoundException, IOException {
132 this.model=model_serial;
133 this.chaincode=chaincode;
134 read_pdb_data_from_file(pdbfile);
135 }
136
137 public void read_pdb_data_from_pdbase(String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError{
138 resser_atom2atomserial = new HashMap<String,Integer>();
139 resser2restype = new HashMap<Integer,String>();
140 atomser2coord = new HashMap<Integer,Double[]>();
141 atomser2resser = new HashMap<Integer,Integer>();
142
143 PdbaseInfo mypdbaseinfo = new PdbaseInfo(accode,chaincode,model,db);
144 ArrayList<ArrayList> resultset = mypdbaseinfo.read_atomData();
145 chain = mypdbaseinfo.chain;
146 sequence = mypdbaseinfo.read_seq();
147 mypdbaseinfo.close();
148
149 for (ArrayList result:resultset){
150 int atomserial = (Integer) result.get(0);
151 String atom = (String) result.get(1);
152 String res_type = (String) result.get(2);
153 int res_serial = (Integer) result.get(3);
154 double x = (Double) result.get(4);
155 double y = (Double) result.get(5);
156 double z = (Double) result.get(6);
157 Double[] coords = {x, y, z};
158 ArrayList<String> aalist=AA.aas();
159 if (aalist.contains(res_type)) {
160 atomser2coord.put(atomserial, coords);
161 atomser2resser.put(atomserial, res_serial);
162 resser2restype.put(res_serial, res_type);
163 ArrayList<String> atomlist = aas2atoms.get(res_type);
164 if (atomlist.contains(atom)){
165 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
166 }
167 }
168 }
169 }
170
171 public void read_pdb_data_from_msdsd(String db) throws MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
172 resser_atom2atomserial = new HashMap<String,Integer>();
173 resser2restype = new HashMap<Integer,String>();
174 atomser2coord = new HashMap<Integer,Double[]>();
175 atomser2resser = new HashMap<Integer,Integer>();
176
177 MsdsdInfo mymsdsdinfo = new MsdsdInfo(accode,chaincode,model,db);
178 ArrayList<ArrayList> resultset = mymsdsdinfo.read_atomData();
179 sequence = mymsdsdinfo.read_seq();
180 chain = mymsdsdinfo.chain;
181 mymsdsdinfo.close();
182
183 for (ArrayList result:resultset){
184 int atomserial = (Integer) result.get(0);
185 String atom = (String) result.get(1);
186 String res_type = (String) result.get(2);
187 int res_serial = (Integer) result.get(3);
188 double x = (Double) result.get(4);
189 double y = (Double) result.get(5);
190 double z = (Double) result.get(6);
191 Double[] coords = {x, y, z};
192 ArrayList<String> aalist=AA.aas();
193 if (aalist.contains(res_type)) {
194 atomser2coord.put(atomserial, coords);
195 atomser2resser.put(atomserial, res_serial);
196 resser2restype.put(res_serial, res_type);
197 ArrayList<String> atomlist = aas2atoms.get(res_type);
198 if (atomlist.contains(atom)){
199 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
200 }
201 }
202 }
203 }
204
205 /**
206 * To read the pdb data (atom coordinates, residue serials, atom serials) from file
207 * chain gets set to internal identifier: if input chain code NULL then chain will be ' '
208 * accode gets set to the one parsed in HEADER or to 'Unknown' if not found
209 * sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only)
210 * @param pdbfile
211 * @throws FileNotFoundException
212 * @throws IOException
213 */
214 public void read_pdb_data_from_file(String pdbfile) throws FileNotFoundException, IOException{
215 resser_atom2atomserial = new HashMap<String,Integer>();
216 resser2restype = new HashMap<Integer,String>();
217 atomser2coord = new HashMap<Integer,Double[]>();
218 atomser2resser = new HashMap<Integer,Integer>();
219 boolean empty = true; // controls whether we don't find any atom line for given chaincode and model
220 // we set chain to chaincode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
221 chain=chaincode;
222 if (chaincode.equals("NULL")) chain=" ";
223 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
224 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
225 String line;
226 while ((line = fpdb.readLine() ) != null ) {
227 Pattern p = Pattern.compile("^HEADER");
228 Matcher m = p.matcher(line);
229 if (m.find()){
230 Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
231 Matcher mh = ph.matcher(line);
232 if (mh.find()) {
233 accode=mh.group(1).toLowerCase();
234 } else {
235 accode="Unknown";
236 }
237 }
238 p = Pattern.compile("^MODEL\\s+(\\d+)");
239 m = p.matcher(line);
240 if (m.find()){
241 thismodel=Integer.parseInt(m.group(1));
242 }
243 if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
244 p = Pattern.compile("^ATOM");
245 m = p.matcher(line);
246 if (m.find()){
247 // serial atom res_type chain res_ser x y z
248 Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chain+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
249 Matcher ml = pl.matcher(line);
250 if (ml.find()) {
251 empty=false;
252 int atomserial=Integer.parseInt(ml.group(1).trim());
253 String atom = ml.group(2).trim();
254 String res_type = ml.group(3).trim();
255 int res_serial = Integer.parseInt(ml.group(4).trim());
256 double x = Double.parseDouble(ml.group(5).trim());
257 double y = Double.parseDouble(ml.group(6).trim());
258 double z = Double.parseDouble(ml.group(7).trim());
259 Double[] coords = {x, y, z};
260 ArrayList<String> aalist=AA.aas();
261 if (aalist.contains(res_type)) {
262 atomser2coord.put(atomserial, coords);
263 atomser2resser.put(atomserial, res_serial);
264 resser2restype.put(res_serial, res_type);
265 ArrayList<String> atomlist = aas2atoms.get(res_type);
266 if (atomlist.contains(atom)){
267 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
268 }
269 }
270 }
271 }
272 }
273 fpdb.close();
274 if (empty) System.err.println("Couldn't find any atom line for given chaincode: "+chaincode+", model: "+model);
275 // now we read the sequence from the resser2restype HashMap
276 // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
277 ArrayList<Integer> ressers = new ArrayList<Integer>();
278 for (int resser:resser2restype.keySet()) {
279 ressers.add(resser);
280 }
281 Collections.sort(ressers);
282 sequence="";
283 for (int resser:ressers){
284 String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
285 sequence += oneletter;
286 }
287 }
288
289 /**
290 * Dumps coordinate data into a file in pdb format (ATOM lines only)
291 * The chain dumped is the value of the chain field, i.e. our internal chain identifier for Pdb objects
292 * @param outfile
293 * @throws IOException
294 */
295 public void dump2pdbfile(String outfile) throws IOException {
296 TreeMap<Integer,Object[]> lines = new TreeMap<Integer,Object[]>();
297 PrintStream Out = new PrintStream(new FileOutputStream(outfile));
298 Out.println("HEADER Dumped from "+db+". pdb accession code="+accode+", chain='"+chain+"'");
299 for (String resser_atom:resser_atom2atomserial.keySet()){
300 int atomserial = resser_atom2atomserial.get(resser_atom);
301 int res_serial = Integer.parseInt(resser_atom.split("_")[0]);
302 String atom = resser_atom.split("_")[1];
303 String res_type = resser2restype.get(res_serial);
304 Double[] coords = atomser2coord.get(atomserial);
305 Object[] fields = {atomserial, atom, res_type, chain, res_serial, coords[0], coords[1], coords[2]};
306 lines.put(atomserial, fields);
307 }
308 for (int atomserial:lines.keySet()){
309 Out.printf("ATOM %5d %3s %3s %1s%4d %8.3f%8.3f%8.3f\n",lines.get(atomserial));
310 }
311 Out.println("END");
312 Out.close();
313 }
314
315 public void dumpseq(String seqfile) throws IOException {
316 PrintStream Out = new PrintStream(new FileOutputStream(seqfile));
317 Out.println(">"+accode+"_"+chaincode);
318 Out.println(sequence);
319 Out.close();
320 }
321
322 public int get_length(){
323 return resser2restype.size();
324 }
325
326 public HashMap<Integer,Double[]> get_coords_for_ct(String ct) {
327 HashMap<Integer,Double[]> coords = new HashMap<Integer,Double[]>();
328 HashMap<String,String[]> restype2atoms = AA.ct2atoms(ct);
329 for (int resser:resser2restype.keySet()){
330 String[] atoms = restype2atoms.get(resser2restype.get(resser));
331 for (String atom:atoms){
332 if (resser_atom2atomserial.containsKey(resser+"_"+atom)){
333 int atomser = resser_atom2atomserial.get(resser+"_"+atom);
334 Double[] coord = atomser2coord.get(atomser);
335 coords.put(atomser, coord);
336 }
337 else if (atom.equals("O") && resser_atom2atomserial.containsKey(resser+"_"+"OXT")){
338 int atomser = resser_atom2atomserial.get(resser+"_"+"OXT");
339 Double[] coord = atomser2coord.get(atomser);
340 coords.put(atomser, coord);
341 }
342 else {
343 System.err.println("Couldn't find "+atom+" atom for resser="+resser+". Continuing without that atom for this resser.");
344 }
345 }
346 }
347 return coords;
348 }
349
350 public TreeMap<Contact, Double> calculate_dist_matrix(String ct){
351 TreeMap<Contact,Double> dist_matrix = new TreeMap<Contact,Double>();
352 if (!ct.contains("/")){
353 HashMap<Integer,Double[]> coords = get_coords_for_ct(ct);
354 for (int i_atomser:coords.keySet()){
355 for (int j_atomser:coords.keySet()){
356 if (j_atomser>i_atomser) {
357 Contact pair = new Contact(i_atomser,j_atomser);
358 dist_matrix.put(pair, distance(coords.get(i_atomser),coords.get(j_atomser)));
359 }
360 }
361 }
362 } else {
363 String i_ct = ct.split("/")[0];
364 String j_ct = ct.split("/")[1];
365 HashMap<Integer,Double[]> i_coords = get_coords_for_ct(i_ct);
366 HashMap<Integer,Double[]> j_coords = get_coords_for_ct(j_ct);
367 for (int i_atomser:i_coords.keySet()){
368 for (int j_atomser:j_coords.keySet()){
369 if (j_atomser!=i_atomser){
370 Contact pair = new Contact(i_atomser,j_atomser);
371 dist_matrix.put(pair, distance(i_coords.get(i_atomser),j_coords.get(j_atomser)));
372 }
373 }
374 }
375 }
376 return dist_matrix;
377 }
378
379 /**
380 * Get the contacts for given contact type and cutoff for this Pdb object.
381 * Returns a Graph object with the contacts
382 * The graph is always directional, i.e. in crossed cases xx/yy: i corresponds to xx and j to yy
383 * ct here can be single contact type (e.g. Ca, BB) or crossed (e.g. BB/SC or Ca/Cb)
384 * @param ct
385 * @param cutoff
386 * @return
387 */
388 public Graph get_graph(String ct, double cutoff){
389 TreeMap<Contact,Double> dist_matrix = calculate_dist_matrix(ct);
390 ContactList contacts = new ContactList();
391 // we loop here over all indices of dist_matrix,
392 // we took care already that in symmetric cases (i.e. single contact type, not crossed) we have only one side of the matrix and
393 // in asymmetrical cases (i.e. crossed cases) we have both sides of the matrix
394 for (Contact pair:dist_matrix.keySet()){
395 int i_atomser=pair.i;
396 int j_atomser=pair.j;
397 if (dist_matrix.get(pair)<=cutoff){
398 int i_resser = atomser2resser.get(i_atomser);
399 int j_resser = atomser2resser.get(j_atomser);
400 Contact resser_pair = new Contact(i_resser,j_resser);
401 // for multi-atom models (BB, SC, ALL or BB/SC) we need to make sure that we don't have contacts from residue to itself or that we don't have duplicates
402 if (i_resser!=j_resser && (! contacts.contains(resser_pair))){
403 contacts.add(resser_pair);
404 }
405 }
406 }
407 Collections.sort(contacts);
408 TreeMap<Integer,String> nodes = new TreeMap<Integer,String>();
409 for (int resser:resser2restype.keySet()){
410 nodes.put(resser,resser2restype.get(resser));
411 }
412 Graph graph = new Graph (contacts,nodes,sequence,cutoff,ct,accode,chain,chaincode);
413 return graph;
414 }
415
416 public Double distance(Double[] coords1, Double[] coords2){
417 return Math.sqrt(Math.pow((coords1[0]-coords2[0]),2)+Math.pow((coords1[1]-coords2[1]),2)+Math.pow((coords1[2]-coords2[2]),2));
418 }
419 }