ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Pdb.java
Revision: 153
Committed: Wed May 16 17:22:08 2007 UTC (17 years, 4 months ago) by duarte
File size: 16991 byte(s)
Log Message:
NEW FUNCTIONALITY: Reading from pdb file given chain code or model serial fully implemented
Changed the chain that is dumped in dump2pdbfile directly to the Pdb.chain field i.e. internal chain identifier
Loads of new comments, including java doc for all constructors
Line User Rev File contents
1 duarte 123 package proteinstructure;
2    
3 duarte 127 import java.io.BufferedReader;
4     import java.io.File;
5 duarte 123 import java.io.FileOutputStream;
6 duarte 127 import java.io.FileReader;
7 duarte 123 import java.io.PrintStream;
8 duarte 127 import java.io.FileNotFoundException;
9 duarte 123 import java.io.IOException;
10     import java.util.HashMap;
11     import java.util.TreeMap;
12     import java.util.ArrayList;
13     import java.util.Collections;
14 duarte 127 import java.util.regex.Matcher;
15     import java.util.regex.Pattern;
16 duarte 123
17     public class Pdb {
18    
19 duarte 153 public final static int DEFAULT_MODEL=1;
20    
21 duarte 123 HashMap<String,Integer> resser_atom2atomserial;
22     HashMap<Integer,String> resser2restype;
23     HashMap<Integer,Double[]> atomser2coord;
24     HashMap<Integer,Integer> atomser2resser;
25     HashMap<String,ArrayList<String>> aas2atoms = AA.getaas2atoms();
26 duarte 145 public String sequence="";
27 stehr 140 public String accode="";
28 duarte 135 // given "external" pdb chain code, i.e. the classic pdb code ("NULL" if it is blank in original pdb file)
29 duarte 145 public String chaincode="";
30     public int model=DEFAULT_MODEL;
31 duarte 123 String db;
32 duarte 135
33     // Our internal chain identifier (taken from dbs or pdb):
34     // - in reading from pdbase or from msdsd it will be set to the internal chain id (asym_id field for pdbase, pchain_id for msdsd)
35 duarte 153 // - in reading from pdb file it gets set to whatever parsed from the pdb file (i.e. can be also ' ')
36 stehr 137 public String chain;
37 duarte 123
38 duarte 153 /**
39     * Constructs Pdb object given accession code and pdb chain code. Model will be DEFAULT_MODEL
40     * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
41     * @param accode
42     * @param chaincode
43     * @throws PdbaseInconsistencyError
44     * @throws PdbaseAcCodeNotFoundError
45     * @throws MsdsdAcCodeNotFoundError
46     * @throws MsdsdInconsistentResidueNumbersError
47     */
48 duarte 130 public Pdb (String accode, String chaincode) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
49 duarte 133 this(accode,chaincode,DEFAULT_MODEL,PdbaseInfo.pdbaseDB);
50 duarte 123 }
51 duarte 133
52 duarte 153 /**
53     * Constructs Pdb object given accession code, pdb chain code and a model serial
54     * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
55     * @param accode
56     * @param chaincode
57     * @param model_serial
58     * @throws PdbaseInconsistencyError
59     * @throws PdbaseAcCodeNotFoundError
60     * @throws MsdsdAcCodeNotFoundError
61     * @throws MsdsdInconsistentResidueNumbersError
62     */
63 duarte 133 public Pdb (String accode, String chaincode, int model_serial) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
64     this(accode,chaincode,model_serial,PdbaseInfo.pdbaseDB);
65     }
66 duarte 153
67     /**
68     * Constructs Pdb object given accession code, pdb chain code and a source db for the data. Model will be DEFAULT_MODEL
69     * The db can be pdbase-like or msdsd-like
70     * @param accode
71     * @param chaincode
72     * @param db
73     * @throws PdbaseInconsistencyError
74     * @throws PdbaseAcCodeNotFoundError
75     * @throws MsdsdAcCodeNotFoundError
76     * @throws MsdsdInconsistentResidueNumbersError
77     */
78 duarte 130 public Pdb (String accode, String chaincode, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
79 duarte 133 this(accode,chaincode,DEFAULT_MODEL,db);
80     }
81    
82 duarte 153 /**
83     * Constructs Pdb object given accession code, pdb chain code, model serial and a source db for the data.
84     * The db can be pdbase-like or msdsd-like
85     * @param accode
86     * @param chaincode
87     * @param model_serial
88     * @param db
89     * @throws PdbaseInconsistencyError
90     * @throws PdbaseAcCodeNotFoundError
91     * @throws MsdsdAcCodeNotFoundError
92     * @throws MsdsdInconsistentResidueNumbersError
93     */
94 duarte 133 public Pdb (String accode, String chaincode, int model_serial, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
95 duarte 123 this.accode=accode;
96     this.chaincode=chaincode;
97 duarte 133 this.model=model_serial;
98 duarte 123 this.db=db;
99 duarte 130 // we initialise it to chaincode, in read_pdb_data_from_pdbase gets reset to the right internal chain id. If reading msdsd it stays as chaincode
100     this.chain=chaincode;
101     if (db.contains("msdsd")){
102     read_pdb_data_from_msdsd(db);
103     } else if (db.contains("pdbase")){
104     read_pdb_data_from_pdbase(db);
105     }
106 duarte 123 }
107    
108 duarte 153 /**
109     * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
110     * @param pdbfile
111     * @param chaincode
112     * @param dummy dummy parameter to distinguish the method from other with the same signature
113     * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
114     * @throws FileNotFoundException
115     * @throws IOException
116     */
117     public Pdb (String pdbfile, String chaincode, boolean dummy) throws FileNotFoundException, IOException {
118     this(pdbfile,chaincode,DEFAULT_MODEL,dummy);
119 duarte 133 }
120 duarte 153
121     /**
122     * Constructs Pdb object reading from pdb file given pdb chain code and model serial
123     * @param pdbfile
124     * @param chaincode
125     * @param model_serial
126     * @param dummy dummy parameter to distinguish the method from other with the same signature
127     * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
128     * @throws FileNotFoundException
129     * @throws IOException
130     */
131     public Pdb (String pdbfile, String chaincode, int model_serial, boolean dummy) throws FileNotFoundException, IOException {
132 duarte 133 this.model=model_serial;
133 duarte 153 this.chaincode=chaincode;
134 duarte 127 read_pdb_data_from_file(pdbfile);
135 duarte 123 }
136 duarte 133
137 duarte 124 public void read_pdb_data_from_pdbase(String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError{
138 duarte 123 resser_atom2atomserial = new HashMap<String,Integer>();
139     resser2restype = new HashMap<Integer,String>();
140     atomser2coord = new HashMap<Integer,Double[]>();
141     atomser2resser = new HashMap<Integer,Integer>();
142    
143 duarte 133 PdbaseInfo mypdbaseinfo = new PdbaseInfo(accode,chaincode,model,db);
144 duarte 123 ArrayList<ArrayList> resultset = mypdbaseinfo.read_atomData();
145 duarte 135 chain = mypdbaseinfo.chain;
146 duarte 123 sequence = mypdbaseinfo.read_seq();
147     mypdbaseinfo.close();
148    
149     for (ArrayList result:resultset){
150     int atomserial = (Integer) result.get(0);
151     String atom = (String) result.get(1);
152     String res_type = (String) result.get(2);
153 duarte 135 int res_serial = (Integer) result.get(3);
154     double x = (Double) result.get(4);
155     double y = (Double) result.get(5);
156     double z = (Double) result.get(6);
157 duarte 123 Double[] coords = {x, y, z};
158     ArrayList<String> aalist=AA.aas();
159     if (aalist.contains(res_type)) {
160     atomser2coord.put(atomserial, coords);
161     atomser2resser.put(atomserial, res_serial);
162     resser2restype.put(res_serial, res_type);
163     ArrayList<String> atomlist = aas2atoms.get(res_type);
164     if (atomlist.contains(atom)){
165     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
166     }
167     }
168     }
169     }
170    
171 duarte 130 public void read_pdb_data_from_msdsd(String db) throws MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
172     resser_atom2atomserial = new HashMap<String,Integer>();
173     resser2restype = new HashMap<Integer,String>();
174     atomser2coord = new HashMap<Integer,Double[]>();
175     atomser2resser = new HashMap<Integer,Integer>();
176    
177 duarte 133 MsdsdInfo mymsdsdinfo = new MsdsdInfo(accode,chaincode,model,db);
178 duarte 130 ArrayList<ArrayList> resultset = mymsdsdinfo.read_atomData();
179     sequence = mymsdsdinfo.read_seq();
180 duarte 138 chain = mymsdsdinfo.chain;
181 duarte 130 mymsdsdinfo.close();
182    
183     for (ArrayList result:resultset){
184     int atomserial = (Integer) result.get(0);
185     String atom = (String) result.get(1);
186     String res_type = (String) result.get(2);
187     int res_serial = (Integer) result.get(3);
188     double x = (Double) result.get(4);
189     double y = (Double) result.get(5);
190     double z = (Double) result.get(6);
191     Double[] coords = {x, y, z};
192     ArrayList<String> aalist=AA.aas();
193     if (aalist.contains(res_type)) {
194     atomser2coord.put(atomserial, coords);
195     atomser2resser.put(atomserial, res_serial);
196     resser2restype.put(res_serial, res_type);
197     ArrayList<String> atomlist = aas2atoms.get(res_type);
198     if (atomlist.contains(atom)){
199     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
200     }
201     }
202     }
203     }
204    
205 duarte 133 /**
206     * To read the pdb data (atom coordinates, residue serials, atom serials) from file
207 duarte 153 * chain gets set to internal identifier: if input chain code NULL then chain will be ' '
208     * accode gets set to the one parsed in HEADER or to 'Unknown' if not found
209     * sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only)
210 duarte 133 * @param pdbfile
211     * @throws FileNotFoundException
212     * @throws IOException
213     */
214 duarte 127 public void read_pdb_data_from_file(String pdbfile) throws FileNotFoundException, IOException{
215     resser_atom2atomserial = new HashMap<String,Integer>();
216     resser2restype = new HashMap<Integer,String>();
217     atomser2coord = new HashMap<Integer,Double[]>();
218     atomser2resser = new HashMap<Integer,Integer>();
219 duarte 153 boolean empty = true; // controls whether we don't find any atom line for given chaincode and model
220     // NULL is a blank chain code in pdb files
221     String chaincodestr=chaincode;
222     if (chaincode.equals("NULL")) chaincodestr=" ";
223     int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
224 duarte 127 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
225     String line;
226     while ((line = fpdb.readLine() ) != null ) {
227 duarte 153 Pattern p = Pattern.compile("^HEADER");
228 duarte 127 Matcher m = p.matcher(line);
229     if (m.find()){
230 duarte 153 Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
231     Matcher mh = ph.matcher(line);
232     if (mh.find()) {
233     accode=mh.group(1).toLowerCase();
234     } else {
235     accode="Unknown";
236     }
237     }
238     p = Pattern.compile("^MODEL\\s+(\\d+)");
239     m = p.matcher(line);
240     if (m.find()){
241     thismodel=Integer.parseInt(m.group(1));
242     }
243     if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
244     p = Pattern.compile("^ATOM");
245     m = p.matcher(line);
246     if (m.find()){
247     // serial atom res_type chain res_ser x y z
248     Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}("+chaincodestr+")(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
249 duarte 127 Matcher ml = pl.matcher(line);
250     if (ml.find()) {
251 duarte 153 empty=false;
252 duarte 127 int atomserial=Integer.parseInt(ml.group(1).trim());
253     String atom = ml.group(2).trim();
254     String res_type = ml.group(3).trim();
255 duarte 153 chain = ml.group(4); //we don't trim as we want to keep " " as the NULL chain code
256 duarte 135 int res_serial = Integer.parseInt(ml.group(5).trim());
257     double x = Double.parseDouble(ml.group(6).trim());
258     double y = Double.parseDouble(ml.group(7).trim());
259     double z = Double.parseDouble(ml.group(8).trim());
260 duarte 127 Double[] coords = {x, y, z};
261     ArrayList<String> aalist=AA.aas();
262     if (aalist.contains(res_type)) {
263     atomser2coord.put(atomserial, coords);
264     atomser2resser.put(atomserial, res_serial);
265     resser2restype.put(res_serial, res_type);
266     ArrayList<String> atomlist = aas2atoms.get(res_type);
267     if (atomlist.contains(atom)){
268     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
269     }
270 duarte 153 }
271     }
272 duarte 127 }
273     }
274     fpdb.close();
275 duarte 153 if (empty) System.err.println("Couldn't find any atom line for given chaincode: "+chaincode+", model: "+model);
276 duarte 127 // now we read the sequence from the resser2restype HashMap
277     // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
278     ArrayList<Integer> ressers = new ArrayList<Integer>();
279     for (int resser:resser2restype.keySet()) {
280     ressers.add(resser);
281     }
282     Collections.sort(ressers);
283     sequence="";
284     for (int resser:ressers){
285     String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
286     sequence += oneletter;
287     }
288     }
289    
290 duarte 153 /**
291     * Dumps coordinate data into a file in pdb format (ATOM lines only)
292     * The chain dumped is the value of the chain field, i.e. our internal chain identifier for Pdb objects
293     * @param outfile
294     * @throws IOException
295     */
296 duarte 123 public void dump2pdbfile(String outfile) throws IOException {
297 duarte 130 TreeMap<Integer,Object[]> lines = new TreeMap<Integer,Object[]>();
298 duarte 123 PrintStream Out = new PrintStream(new FileOutputStream(outfile));
299 duarte 153 Out.println("HEADER Dumped from "+db+". pdb accession code="+accode+", chain='"+chain+"'");
300 duarte 123 for (String resser_atom:resser_atom2atomserial.keySet()){
301     int atomserial = resser_atom2atomserial.get(resser_atom);
302     int res_serial = Integer.parseInt(resser_atom.split("_")[0]);
303     String atom = resser_atom.split("_")[1];
304     String res_type = resser2restype.get(res_serial);
305     Double[] coords = atomser2coord.get(atomserial);
306 duarte 153 Object[] fields = {atomserial, atom, res_type, chain, res_serial, coords[0], coords[1], coords[2]};
307 duarte 130 lines.put(atomserial, fields);
308 duarte 123 }
309 duarte 130 for (int atomserial:lines.keySet()){
310     Out.printf("ATOM %5d %3s %3s %1s%4d %8.3f%8.3f%8.3f\n",lines.get(atomserial));
311     }
312 duarte 123 Out.println("END");
313     Out.close();
314     }
315    
316     public void dumpseq(String seqfile) throws IOException {
317     PrintStream Out = new PrintStream(new FileOutputStream(seqfile));
318     Out.println(">"+accode+"_"+chaincode);
319     Out.println(sequence);
320     Out.close();
321     }
322    
323     public int get_length(){
324     return resser2restype.size();
325     }
326    
327     public HashMap<Integer,Double[]> get_coords_for_ct(String ct) {
328     HashMap<Integer,Double[]> coords = new HashMap<Integer,Double[]>();
329     HashMap<String,String[]> restype2atoms = AA.ct2atoms(ct);
330     for (int resser:resser2restype.keySet()){
331     String[] atoms = restype2atoms.get(resser2restype.get(resser));
332     for (String atom:atoms){
333     if (resser_atom2atomserial.containsKey(resser+"_"+atom)){
334     int atomser = resser_atom2atomserial.get(resser+"_"+atom);
335     Double[] coord = atomser2coord.get(atomser);
336     coords.put(atomser, coord);
337     }
338     else if (atom.equals("O") && resser_atom2atomserial.containsKey(resser+"_"+"OXT")){
339     int atomser = resser_atom2atomserial.get(resser+"_"+"OXT");
340     Double[] coord = atomser2coord.get(atomser);
341     coords.put(atomser, coord);
342     }
343     else {
344     System.err.println("Couldn't find "+atom+" atom for resser="+resser+". Continuing without that atom for this resser.");
345     }
346     }
347     }
348     return coords;
349     }
350    
351     public TreeMap<Contact, Double> calculate_dist_matrix(String ct){
352     TreeMap<Contact,Double> dist_matrix = new TreeMap<Contact,Double>();
353     if (!ct.contains("/")){
354     HashMap<Integer,Double[]> coords = get_coords_for_ct(ct);
355     for (int i_atomser:coords.keySet()){
356     for (int j_atomser:coords.keySet()){
357     if (j_atomser>i_atomser) {
358     Contact pair = new Contact(i_atomser,j_atomser);
359     dist_matrix.put(pair, distance(coords.get(i_atomser),coords.get(j_atomser)));
360     }
361     }
362     }
363     } else {
364     String i_ct = ct.split("/")[0];
365     String j_ct = ct.split("/")[1];
366     HashMap<Integer,Double[]> i_coords = get_coords_for_ct(i_ct);
367     HashMap<Integer,Double[]> j_coords = get_coords_for_ct(j_ct);
368     for (int i_atomser:i_coords.keySet()){
369     for (int j_atomser:j_coords.keySet()){
370     if (j_atomser!=i_atomser){
371     Contact pair = new Contact(i_atomser,j_atomser);
372     dist_matrix.put(pair, distance(i_coords.get(i_atomser),j_coords.get(j_atomser)));
373     }
374     }
375     }
376     }
377     return dist_matrix;
378     }
379    
380 duarte 129 /**
381     * Get the contacts for given contact type and cutoff for this Pdb object.
382     * Returns a Graph object with the contacts
383     * The graph is always directional, i.e. in crossed cases xx/yy: i corresponds to xx and j to yy
384     * ct here can be single contact type (e.g. Ca, BB) or crossed (e.g. BB/SC or Ca/Cb)
385     * @param ct
386     * @param cutoff
387     * @return
388     */
389 duarte 123 public Graph get_graph(String ct, double cutoff){
390     TreeMap<Contact,Double> dist_matrix = calculate_dist_matrix(ct);
391     ArrayList<Contact> contacts = new ArrayList<Contact>();
392 duarte 129 // we loop here over all indices of dist_matrix,
393     // we took care already that in symmetric cases (i.e. single contact type, not crossed) we have only one side of the matrix and
394     // in asymmetrical cases (i.e. crossed cases) we have both sides of the matrix
395 duarte 123 for (Contact pair:dist_matrix.keySet()){
396     int i_atomser=pair.i;
397     int j_atomser=pair.j;
398     if (dist_matrix.get(pair)<=cutoff){
399     int i_resser = atomser2resser.get(i_atomser);
400     int j_resser = atomser2resser.get(j_atomser);
401     Contact resser_pair = new Contact(i_resser,j_resser);
402 duarte 129 // for multi-atom models (BB, SC, ALL or BB/SC) we need to make sure that we don't have contacts from residue to itself or that we don't have duplicates
403 duarte 123 if (i_resser!=j_resser && (! contacts.contains(resser_pair))){
404     contacts.add(resser_pair);
405     }
406     }
407     }
408     Collections.sort(contacts);
409 duarte 129 TreeMap<Integer,String> nodes = new TreeMap<Integer,String>();
410     for (int resser:resser2restype.keySet()){
411     nodes.put(resser,resser2restype.get(resser));
412     }
413 duarte 135 // NOTE: we pass to the graph object the chain (internal chain identifier) not the pdb chain code!
414 duarte 129 Graph graph = new Graph (contacts,nodes,sequence,cutoff,ct,accode,chain);
415 duarte 123 return graph;
416     }
417    
418     public Double distance(Double[] coords1, Double[] coords2){
419     return Math.sqrt(Math.pow((coords1[0]-coords2[0]),2)+Math.pow((coords1[1]-coords2[1]),2)+Math.pow((coords1[2]-coords2[2]),2));
420     }
421     }