ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Pdb.java
Revision: 154
Committed: Fri May 18 09:32:28 2007 UTC (17 years, 9 months ago) by duarte
File size: 16929 byte(s)
Log Message:
Recoded chain reading in read_pdb_data_from_file to make it clearer, no change in functionality
Line User Rev File contents
1 duarte 123 package proteinstructure;
2    
3 duarte 127 import java.io.BufferedReader;
4     import java.io.File;
5 duarte 123 import java.io.FileOutputStream;
6 duarte 127 import java.io.FileReader;
7 duarte 123 import java.io.PrintStream;
8 duarte 127 import java.io.FileNotFoundException;
9 duarte 123 import java.io.IOException;
10     import java.util.HashMap;
11     import java.util.TreeMap;
12     import java.util.ArrayList;
13     import java.util.Collections;
14 duarte 127 import java.util.regex.Matcher;
15     import java.util.regex.Pattern;
16 duarte 123
17     public class Pdb {
18    
19 duarte 153 public final static int DEFAULT_MODEL=1;
20    
21 duarte 123 HashMap<String,Integer> resser_atom2atomserial;
22     HashMap<Integer,String> resser2restype;
23     HashMap<Integer,Double[]> atomser2coord;
24     HashMap<Integer,Integer> atomser2resser;
25     HashMap<String,ArrayList<String>> aas2atoms = AA.getaas2atoms();
26 duarte 145 public String sequence="";
27 stehr 140 public String accode="";
28 duarte 135 // given "external" pdb chain code, i.e. the classic pdb code ("NULL" if it is blank in original pdb file)
29 duarte 145 public String chaincode="";
30     public int model=DEFAULT_MODEL;
31 duarte 123 String db;
32 duarte 135
33     // Our internal chain identifier (taken from dbs or pdb):
34     // - in reading from pdbase or from msdsd it will be set to the internal chain id (asym_id field for pdbase, pchain_id for msdsd)
35 duarte 153 // - in reading from pdb file it gets set to whatever parsed from the pdb file (i.e. can be also ' ')
36 stehr 137 public String chain;
37 duarte 123
38 duarte 153 /**
39     * Constructs Pdb object given accession code and pdb chain code. Model will be DEFAULT_MODEL
40     * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
41     * @param accode
42     * @param chaincode
43     * @throws PdbaseInconsistencyError
44     * @throws PdbaseAcCodeNotFoundError
45     * @throws MsdsdAcCodeNotFoundError
46     * @throws MsdsdInconsistentResidueNumbersError
47     */
48 duarte 130 public Pdb (String accode, String chaincode) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
49 duarte 133 this(accode,chaincode,DEFAULT_MODEL,PdbaseInfo.pdbaseDB);
50 duarte 123 }
51 duarte 133
52 duarte 153 /**
53     * Constructs Pdb object given accession code, pdb chain code and a model serial
54     * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
55     * @param accode
56     * @param chaincode
57     * @param model_serial
58     * @throws PdbaseInconsistencyError
59     * @throws PdbaseAcCodeNotFoundError
60     * @throws MsdsdAcCodeNotFoundError
61     * @throws MsdsdInconsistentResidueNumbersError
62     */
63 duarte 133 public Pdb (String accode, String chaincode, int model_serial) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
64     this(accode,chaincode,model_serial,PdbaseInfo.pdbaseDB);
65     }
66 duarte 153
67     /**
68     * Constructs Pdb object given accession code, pdb chain code and a source db for the data. Model will be DEFAULT_MODEL
69     * The db can be pdbase-like or msdsd-like
70     * @param accode
71     * @param chaincode
72     * @param db
73     * @throws PdbaseInconsistencyError
74     * @throws PdbaseAcCodeNotFoundError
75     * @throws MsdsdAcCodeNotFoundError
76     * @throws MsdsdInconsistentResidueNumbersError
77     */
78 duarte 130 public Pdb (String accode, String chaincode, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
79 duarte 133 this(accode,chaincode,DEFAULT_MODEL,db);
80     }
81    
82 duarte 153 /**
83     * Constructs Pdb object given accession code, pdb chain code, model serial and a source db for the data.
84     * The db can be pdbase-like or msdsd-like
85     * @param accode
86     * @param chaincode
87     * @param model_serial
88     * @param db
89     * @throws PdbaseInconsistencyError
90     * @throws PdbaseAcCodeNotFoundError
91     * @throws MsdsdAcCodeNotFoundError
92     * @throws MsdsdInconsistentResidueNumbersError
93     */
94 duarte 133 public Pdb (String accode, String chaincode, int model_serial, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
95 duarte 123 this.accode=accode;
96     this.chaincode=chaincode;
97 duarte 133 this.model=model_serial;
98 duarte 123 this.db=db;
99 duarte 130 // we initialise it to chaincode, in read_pdb_data_from_pdbase gets reset to the right internal chain id. If reading msdsd it stays as chaincode
100     this.chain=chaincode;
101     if (db.contains("msdsd")){
102     read_pdb_data_from_msdsd(db);
103     } else if (db.contains("pdbase")){
104     read_pdb_data_from_pdbase(db);
105     }
106 duarte 123 }
107    
108 duarte 153 /**
109     * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
110     * @param pdbfile
111     * @param chaincode
112     * @param dummy dummy parameter to distinguish the method from other with the same signature
113     * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
114     * @throws FileNotFoundException
115     * @throws IOException
116     */
117     public Pdb (String pdbfile, String chaincode, boolean dummy) throws FileNotFoundException, IOException {
118     this(pdbfile,chaincode,DEFAULT_MODEL,dummy);
119 duarte 133 }
120 duarte 153
121     /**
122     * Constructs Pdb object reading from pdb file given pdb chain code and model serial
123     * @param pdbfile
124     * @param chaincode
125     * @param model_serial
126     * @param dummy dummy parameter to distinguish the method from other with the same signature
127     * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
128     * @throws FileNotFoundException
129     * @throws IOException
130     */
131     public Pdb (String pdbfile, String chaincode, int model_serial, boolean dummy) throws FileNotFoundException, IOException {
132 duarte 133 this.model=model_serial;
133 duarte 153 this.chaincode=chaincode;
134 duarte 127 read_pdb_data_from_file(pdbfile);
135 duarte 123 }
136 duarte 133
137 duarte 124 public void read_pdb_data_from_pdbase(String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError{
138 duarte 123 resser_atom2atomserial = new HashMap<String,Integer>();
139     resser2restype = new HashMap<Integer,String>();
140     atomser2coord = new HashMap<Integer,Double[]>();
141     atomser2resser = new HashMap<Integer,Integer>();
142    
143 duarte 133 PdbaseInfo mypdbaseinfo = new PdbaseInfo(accode,chaincode,model,db);
144 duarte 123 ArrayList<ArrayList> resultset = mypdbaseinfo.read_atomData();
145 duarte 135 chain = mypdbaseinfo.chain;
146 duarte 123 sequence = mypdbaseinfo.read_seq();
147     mypdbaseinfo.close();
148    
149     for (ArrayList result:resultset){
150     int atomserial = (Integer) result.get(0);
151     String atom = (String) result.get(1);
152     String res_type = (String) result.get(2);
153 duarte 135 int res_serial = (Integer) result.get(3);
154     double x = (Double) result.get(4);
155     double y = (Double) result.get(5);
156     double z = (Double) result.get(6);
157 duarte 123 Double[] coords = {x, y, z};
158     ArrayList<String> aalist=AA.aas();
159     if (aalist.contains(res_type)) {
160     atomser2coord.put(atomserial, coords);
161     atomser2resser.put(atomserial, res_serial);
162     resser2restype.put(res_serial, res_type);
163     ArrayList<String> atomlist = aas2atoms.get(res_type);
164     if (atomlist.contains(atom)){
165     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
166     }
167     }
168     }
169     }
170    
171 duarte 130 public void read_pdb_data_from_msdsd(String db) throws MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
172     resser_atom2atomserial = new HashMap<String,Integer>();
173     resser2restype = new HashMap<Integer,String>();
174     atomser2coord = new HashMap<Integer,Double[]>();
175     atomser2resser = new HashMap<Integer,Integer>();
176    
177 duarte 133 MsdsdInfo mymsdsdinfo = new MsdsdInfo(accode,chaincode,model,db);
178 duarte 130 ArrayList<ArrayList> resultset = mymsdsdinfo.read_atomData();
179     sequence = mymsdsdinfo.read_seq();
180 duarte 138 chain = mymsdsdinfo.chain;
181 duarte 130 mymsdsdinfo.close();
182    
183     for (ArrayList result:resultset){
184     int atomserial = (Integer) result.get(0);
185     String atom = (String) result.get(1);
186     String res_type = (String) result.get(2);
187     int res_serial = (Integer) result.get(3);
188     double x = (Double) result.get(4);
189     double y = (Double) result.get(5);
190     double z = (Double) result.get(6);
191     Double[] coords = {x, y, z};
192     ArrayList<String> aalist=AA.aas();
193     if (aalist.contains(res_type)) {
194     atomser2coord.put(atomserial, coords);
195     atomser2resser.put(atomserial, res_serial);
196     resser2restype.put(res_serial, res_type);
197     ArrayList<String> atomlist = aas2atoms.get(res_type);
198     if (atomlist.contains(atom)){
199     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
200     }
201     }
202     }
203     }
204    
205 duarte 133 /**
206     * To read the pdb data (atom coordinates, residue serials, atom serials) from file
207 duarte 153 * chain gets set to internal identifier: if input chain code NULL then chain will be ' '
208     * accode gets set to the one parsed in HEADER or to 'Unknown' if not found
209     * sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only)
210 duarte 133 * @param pdbfile
211     * @throws FileNotFoundException
212     * @throws IOException
213     */
214 duarte 127 public void read_pdb_data_from_file(String pdbfile) throws FileNotFoundException, IOException{
215     resser_atom2atomserial = new HashMap<String,Integer>();
216     resser2restype = new HashMap<Integer,String>();
217     atomser2coord = new HashMap<Integer,Double[]>();
218     atomser2resser = new HashMap<Integer,Integer>();
219 duarte 153 boolean empty = true; // controls whether we don't find any atom line for given chaincode and model
220 duarte 154 // we set chain to chaincode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
221     chain=chaincode;
222     if (chaincode.equals("NULL")) chain=" ";
223 duarte 153 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
224 duarte 127 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
225     String line;
226     while ((line = fpdb.readLine() ) != null ) {
227 duarte 153 Pattern p = Pattern.compile("^HEADER");
228 duarte 127 Matcher m = p.matcher(line);
229     if (m.find()){
230 duarte 153 Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
231     Matcher mh = ph.matcher(line);
232     if (mh.find()) {
233     accode=mh.group(1).toLowerCase();
234     } else {
235     accode="Unknown";
236     }
237     }
238     p = Pattern.compile("^MODEL\\s+(\\d+)");
239     m = p.matcher(line);
240     if (m.find()){
241     thismodel=Integer.parseInt(m.group(1));
242     }
243     if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
244     p = Pattern.compile("^ATOM");
245     m = p.matcher(line);
246     if (m.find()){
247 duarte 154 // serial atom res_type chain res_ser x y z
248     Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chain+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
249 duarte 127 Matcher ml = pl.matcher(line);
250     if (ml.find()) {
251 duarte 153 empty=false;
252 duarte 127 int atomserial=Integer.parseInt(ml.group(1).trim());
253     String atom = ml.group(2).trim();
254     String res_type = ml.group(3).trim();
255 duarte 154 int res_serial = Integer.parseInt(ml.group(4).trim());
256     double x = Double.parseDouble(ml.group(5).trim());
257     double y = Double.parseDouble(ml.group(6).trim());
258     double z = Double.parseDouble(ml.group(7).trim());
259 duarte 127 Double[] coords = {x, y, z};
260     ArrayList<String> aalist=AA.aas();
261     if (aalist.contains(res_type)) {
262     atomser2coord.put(atomserial, coords);
263     atomser2resser.put(atomserial, res_serial);
264     resser2restype.put(res_serial, res_type);
265     ArrayList<String> atomlist = aas2atoms.get(res_type);
266     if (atomlist.contains(atom)){
267     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
268     }
269 duarte 153 }
270     }
271 duarte 127 }
272     }
273     fpdb.close();
274 duarte 153 if (empty) System.err.println("Couldn't find any atom line for given chaincode: "+chaincode+", model: "+model);
275 duarte 127 // now we read the sequence from the resser2restype HashMap
276     // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
277     ArrayList<Integer> ressers = new ArrayList<Integer>();
278     for (int resser:resser2restype.keySet()) {
279     ressers.add(resser);
280     }
281     Collections.sort(ressers);
282     sequence="";
283     for (int resser:ressers){
284     String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
285     sequence += oneletter;
286     }
287     }
288    
289 duarte 153 /**
290     * Dumps coordinate data into a file in pdb format (ATOM lines only)
291     * The chain dumped is the value of the chain field, i.e. our internal chain identifier for Pdb objects
292     * @param outfile
293     * @throws IOException
294     */
295 duarte 123 public void dump2pdbfile(String outfile) throws IOException {
296 duarte 130 TreeMap<Integer,Object[]> lines = new TreeMap<Integer,Object[]>();
297 duarte 123 PrintStream Out = new PrintStream(new FileOutputStream(outfile));
298 duarte 153 Out.println("HEADER Dumped from "+db+". pdb accession code="+accode+", chain='"+chain+"'");
299 duarte 123 for (String resser_atom:resser_atom2atomserial.keySet()){
300     int atomserial = resser_atom2atomserial.get(resser_atom);
301     int res_serial = Integer.parseInt(resser_atom.split("_")[0]);
302     String atom = resser_atom.split("_")[1];
303     String res_type = resser2restype.get(res_serial);
304     Double[] coords = atomser2coord.get(atomserial);
305 duarte 153 Object[] fields = {atomserial, atom, res_type, chain, res_serial, coords[0], coords[1], coords[2]};
306 duarte 130 lines.put(atomserial, fields);
307 duarte 123 }
308 duarte 130 for (int atomserial:lines.keySet()){
309     Out.printf("ATOM %5d %3s %3s %1s%4d %8.3f%8.3f%8.3f\n",lines.get(atomserial));
310     }
311 duarte 123 Out.println("END");
312     Out.close();
313     }
314    
315     public void dumpseq(String seqfile) throws IOException {
316     PrintStream Out = new PrintStream(new FileOutputStream(seqfile));
317     Out.println(">"+accode+"_"+chaincode);
318     Out.println(sequence);
319     Out.close();
320     }
321    
322     public int get_length(){
323     return resser2restype.size();
324     }
325    
326     public HashMap<Integer,Double[]> get_coords_for_ct(String ct) {
327     HashMap<Integer,Double[]> coords = new HashMap<Integer,Double[]>();
328     HashMap<String,String[]> restype2atoms = AA.ct2atoms(ct);
329     for (int resser:resser2restype.keySet()){
330     String[] atoms = restype2atoms.get(resser2restype.get(resser));
331     for (String atom:atoms){
332     if (resser_atom2atomserial.containsKey(resser+"_"+atom)){
333     int atomser = resser_atom2atomserial.get(resser+"_"+atom);
334     Double[] coord = atomser2coord.get(atomser);
335     coords.put(atomser, coord);
336     }
337     else if (atom.equals("O") && resser_atom2atomserial.containsKey(resser+"_"+"OXT")){
338     int atomser = resser_atom2atomserial.get(resser+"_"+"OXT");
339     Double[] coord = atomser2coord.get(atomser);
340     coords.put(atomser, coord);
341     }
342     else {
343     System.err.println("Couldn't find "+atom+" atom for resser="+resser+". Continuing without that atom for this resser.");
344     }
345     }
346     }
347     return coords;
348     }
349    
350     public TreeMap<Contact, Double> calculate_dist_matrix(String ct){
351     TreeMap<Contact,Double> dist_matrix = new TreeMap<Contact,Double>();
352     if (!ct.contains("/")){
353     HashMap<Integer,Double[]> coords = get_coords_for_ct(ct);
354     for (int i_atomser:coords.keySet()){
355     for (int j_atomser:coords.keySet()){
356     if (j_atomser>i_atomser) {
357     Contact pair = new Contact(i_atomser,j_atomser);
358     dist_matrix.put(pair, distance(coords.get(i_atomser),coords.get(j_atomser)));
359     }
360     }
361     }
362     } else {
363     String i_ct = ct.split("/")[0];
364     String j_ct = ct.split("/")[1];
365     HashMap<Integer,Double[]> i_coords = get_coords_for_ct(i_ct);
366     HashMap<Integer,Double[]> j_coords = get_coords_for_ct(j_ct);
367     for (int i_atomser:i_coords.keySet()){
368     for (int j_atomser:j_coords.keySet()){
369     if (j_atomser!=i_atomser){
370     Contact pair = new Contact(i_atomser,j_atomser);
371     dist_matrix.put(pair, distance(i_coords.get(i_atomser),j_coords.get(j_atomser)));
372     }
373     }
374     }
375     }
376     return dist_matrix;
377     }
378    
379 duarte 129 /**
380     * Get the contacts for given contact type and cutoff for this Pdb object.
381     * Returns a Graph object with the contacts
382     * The graph is always directional, i.e. in crossed cases xx/yy: i corresponds to xx and j to yy
383     * ct here can be single contact type (e.g. Ca, BB) or crossed (e.g. BB/SC or Ca/Cb)
384     * @param ct
385     * @param cutoff
386     * @return
387     */
388 duarte 123 public Graph get_graph(String ct, double cutoff){
389     TreeMap<Contact,Double> dist_matrix = calculate_dist_matrix(ct);
390     ArrayList<Contact> contacts = new ArrayList<Contact>();
391 duarte 129 // we loop here over all indices of dist_matrix,
392     // we took care already that in symmetric cases (i.e. single contact type, not crossed) we have only one side of the matrix and
393     // in asymmetrical cases (i.e. crossed cases) we have both sides of the matrix
394 duarte 123 for (Contact pair:dist_matrix.keySet()){
395     int i_atomser=pair.i;
396     int j_atomser=pair.j;
397     if (dist_matrix.get(pair)<=cutoff){
398     int i_resser = atomser2resser.get(i_atomser);
399     int j_resser = atomser2resser.get(j_atomser);
400     Contact resser_pair = new Contact(i_resser,j_resser);
401 duarte 129 // for multi-atom models (BB, SC, ALL or BB/SC) we need to make sure that we don't have contacts from residue to itself or that we don't have duplicates
402 duarte 123 if (i_resser!=j_resser && (! contacts.contains(resser_pair))){
403     contacts.add(resser_pair);
404     }
405     }
406     }
407     Collections.sort(contacts);
408 duarte 129 TreeMap<Integer,String> nodes = new TreeMap<Integer,String>();
409     for (int resser:resser2restype.keySet()){
410     nodes.put(resser,resser2restype.get(resser));
411     }
412 duarte 135 // NOTE: we pass to the graph object the chain (internal chain identifier) not the pdb chain code!
413 duarte 129 Graph graph = new Graph (contacts,nodes,sequence,cutoff,ct,accode,chain);
414 duarte 123 return graph;
415     }
416    
417     public Double distance(Double[] coords1, Double[] coords2){
418     return Math.sqrt(Math.pow((coords1[0]-coords2[0]),2)+Math.pow((coords1[1]-coords2[1]),2)+Math.pow((coords1[2]-coords2[2]),2));
419     }
420     }