ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Pdb.java
Revision: 190
Committed: Tue Jun 12 13:38:46 2007 UTC (17 years, 3 months ago) by duarte
File size: 17584 byte(s)
Log Message:
NEW FEATURE: mapping of pdb residue serials and internal serials in the Pdb object

Line User Rev File contents
1 duarte 123 package proteinstructure;
2    
3 duarte 127 import java.io.BufferedReader;
4     import java.io.File;
5 duarte 123 import java.io.FileOutputStream;
6 duarte 127 import java.io.FileReader;
7 duarte 123 import java.io.PrintStream;
8 duarte 127 import java.io.FileNotFoundException;
9 duarte 123 import java.io.IOException;
10     import java.util.HashMap;
11     import java.util.TreeMap;
12     import java.util.ArrayList;
13     import java.util.Collections;
14 duarte 127 import java.util.regex.Matcher;
15     import java.util.regex.Pattern;
16 duarte 123
17     public class Pdb {
18    
19 duarte 153 public final static int DEFAULT_MODEL=1;
20    
21 duarte 123 HashMap<String,Integer> resser_atom2atomserial;
22     HashMap<Integer,String> resser2restype;
23     HashMap<Integer,Double[]> atomser2coord;
24     HashMap<Integer,Integer> atomser2resser;
25 duarte 190 HashMap<String,Integer> pdbresser2resser; // only used when reading from database
26     HashMap<Integer,String> resser2pdbresser; // only used when reading from database
27    
28 duarte 123 HashMap<String,ArrayList<String>> aas2atoms = AA.getaas2atoms();
29 duarte 145 public String sequence="";
30 stehr 140 public String accode="";
31 duarte 135 // given "external" pdb chain code, i.e. the classic pdb code ("NULL" if it is blank in original pdb file)
32 duarte 145 public String chaincode="";
33     public int model=DEFAULT_MODEL;
34 duarte 123 String db;
35 duarte 135
36     // Our internal chain identifier (taken from dbs or pdb):
37     // - in reading from pdbase or from msdsd it will be set to the internal chain id (asym_id field for pdbase, pchain_id for msdsd)
38 duarte 153 // - in reading from pdb file it gets set to whatever parsed from the pdb file (i.e. can be also ' ')
39 stehr 137 public String chain;
40 duarte 123
41 duarte 153 /**
42     * Constructs Pdb object given accession code and pdb chain code. Model will be DEFAULT_MODEL
43     * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
44     * @param accode
45     * @param chaincode
46     * @throws PdbaseInconsistencyError
47     * @throws PdbaseAcCodeNotFoundError
48     * @throws MsdsdAcCodeNotFoundError
49     * @throws MsdsdInconsistentResidueNumbersError
50     */
51 duarte 130 public Pdb (String accode, String chaincode) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
52 duarte 133 this(accode,chaincode,DEFAULT_MODEL,PdbaseInfo.pdbaseDB);
53 duarte 123 }
54 duarte 133
55 duarte 153 /**
56     * Constructs Pdb object given accession code, pdb chain code and a model serial
57     * Takes data from default pdbase-like database: PdbaseInfo.pdbaseDB
58     * @param accode
59     * @param chaincode
60     * @param model_serial
61     * @throws PdbaseInconsistencyError
62     * @throws PdbaseAcCodeNotFoundError
63     * @throws MsdsdAcCodeNotFoundError
64     * @throws MsdsdInconsistentResidueNumbersError
65     */
66 duarte 133 public Pdb (String accode, String chaincode, int model_serial) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
67     this(accode,chaincode,model_serial,PdbaseInfo.pdbaseDB);
68     }
69 duarte 153
70     /**
71     * Constructs Pdb object given accession code, pdb chain code and a source db for the data. Model will be DEFAULT_MODEL
72     * The db can be pdbase-like or msdsd-like
73     * @param accode
74     * @param chaincode
75     * @param db
76     * @throws PdbaseInconsistencyError
77     * @throws PdbaseAcCodeNotFoundError
78     * @throws MsdsdAcCodeNotFoundError
79     * @throws MsdsdInconsistentResidueNumbersError
80     */
81 duarte 130 public Pdb (String accode, String chaincode, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
82 duarte 133 this(accode,chaincode,DEFAULT_MODEL,db);
83     }
84    
85 duarte 153 /**
86     * Constructs Pdb object given accession code, pdb chain code, model serial and a source db for the data.
87     * The db can be pdbase-like or msdsd-like
88     * @param accode
89     * @param chaincode
90     * @param model_serial
91     * @param db
92     * @throws PdbaseInconsistencyError
93     * @throws PdbaseAcCodeNotFoundError
94     * @throws MsdsdAcCodeNotFoundError
95     * @throws MsdsdInconsistentResidueNumbersError
96     */
97 duarte 133 public Pdb (String accode, String chaincode, int model_serial, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError, MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
98 duarte 123 this.accode=accode;
99     this.chaincode=chaincode;
100 duarte 133 this.model=model_serial;
101 duarte 123 this.db=db;
102 duarte 130 // we initialise it to chaincode, in read_pdb_data_from_pdbase gets reset to the right internal chain id. If reading msdsd it stays as chaincode
103     this.chain=chaincode;
104     if (db.contains("msdsd")){
105     read_pdb_data_from_msdsd(db);
106     } else if (db.contains("pdbase")){
107     read_pdb_data_from_pdbase(db);
108     }
109 duarte 190 // we initialise resser2pdbresser from the pdbresser2resser HashMap
110     resser2pdbresser = new HashMap<Integer, String>();
111     for (String pdbresser:pdbresser2resser.keySet()){
112     resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser);
113     }
114 duarte 123 }
115    
116 duarte 153 /**
117     * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
118     * @param pdbfile
119     * @param chaincode
120     * @param dummy dummy parameter to distinguish the method from other with the same signature
121     * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
122     * @throws FileNotFoundException
123     * @throws IOException
124     */
125     public Pdb (String pdbfile, String chaincode, boolean dummy) throws FileNotFoundException, IOException {
126     this(pdbfile,chaincode,DEFAULT_MODEL,dummy);
127 duarte 133 }
128 duarte 153
129     /**
130     * Constructs Pdb object reading from pdb file given pdb chain code and model serial
131     * @param pdbfile
132     * @param chaincode
133     * @param model_serial
134     * @param dummy dummy parameter to distinguish the method from other with the same signature
135     * TODO dummy parameter is a dirty hack, must solve it in other way: subclassing
136     * @throws FileNotFoundException
137     * @throws IOException
138     */
139     public Pdb (String pdbfile, String chaincode, int model_serial, boolean dummy) throws FileNotFoundException, IOException {
140 duarte 133 this.model=model_serial;
141 duarte 153 this.chaincode=chaincode;
142 duarte 127 read_pdb_data_from_file(pdbfile);
143 duarte 123 }
144 duarte 133
145 duarte 124 public void read_pdb_data_from_pdbase(String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError{
146 duarte 123 resser_atom2atomserial = new HashMap<String,Integer>();
147     resser2restype = new HashMap<Integer,String>();
148     atomser2coord = new HashMap<Integer,Double[]>();
149     atomser2resser = new HashMap<Integer,Integer>();
150    
151 duarte 133 PdbaseInfo mypdbaseinfo = new PdbaseInfo(accode,chaincode,model,db);
152 duarte 123 ArrayList<ArrayList> resultset = mypdbaseinfo.read_atomData();
153 duarte 135 chain = mypdbaseinfo.chain;
154 duarte 123 sequence = mypdbaseinfo.read_seq();
155 duarte 190 pdbresser2resser = mypdbaseinfo.get_ressers_mapping();
156 duarte 123 mypdbaseinfo.close();
157    
158     for (ArrayList result:resultset){
159     int atomserial = (Integer) result.get(0);
160     String atom = (String) result.get(1);
161     String res_type = (String) result.get(2);
162 duarte 135 int res_serial = (Integer) result.get(3);
163     double x = (Double) result.get(4);
164     double y = (Double) result.get(5);
165     double z = (Double) result.get(6);
166 duarte 123 Double[] coords = {x, y, z};
167     ArrayList<String> aalist=AA.aas();
168     if (aalist.contains(res_type)) {
169     atomser2coord.put(atomserial, coords);
170     atomser2resser.put(atomserial, res_serial);
171     resser2restype.put(res_serial, res_type);
172     ArrayList<String> atomlist = aas2atoms.get(res_type);
173     if (atomlist.contains(atom)){
174     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
175     }
176     }
177     }
178     }
179    
180 duarte 130 public void read_pdb_data_from_msdsd(String db) throws MsdsdAcCodeNotFoundError, MsdsdInconsistentResidueNumbersError {
181     resser_atom2atomserial = new HashMap<String,Integer>();
182     resser2restype = new HashMap<Integer,String>();
183     atomser2coord = new HashMap<Integer,Double[]>();
184     atomser2resser = new HashMap<Integer,Integer>();
185    
186 duarte 133 MsdsdInfo mymsdsdinfo = new MsdsdInfo(accode,chaincode,model,db);
187 duarte 130 ArrayList<ArrayList> resultset = mymsdsdinfo.read_atomData();
188     sequence = mymsdsdinfo.read_seq();
189 duarte 138 chain = mymsdsdinfo.chain;
190 duarte 190 pdbresser2resser = mymsdsdinfo.get_ressers_mapping();
191 duarte 130 mymsdsdinfo.close();
192    
193     for (ArrayList result:resultset){
194     int atomserial = (Integer) result.get(0);
195     String atom = (String) result.get(1);
196     String res_type = (String) result.get(2);
197     int res_serial = (Integer) result.get(3);
198     double x = (Double) result.get(4);
199     double y = (Double) result.get(5);
200     double z = (Double) result.get(6);
201     Double[] coords = {x, y, z};
202     ArrayList<String> aalist=AA.aas();
203     if (aalist.contains(res_type)) {
204     atomser2coord.put(atomserial, coords);
205     atomser2resser.put(atomserial, res_serial);
206     resser2restype.put(res_serial, res_type);
207     ArrayList<String> atomlist = aas2atoms.get(res_type);
208     if (atomlist.contains(atom)){
209     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
210     }
211     }
212     }
213     }
214    
215 duarte 133 /**
216     * To read the pdb data (atom coordinates, residue serials, atom serials) from file
217 duarte 153 * chain gets set to internal identifier: if input chain code NULL then chain will be ' '
218     * accode gets set to the one parsed in HEADER or to 'Unknown' if not found
219     * sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only)
220 duarte 133 * @param pdbfile
221     * @throws FileNotFoundException
222     * @throws IOException
223     */
224 duarte 127 public void read_pdb_data_from_file(String pdbfile) throws FileNotFoundException, IOException{
225     resser_atom2atomserial = new HashMap<String,Integer>();
226     resser2restype = new HashMap<Integer,String>();
227     atomser2coord = new HashMap<Integer,Double[]>();
228     atomser2resser = new HashMap<Integer,Integer>();
229 duarte 153 boolean empty = true; // controls whether we don't find any atom line for given chaincode and model
230 duarte 154 // we set chain to chaincode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
231     chain=chaincode;
232     if (chaincode.equals("NULL")) chain=" ";
233 duarte 153 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
234 duarte 127 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
235     String line;
236     while ((line = fpdb.readLine() ) != null ) {
237 duarte 153 Pattern p = Pattern.compile("^HEADER");
238 duarte 127 Matcher m = p.matcher(line);
239     if (m.find()){
240 duarte 153 Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
241     Matcher mh = ph.matcher(line);
242     if (mh.find()) {
243     accode=mh.group(1).toLowerCase();
244     } else {
245     accode="Unknown";
246     }
247     }
248     p = Pattern.compile("^MODEL\\s+(\\d+)");
249     m = p.matcher(line);
250     if (m.find()){
251     thismodel=Integer.parseInt(m.group(1));
252     }
253     if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
254     p = Pattern.compile("^ATOM");
255     m = p.matcher(line);
256     if (m.find()){
257 duarte 154 // serial atom res_type chain res_ser x y z
258     Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chain+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
259 duarte 127 Matcher ml = pl.matcher(line);
260     if (ml.find()) {
261 duarte 153 empty=false;
262 duarte 127 int atomserial=Integer.parseInt(ml.group(1).trim());
263     String atom = ml.group(2).trim();
264     String res_type = ml.group(3).trim();
265 duarte 154 int res_serial = Integer.parseInt(ml.group(4).trim());
266     double x = Double.parseDouble(ml.group(5).trim());
267     double y = Double.parseDouble(ml.group(6).trim());
268     double z = Double.parseDouble(ml.group(7).trim());
269 duarte 127 Double[] coords = {x, y, z};
270     ArrayList<String> aalist=AA.aas();
271     if (aalist.contains(res_type)) {
272     atomser2coord.put(atomserial, coords);
273     atomser2resser.put(atomserial, res_serial);
274     resser2restype.put(res_serial, res_type);
275     ArrayList<String> atomlist = aas2atoms.get(res_type);
276     if (atomlist.contains(atom)){
277     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
278     }
279 duarte 153 }
280     }
281 duarte 127 }
282     }
283     fpdb.close();
284 duarte 153 if (empty) System.err.println("Couldn't find any atom line for given chaincode: "+chaincode+", model: "+model);
285 duarte 127 // now we read the sequence from the resser2restype HashMap
286     // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
287     ArrayList<Integer> ressers = new ArrayList<Integer>();
288     for (int resser:resser2restype.keySet()) {
289     ressers.add(resser);
290     }
291     Collections.sort(ressers);
292     sequence="";
293     for (int resser:ressers){
294     String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
295     sequence += oneletter;
296     }
297     }
298    
299 duarte 153 /**
300     * Dumps coordinate data into a file in pdb format (ATOM lines only)
301     * The chain dumped is the value of the chain field, i.e. our internal chain identifier for Pdb objects
302     * @param outfile
303     * @throws IOException
304     */
305 duarte 123 public void dump2pdbfile(String outfile) throws IOException {
306 duarte 130 TreeMap<Integer,Object[]> lines = new TreeMap<Integer,Object[]>();
307 duarte 123 PrintStream Out = new PrintStream(new FileOutputStream(outfile));
308 duarte 153 Out.println("HEADER Dumped from "+db+". pdb accession code="+accode+", chain='"+chain+"'");
309 duarte 123 for (String resser_atom:resser_atom2atomserial.keySet()){
310     int atomserial = resser_atom2atomserial.get(resser_atom);
311     int res_serial = Integer.parseInt(resser_atom.split("_")[0]);
312     String atom = resser_atom.split("_")[1];
313     String res_type = resser2restype.get(res_serial);
314     Double[] coords = atomser2coord.get(atomserial);
315 duarte 153 Object[] fields = {atomserial, atom, res_type, chain, res_serial, coords[0], coords[1], coords[2]};
316 duarte 130 lines.put(atomserial, fields);
317 duarte 123 }
318 duarte 130 for (int atomserial:lines.keySet()){
319     Out.printf("ATOM %5d %3s %3s %1s%4d %8.3f%8.3f%8.3f\n",lines.get(atomserial));
320     }
321 duarte 123 Out.println("END");
322     Out.close();
323     }
324    
325     public void dumpseq(String seqfile) throws IOException {
326     PrintStream Out = new PrintStream(new FileOutputStream(seqfile));
327     Out.println(">"+accode+"_"+chaincode);
328     Out.println(sequence);
329     Out.close();
330     }
331    
332     public int get_length(){
333     return resser2restype.size();
334     }
335    
336     public HashMap<Integer,Double[]> get_coords_for_ct(String ct) {
337     HashMap<Integer,Double[]> coords = new HashMap<Integer,Double[]>();
338     HashMap<String,String[]> restype2atoms = AA.ct2atoms(ct);
339     for (int resser:resser2restype.keySet()){
340     String[] atoms = restype2atoms.get(resser2restype.get(resser));
341     for (String atom:atoms){
342     if (resser_atom2atomserial.containsKey(resser+"_"+atom)){
343     int atomser = resser_atom2atomserial.get(resser+"_"+atom);
344     Double[] coord = atomser2coord.get(atomser);
345     coords.put(atomser, coord);
346     }
347     else if (atom.equals("O") && resser_atom2atomserial.containsKey(resser+"_"+"OXT")){
348     int atomser = resser_atom2atomserial.get(resser+"_"+"OXT");
349     Double[] coord = atomser2coord.get(atomser);
350     coords.put(atomser, coord);
351     }
352     else {
353     System.err.println("Couldn't find "+atom+" atom for resser="+resser+". Continuing without that atom for this resser.");
354     }
355     }
356     }
357     return coords;
358     }
359    
360     public TreeMap<Contact, Double> calculate_dist_matrix(String ct){
361     TreeMap<Contact,Double> dist_matrix = new TreeMap<Contact,Double>();
362     if (!ct.contains("/")){
363     HashMap<Integer,Double[]> coords = get_coords_for_ct(ct);
364     for (int i_atomser:coords.keySet()){
365     for (int j_atomser:coords.keySet()){
366     if (j_atomser>i_atomser) {
367     Contact pair = new Contact(i_atomser,j_atomser);
368     dist_matrix.put(pair, distance(coords.get(i_atomser),coords.get(j_atomser)));
369     }
370     }
371     }
372     } else {
373     String i_ct = ct.split("/")[0];
374     String j_ct = ct.split("/")[1];
375     HashMap<Integer,Double[]> i_coords = get_coords_for_ct(i_ct);
376     HashMap<Integer,Double[]> j_coords = get_coords_for_ct(j_ct);
377     for (int i_atomser:i_coords.keySet()){
378     for (int j_atomser:j_coords.keySet()){
379     if (j_atomser!=i_atomser){
380     Contact pair = new Contact(i_atomser,j_atomser);
381     dist_matrix.put(pair, distance(i_coords.get(i_atomser),j_coords.get(j_atomser)));
382     }
383     }
384     }
385     }
386     return dist_matrix;
387     }
388    
389 duarte 129 /**
390     * Get the contacts for given contact type and cutoff for this Pdb object.
391     * Returns a Graph object with the contacts
392     * The graph is always directional, i.e. in crossed cases xx/yy: i corresponds to xx and j to yy
393     * ct here can be single contact type (e.g. Ca, BB) or crossed (e.g. BB/SC or Ca/Cb)
394     * @param ct
395     * @param cutoff
396     * @return
397     */
398 duarte 190 @SuppressWarnings("unchecked")
399 duarte 123 public Graph get_graph(String ct, double cutoff){
400     TreeMap<Contact,Double> dist_matrix = calculate_dist_matrix(ct);
401 duarte 175 ContactList contacts = new ContactList();
402 duarte 129 // we loop here over all indices of dist_matrix,
403     // we took care already that in symmetric cases (i.e. single contact type, not crossed) we have only one side of the matrix and
404     // in asymmetrical cases (i.e. crossed cases) we have both sides of the matrix
405 duarte 123 for (Contact pair:dist_matrix.keySet()){
406     int i_atomser=pair.i;
407     int j_atomser=pair.j;
408     if (dist_matrix.get(pair)<=cutoff){
409     int i_resser = atomser2resser.get(i_atomser);
410     int j_resser = atomser2resser.get(j_atomser);
411     Contact resser_pair = new Contact(i_resser,j_resser);
412 duarte 129 // for multi-atom models (BB, SC, ALL or BB/SC) we need to make sure that we don't have contacts from residue to itself or that we don't have duplicates
413 duarte 123 if (i_resser!=j_resser && (! contacts.contains(resser_pair))){
414     contacts.add(resser_pair);
415     }
416     }
417     }
418     Collections.sort(contacts);
419 duarte 129 TreeMap<Integer,String> nodes = new TreeMap<Integer,String>();
420     for (int resser:resser2restype.keySet()){
421     nodes.put(resser,resser2restype.get(resser));
422     }
423 duarte 156 Graph graph = new Graph (contacts,nodes,sequence,cutoff,ct,accode,chain,chaincode);
424 duarte 123 return graph;
425     }
426    
427     public Double distance(Double[] coords1, Double[] coords2){
428     return Math.sqrt(Math.pow((coords1[0]-coords2[0]),2)+Math.pow((coords1[1]-coords2[1]),2)+Math.pow((coords1[2]-coords2[2]),2));
429     }
430 duarte 190
431     public int get_resser_from_pdbresser (String pdbresser){
432     return pdbresser2resser.get(pdbresser);
433     }
434    
435     public String get_pdbresser_from_resser (int resser){
436     return resser2pdbresser.get(resser);
437     }
438    
439 duarte 123 }