ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/PdbfilePdb.java
Revision: 326
Committed: Thu Sep 20 14:49:55 2007 UTC (17 years ago) by duarte
File size: 10907 byte(s)
Log Message:
Removed class AA and replace it by AAinfo, which reads contact types from separate file contactTypes.dat
New class ContactType which contains atoms for each contact type and residue. A static object for each contact type is loaded into AAinfo upon reading the contactTypes.dat file
Changed all references accordingly
Line User Rev File contents
1 duarte 207 package proteinstructure;
2    
3     import java.io.BufferedReader;
4     import java.io.File;
5     import java.io.FileNotFoundException;
6     import java.io.FileReader;
7     import java.io.IOException;
8     import java.util.ArrayList;
9     import java.util.Collections;
10     import java.util.HashMap;
11     import java.util.regex.Matcher;
12     import java.util.regex.Pattern;
13    
14 duarte 226 import javax.vecmath.Point3d;
15    
16 duarte 207 public class PdbfilePdb extends Pdb {
17    
18 stehr 301 private static final String UNKNOWN_STRING ="XXXX";
19 duarte 208 private static final String NULL_chainCode = "A";
20    
21 duarte 207 private String pdbfile;
22    
23     /**
24     * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
25     * @param pdbfile
26     * @param pdbChainCode
27     * @throws FileNotFoundException
28     * @throws IOException
29 duarte 208 * @throws PdbfileFormatError
30 stehr 215 * @throws PdbChainCodeNotFoundError
31 duarte 207 */
32 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
33 duarte 207 this(pdbfile,pdbChainCode,DEFAULT_MODEL);
34     }
35    
36     /**
37     * Constructs Pdb object reading from pdb file given pdb chain code and model serial
38     * @param pdbfile
39     * @param pdbChainCode
40     * @param model_serial
41     * @throws FileNotFoundException
42     * @throws IOException
43 duarte 208 * @throws PdbfileFormatError
44 stehr 215 * @throws PdbChainCodeNotFoundError
45 duarte 207 */
46 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode, int model_serial) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
47 duarte 207 this.pdbfile = pdbfile;
48     this.model=model_serial;
49 duarte 208 this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file
50 stehr 217 this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
51 duarte 208 // we set chainCode to pdbChainCode except for case NULL where we use "A"
52     this.chainCode=pdbChainCode;
53     if (pdbChainCode.equals("NULL")) this.chainCode=NULL_chainCode;
54    
55 duarte 258 this.sequence=""; // we initialise it to empty string, then is set in read_pdb_data_from_file
56 duarte 208
57 stehr 274 // we initialise the secondary structure to empty, if no sec structure info is found then they remain empty
58     this.secondaryStructure = new SecondaryStructure();
59 duarte 207 read_pdb_data_from_file();
60 duarte 278
61     this.obsLength = resser2restype.size();
62    
63 stehr 274 if(!secondaryStructure.isEmpty()) {
64     secondaryStructure.setComment("Author");
65 stehr 259 }
66 duarte 208
67     // when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know
68     //TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used
69     this.resser2pdbresser = new HashMap<Integer, String>();
70     this.pdbresser2resser = new HashMap<String, Integer>();
71     for (int resser:resser2restype.keySet()){
72     resser2pdbresser.put(resser, String.valueOf(resser));
73     pdbresser2resser.put(String.valueOf(resser), resser);
74     }
75 duarte 219
76 duarte 237 // initialising atomser2atom from resser_atom2atomserial
77     atomser2atom = new HashMap<Integer, String>();
78     for (String resser_atom:resser_atom2atomserial.keySet()){
79     int atomserial = resser_atom2atomserial.get(resser_atom);
80     String atom = resser_atom.split("_")[1];
81     atomser2atom.put(atomserial,atom);
82     }
83 duarte 207 }
84    
85    
86    
87     /**
88 duarte 219 * To read the pdb data (atom coordinates, residue serials, atom serials) from file.
89     * chainCode gets set to same as pdbChainCode, except if input chain code NULL then chainCode will be 'A'
90 duarte 207 * pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found
91 duarte 258 * sequence gets set to the sequence read from ATOM lines (i.e. observed residues only)
92 duarte 219 * No insertion codes are parsed or taken into account at the moment. Thus files with
93     * insertion codes will be incorrectly read
94 duarte 207 * @param pdbfile
95     * @throws FileNotFoundException
96     * @throws IOException
97 duarte 208 * @throws PdbfileFormatError
98 stehr 215 * @throws PdbChainCodeNotFoundError
99 duarte 207 */
100 stehr 215 private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
101 duarte 207 resser_atom2atomserial = new HashMap<String,Integer>();
102     resser2restype = new HashMap<Integer,String>();
103 duarte 226 atomser2coord = new HashMap<Integer,Point3d>();
104 duarte 207 atomser2resser = new HashMap<Integer,Integer>();
105 stehr 255 Pattern p;
106     Matcher m;
107 duarte 207 boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model
108 duarte 208 // we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
109     String chainCodeStr=pdbChainCode;
110     if (pdbChainCode.equals("NULL")) chainCodeStr=" ";
111    
112 duarte 207 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
113     BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
114 duarte 208 int linecount=0;
115 duarte 207 String line;
116 stehr 255 // read first line
117     if((line = fpdb.readLine()) != null ) {
118     linecount = 1;
119 duarte 219 // HEADER
120 stehr 255 p = Pattern.compile("^HEADER");
121     m = p.matcher(line);
122 duarte 207 if (m.find()){
123     Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
124     Matcher mh = ph.matcher(line);
125     if (mh.find()) {
126     pdbCode=mh.group(1).toLowerCase();
127     }
128 stehr 255 } else { // header not found
129     // check whether this is a Casp prediction file
130     p = Pattern.compile("^PFRMAT TS");
131     m = p.matcher(line);
132     if(m.find()) {
133     // ok, it is
134     pdbCode = "CASP";
135     } else {
136     // a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception
137     throw new PdbfileFormatError("The pdb file "+pdbfile+" doesn't seem to have the right format");
138     }
139 duarte 207 }
140 stehr 255 } else {
141     throw new PdbfileFormatError("The file "+pdbfile+" is empty.");
142     }
143     // read further lines
144     while ((line = fpdb.readLine() ) != null ) {
145     linecount++;
146 duarte 258 // SEQRES
147     //SEQRES 1 A 348 VAL ASN ILE LYS THR ASN PRO PHE LYS ALA VAL SER PHE
148     p = Pattern.compile("^SEQRES.{5}"+chainCodeStr);
149     m = p.matcher(line);
150     if (m.find()){
151     for (int i=19;i<=67;i+=4) {
152     if (!line.substring(i, i+3).equals(" ")) {
153 duarte 326 if (AAinfo.isValidAA(line.substring(i, i+3))) { // for non-standard aas
154     sequence+= AAinfo.threeletter2oneletter(line.substring(i, i+3));
155 duarte 262 } else {
156     sequence+= NONSTANDARD_AA_LETTER;
157 duarte 261 }
158 duarte 258 }
159     }
160     }
161 duarte 219 // SECONDARY STRUCTURE
162     // helix
163     //HELIX 1 1 LYS A 17 LEU A 26 1
164     // helix ser beg res ser end res ser
165     p = Pattern.compile("^HELIX..(...).{9}"+chainCodeStr+".(....).{6}"+chainCodeStr+".(....)");
166     m = p.matcher(line);
167     if (m.find()){
168     int serial = Integer.valueOf(m.group(1).trim());
169     int beg = Integer.valueOf(m.group(2).trim());
170     int end = Integer.valueOf(m.group(3).trim());
171 stehr 274 String ssId = new Character(SecStrucElement.HELIX).toString()+serial;
172     SecStrucElement ssElem = new SecStrucElement(SecStrucElement.HELIX,beg,end,ssId);
173     secondaryStructure.add(ssElem);
174 duarte 219 }
175     // sheet
176     //SHEET 2 A 5 ILE A 96 THR A 99 -1 N LYS A 98 O THR A 107
177     // strand ser sheet id beg res ser end res ser
178     p = Pattern.compile("^SHEET..(...).(...).{7}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
179     m = p.matcher(line);
180     if (m.find()){
181     int strandSerial = Integer.valueOf(m.group(1).trim());
182     String sheetId = m.group(2).trim();
183     int beg = Integer.valueOf(m.group(3).trim());
184     int end = Integer.valueOf(m.group(4).trim());
185 stehr 274 String ssId = new Character(SecStrucElement.STRAND).toString()+sheetId+strandSerial;
186     SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND,beg,end,ssId);
187     secondaryStructure.add(ssElem);
188 duarte 219 }
189     // we've stored the sec structure info in the strands2begEnd and sheets2strands maps.
190     // the assignment to resser2secstruct is done when we reach the ATOM lines, see below
191     // turn
192     //TURN 1 S1A GLY A 16 GLN A 18 SURFACE
193     // turn ser beg res ser end res ser
194     p = Pattern.compile("^TURN...(...).{9}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
195     m = p.matcher(line);
196     if (m.find()){
197     int serial = Integer.valueOf(m.group(1).trim());
198     int beg = Integer.valueOf(m.group(2).trim());
199     int end = Integer.valueOf(m.group(3).trim());
200 stehr 274 String ssId = new Character(SecStrucElement.TURN).toString()+serial;
201     SecStrucElement ssElem = new SecStrucElement(SecStrucElement.TURN,beg,end,ssId);
202     secondaryStructure.add(ssElem);
203 duarte 219 }
204     // MODEL
205 duarte 207 p = Pattern.compile("^MODEL\\s+(\\d+)");
206     m = p.matcher(line);
207     if (m.find()){
208     thismodel=Integer.parseInt(m.group(1));
209     }
210     if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
211 duarte 219 // ATOM
212 duarte 207 p = Pattern.compile("^ATOM");
213     m = p.matcher(line);
214     if (m.find()){
215 duarte 208 // serial atom res_type chain res_ser x y z
216 duarte 263 Pattern pl = Pattern.compile("^.{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
217 duarte 207 Matcher ml = pl.matcher(line);
218     if (ml.find()) {
219     empty=false;
220     int atomserial=Integer.parseInt(ml.group(1).trim());
221     String atom = ml.group(2).trim();
222     String res_type = ml.group(3).trim();
223     int res_serial = Integer.parseInt(ml.group(4).trim());
224     double x = Double.parseDouble(ml.group(5).trim());
225     double y = Double.parseDouble(ml.group(6).trim());
226     double z = Double.parseDouble(ml.group(7).trim());
227 duarte 226 Point3d coords = new Point3d(x,y,z);
228 duarte 326 if (AAinfo.isValidAA(res_type)) {
229 duarte 207 atomser2coord.put(atomserial, coords);
230     atomser2resser.put(atomserial, res_serial);
231     resser2restype.put(res_serial, res_type);
232 duarte 326 if (AAinfo.isValidAtomWithOXT(res_type,atom)){
233 duarte 207 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
234     }
235     }
236     }
237     }
238     }
239     fpdb.close();
240 stehr 215 if (empty) {
241     //System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model);
242     throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model);
243     }
244 duarte 258 if (sequence.equals("")){
245     // if we couldn't read anything from SEQRES then we read it from the resser2restype HashMap
246     // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
247     ArrayList<Integer> ressers = new ArrayList<Integer>();
248     for (int resser:resser2restype.keySet()) {
249     ressers.add(resser);
250     }
251     Collections.sort(ressers);
252     for (int resser:ressers){
253 duarte 326 String oneletter = AAinfo.threeletter2oneletter(resser2restype.get(resser));
254 duarte 258 sequence += oneletter;
255     }
256 duarte 278 // not size but maximum: if residue numbering in pdb file is correct, then this takes care of non-observed except for possible non-observed at end of chain
257     fullLength = Collections.max(resser2restype.keySet());
258     } else { // we read the sequence from SEQRES
259     fullLength = sequence.length();
260 duarte 207 }
261     }
262 duarte 219
263 duarte 207 }