ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/PdbfilePdb.java
Revision: 492
Committed: Wed Jan 2 13:18:57 2008 UTC (16 years, 9 months ago) by duarte
File size: 12219 byte(s)
Log Message:
Copied the aglappe-jung branch into trunk.

Line File contents
1 package proteinstructure;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.FileReader;
7 import java.io.IOException;
8 import java.util.ArrayList;
9 import java.util.Collections;
10 import java.util.HashMap;
11 import java.util.TreeSet;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14
15 import javax.vecmath.Point3d;
16
17 public class PdbfilePdb extends Pdb {
18
19 private static final String UNKNOWN_STRING ="XXXX";
20 private static final String NULL_chainCode = "A";
21
22 private String pdbfile;
23
24 /**
25 * Constructs an empty Pdb object given a pdbfile name
26 * Data will be loaded from pdb file upon call of load(pdbChainCode, modelSerial)
27 * @param pdbfile
28 */
29 public PdbfilePdb (String pdbfile) {
30 this.pdbfile = pdbfile;
31 this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file
32 this.dataLoaded = false;
33
34 this.sequence=""; // we initialise it to empty string, then is set in read_pdb_data_from_file
35
36 // we initialise the secondary structure to empty, if no sec structure info is found then they remain empty
37 this.secondaryStructure = new SecondaryStructure();
38
39 }
40
41 public void load(String pdbChainCode, int modelSerial) throws PdbLoadError {
42 try {
43 this.model=modelSerial;
44 this.pdbChainCode=pdbChainCode; // NOTE! pdb chain codes are case sensitive!
45 // we set chainCode to pdbChainCode except for case NULL where we use "A"
46 this.chainCode=pdbChainCode;
47 if (pdbChainCode.equals(Pdb.NULL_CHAIN_CODE)) this.chainCode=NULL_chainCode;
48
49 read_pdb_data_from_file();
50
51 this.obsLength = resser2restype.size();
52
53 if(!secondaryStructure.isEmpty()) {
54 secondaryStructure.setComment("Author");
55 }
56
57 // when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know
58 //TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used
59 this.resser2pdbresser = new HashMap<Integer, String>();
60 this.pdbresser2resser = new HashMap<String, Integer>();
61 for (int resser:resser2restype.keySet()){
62 resser2pdbresser.put(resser, String.valueOf(resser));
63 pdbresser2resser.put(String.valueOf(resser), resser);
64 }
65
66 // initialising atomser2atom from resser_atom2atomserial
67 atomser2atom = new HashMap<Integer, String>();
68 for (String resser_atom:resser_atom2atomserial.keySet()){
69 int atomserial = resser_atom2atomserial.get(resser_atom);
70 String atom = resser_atom.split("_")[1];
71 atomser2atom.put(atomserial,atom);
72 }
73
74 dataLoaded = true;
75
76 } catch (FileNotFoundException e) {
77 throw new PdbLoadError(e);
78 } catch (PdbfileFormatError e) {
79 throw new PdbLoadError(e);
80 } catch (IOException e) {
81 throw new PdbLoadError(e);
82 } catch (PdbChainCodeNotFoundError e) {
83 throw new PdbLoadError(e);
84 }
85 }
86
87 public String[] getChains() throws PdbLoadError {
88 TreeSet<String> chains = new TreeSet<String>();
89 try {
90 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
91 String line;
92 while ((line=fpdb.readLine())!=null) {
93 if (line.startsWith("ATOM")) {
94 String chain = line.substring(21, 22);
95 if (chain.equals(" ")) chain="NULL";
96 chains.add(chain);
97 }
98 }
99 fpdb.close();
100 } catch (IOException e) {
101 throw new PdbLoadError(e);
102 }
103
104 if (chains.isEmpty()) return null;
105
106 String[] chainsArray = new String[chains.size()];
107 chains.toArray(chainsArray);
108 return chainsArray;
109 }
110
111 public Integer[] getModels() throws PdbLoadError {
112 TreeSet<Integer> models = new TreeSet<Integer>();
113 try {
114 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
115 String line;
116 while ((line=fpdb.readLine())!=null) {
117 if (line.startsWith("MODEL")) {
118 int model = Integer.parseInt(line.substring(6,line.length()));
119 models.add(model);
120 }
121 }
122 fpdb.close();
123 } catch (IOException e) {
124 throw new PdbLoadError(e);
125 } catch (NumberFormatException e) {
126 throw new PdbLoadError("Wrong format for MODEL lines!");
127 }
128
129 if (models.isEmpty()) models.add(DEFAULT_MODEL);//return null;
130 Integer[] modelsArray = new Integer[models.size()];
131 models.toArray(modelsArray);
132 return modelsArray;
133 }
134
135 /**
136 * To read the pdb data (atom coordinates, residue serials, atom serials) from file.
137 * chainCode gets set to same as pdbChainCode, except if input chain code NULL then chainCode will be 'A'
138 * pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found
139 * sequence gets set to the sequence read from ATOM lines (i.e. observed residues only)
140 * No insertion codes are parsed or taken into account at the moment. Thus files with
141 * insertion codes will be incorrectly read
142 * @param pdbfile
143 * @throws FileNotFoundException
144 * @throws IOException
145 * @throws PdbfileFormatError
146 * @throws PdbChainCodeNotFoundError
147 */
148 private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
149 resser_atom2atomserial = new HashMap<String,Integer>();
150 resser2restype = new HashMap<Integer,String>();
151 atomser2coord = new HashMap<Integer,Point3d>();
152 atomser2resser = new HashMap<Integer,Integer>();
153 Pattern p;
154 Matcher m;
155 boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model
156 // we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
157 String chainCodeStr=pdbChainCode;
158 if (pdbChainCode.equals(Pdb.NULL_CHAIN_CODE)) chainCodeStr=" ";
159
160 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
161 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
162 int linecount=0;
163 String line;
164 // read first line
165 if((line = fpdb.readLine()) != null ) {
166 linecount = 1;
167 // HEADER
168 p = Pattern.compile("^HEADER");
169 m = p.matcher(line);
170 if (m.find()){
171 Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
172 Matcher mh = ph.matcher(line);
173 if (mh.find()) {
174 pdbCode=mh.group(1).toLowerCase();
175 }
176 } else { // header not found
177 // check whether this is a Casp prediction file
178 p = Pattern.compile("^PFRMAT\\s+TS");
179 m = p.matcher(line);
180 if(m.find()) {
181 // ok, it is
182 pdbCode = "CASP";
183 } else {
184 // a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception
185 throw new PdbfileFormatError("The pdb file "+pdbfile+" does not have a HEADER record");
186 }
187 }
188 } else {
189 throw new PdbfileFormatError("The file "+pdbfile+" is empty.");
190 }
191 // read further lines
192 while ((line = fpdb.readLine() ) != null ) {
193 linecount++;
194 // SEQRES
195 //SEQRES 1 A 348 VAL ASN ILE LYS THR ASN PRO PHE LYS ALA VAL SER PHE
196 p = Pattern.compile("^SEQRES.{5}"+chainCodeStr);
197 m = p.matcher(line);
198 if (m.find()){
199 for (int i=19;i<=67;i+=4) {
200 if (!line.substring(i, i+3).equals(" ")) {
201 if (AAinfo.isValidAA(line.substring(i, i+3))) { // for non-standard aas
202 sequence+= AAinfo.threeletter2oneletter(line.substring(i, i+3));
203 } else {
204 sequence+= NONSTANDARD_AA_LETTER;
205 }
206 }
207 }
208 }
209 // SECONDARY STRUCTURE
210 // helix
211 //HELIX 1 1 LYS A 17 LEU A 26 1
212 // helix ser beg res ser end res ser
213 p = Pattern.compile("^HELIX..(...).{9}"+chainCodeStr+".(....).{6}"+chainCodeStr+".(....)");
214 m = p.matcher(line);
215 if (m.find()){
216 int serial = Integer.valueOf(m.group(1).trim());
217 int beg = Integer.valueOf(m.group(2).trim());
218 int end = Integer.valueOf(m.group(3).trim());
219 String ssId = new Character(SecStrucElement.HELIX).toString()+serial;
220 SecStrucElement ssElem = new SecStrucElement(SecStrucElement.HELIX,beg,end,ssId);
221 secondaryStructure.add(ssElem);
222 }
223 // sheet
224 //SHEET 2 A 5 ILE A 96 THR A 99 -1 N LYS A 98 O THR A 107
225 // strand ser sheet id beg res ser end res ser
226 p = Pattern.compile("^SHEET..(...).(...).{7}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
227 m = p.matcher(line);
228 if (m.find()){
229 int strandSerial = Integer.valueOf(m.group(1).trim());
230 String sheetId = m.group(2).trim();
231 int beg = Integer.valueOf(m.group(3).trim());
232 int end = Integer.valueOf(m.group(4).trim());
233 String ssId = new Character(SecStrucElement.STRAND).toString()+sheetId+strandSerial;
234 SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND,beg,end,ssId);
235 secondaryStructure.add(ssElem);
236 }
237 // we've stored the sec structure info in the strands2begEnd and sheets2strands maps.
238 // the assignment to resser2secstruct is done when we reach the ATOM lines, see below
239 // turn
240 //TURN 1 S1A GLY A 16 GLN A 18 SURFACE
241 // turn ser beg res ser end res ser
242 p = Pattern.compile("^TURN...(...).{9}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
243 m = p.matcher(line);
244 if (m.find()){
245 int serial = Integer.valueOf(m.group(1).trim());
246 int beg = Integer.valueOf(m.group(2).trim());
247 int end = Integer.valueOf(m.group(3).trim());
248 String ssId = new Character(SecStrucElement.TURN).toString()+serial;
249 SecStrucElement ssElem = new SecStrucElement(SecStrucElement.TURN,beg,end,ssId);
250 secondaryStructure.add(ssElem);
251 }
252 // MODEL
253 p = Pattern.compile("^MODEL\\s+(\\d+)");
254 m = p.matcher(line);
255 if (m.find()){
256 thismodel=Integer.parseInt(m.group(1));
257 }
258 if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
259 // ATOM
260 p = Pattern.compile("^ATOM");
261 m = p.matcher(line);
262 if (m.find()){
263 // serial atom res_type chain res_ser x y z
264 Pattern pl = Pattern.compile("^.{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
265 Matcher ml = pl.matcher(line);
266 if (ml.find()) {
267 empty=false;
268 int atomserial=Integer.parseInt(ml.group(1).trim());
269 String atom = ml.group(2).trim();
270 String res_type = ml.group(3).trim();
271 int res_serial = Integer.parseInt(ml.group(4).trim());
272 double x = Double.parseDouble(ml.group(5).trim());
273 double y = Double.parseDouble(ml.group(6).trim());
274 double z = Double.parseDouble(ml.group(7).trim());
275 Point3d coords = new Point3d(x,y,z);
276 if (AAinfo.isValidAA(res_type)) {
277 atomser2coord.put(atomserial, coords);
278 atomser2resser.put(atomserial, res_serial);
279 resser2restype.put(res_serial, res_type);
280 if (AAinfo.isValidAtomWithOXT(res_type,atom)){
281 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
282 }
283 }
284 }
285 }
286 }
287 fpdb.close();
288 if (empty) {
289 //System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model);
290 throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model);
291 }
292 if (sequence.equals("")){
293 // if we couldn't read anything from SEQRES then we read it from the resser2restype HashMap
294 // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
295 ArrayList<Integer> ressers = new ArrayList<Integer>();
296 for (int resser:resser2restype.keySet()) {
297 ressers.add(resser);
298 }
299 Collections.sort(ressers);
300 for (int resser:ressers){
301 String oneletter = AAinfo.threeletter2oneletter(resser2restype.get(resser));
302 sequence += oneletter;
303 }
304 // not size but maximum: if residue numbering in pdb file is correct, then this takes care of non-observed except for possible non-observed at end of chain
305 fullLength = Collections.max(resser2restype.keySet());
306 } else { // we read the sequence from SEQRES
307 if( sequence.length() < Collections.max(resser2restype.keySet())) {
308 throw new PdbfileFormatError("Last residue serial in ATOM lines is bigger than SEQRES length!");
309 }
310 fullLength = sequence.length();
311 }
312 }
313
314 }