1 |
package proteinstructure; |
2 |
|
3 |
import java.io.BufferedReader; |
4 |
import java.io.File; |
5 |
import java.io.FileNotFoundException; |
6 |
import java.io.FileReader; |
7 |
import java.io.IOException; |
8 |
import java.util.ArrayList; |
9 |
import java.util.Collections; |
10 |
import java.util.HashMap; |
11 |
import java.util.TreeSet; |
12 |
import java.util.regex.Matcher; |
13 |
import java.util.regex.Pattern; |
14 |
|
15 |
import javax.vecmath.Point3d; |
16 |
|
17 |
public class PdbfilePdb extends Pdb { |
18 |
|
19 |
private static final String UNKNOWN_STRING ="XXXX"; |
20 |
private static final String NULL_chainCode = "A"; |
21 |
|
22 |
private String pdbfile; |
23 |
|
24 |
/** |
25 |
* Constructs an empty Pdb object given a pdbfile name |
26 |
* Data will be loaded from pdb file upon call of load(pdbChainCode, modelSerial) |
27 |
* @param pdbfile |
28 |
*/ |
29 |
public PdbfilePdb (String pdbfile) { |
30 |
this.pdbfile = pdbfile; |
31 |
this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file |
32 |
this.dataLoaded = false; |
33 |
|
34 |
this.sequence=""; // we initialise it to empty string, then is set in read_pdb_data_from_file |
35 |
|
36 |
// we initialise the secondary structure to empty, if no sec structure info is found then they remain empty |
37 |
this.secondaryStructure = new SecondaryStructure(); |
38 |
|
39 |
} |
40 |
|
41 |
public void load(String pdbChainCode, int modelSerial) throws PdbLoadError { |
42 |
try { |
43 |
this.model=modelSerial; |
44 |
this.pdbChainCode=pdbChainCode; // NOTE! pdb chain codes are case sensitive! |
45 |
// we set chainCode to pdbChainCode except for case NULL where we use "A" |
46 |
this.chainCode=pdbChainCode; |
47 |
if (pdbChainCode.equals(Pdb.NULL_CHAIN_CODE)) this.chainCode=NULL_chainCode; |
48 |
|
49 |
read_pdb_data_from_file(); |
50 |
|
51 |
this.obsLength = resser2restype.size(); |
52 |
|
53 |
if(!secondaryStructure.isEmpty()) { |
54 |
secondaryStructure.setComment("Author"); |
55 |
} |
56 |
|
57 |
// when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know |
58 |
//TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used |
59 |
this.resser2pdbresser = new HashMap<Integer, String>(); |
60 |
this.pdbresser2resser = new HashMap<String, Integer>(); |
61 |
for (int resser:resser2restype.keySet()){ |
62 |
resser2pdbresser.put(resser, String.valueOf(resser)); |
63 |
pdbresser2resser.put(String.valueOf(resser), resser); |
64 |
} |
65 |
|
66 |
// initialising atomser2atom from resser_atom2atomserial |
67 |
atomser2atom = new HashMap<Integer, String>(); |
68 |
for (String resser_atom:resser_atom2atomserial.keySet()){ |
69 |
int atomserial = resser_atom2atomserial.get(resser_atom); |
70 |
String atom = resser_atom.split("_")[1]; |
71 |
atomser2atom.put(atomserial,atom); |
72 |
} |
73 |
|
74 |
dataLoaded = true; |
75 |
|
76 |
} catch (FileNotFoundException e) { |
77 |
throw new PdbLoadError(e); |
78 |
} catch (PdbfileFormatError e) { |
79 |
throw new PdbLoadError(e); |
80 |
} catch (IOException e) { |
81 |
throw new PdbLoadError(e); |
82 |
} catch (PdbChainCodeNotFoundError e) { |
83 |
throw new PdbLoadError(e); |
84 |
} |
85 |
} |
86 |
|
87 |
public String[] getChains() throws PdbLoadError { |
88 |
TreeSet<String> chains = new TreeSet<String>(); |
89 |
try { |
90 |
BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile))); |
91 |
String line; |
92 |
while ((line=fpdb.readLine())!=null) { |
93 |
if (line.startsWith("ATOM")) { |
94 |
String chain = line.substring(21, 22); |
95 |
if (chain.equals(" ")) chain="NULL"; |
96 |
chains.add(chain); |
97 |
} |
98 |
} |
99 |
fpdb.close(); |
100 |
} catch (IOException e) { |
101 |
throw new PdbLoadError(e); |
102 |
} |
103 |
|
104 |
if (chains.isEmpty()) return null; |
105 |
|
106 |
String[] chainsArray = new String[chains.size()]; |
107 |
chains.toArray(chainsArray); |
108 |
return chainsArray; |
109 |
} |
110 |
|
111 |
public Integer[] getModels() throws PdbLoadError { |
112 |
TreeSet<Integer> models = new TreeSet<Integer>(); |
113 |
try { |
114 |
BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile))); |
115 |
String line; |
116 |
while ((line=fpdb.readLine())!=null) { |
117 |
if (line.startsWith("MODEL")) { |
118 |
int model = Integer.parseInt(line.substring(6,line.length())); |
119 |
models.add(model); |
120 |
} |
121 |
} |
122 |
fpdb.close(); |
123 |
} catch (IOException e) { |
124 |
throw new PdbLoadError(e); |
125 |
} catch (NumberFormatException e) { |
126 |
throw new PdbLoadError("Wrong format for MODEL lines!"); |
127 |
} |
128 |
|
129 |
if (models.isEmpty()) models.add(DEFAULT_MODEL);//return null; |
130 |
Integer[] modelsArray = new Integer[models.size()]; |
131 |
models.toArray(modelsArray); |
132 |
return modelsArray; |
133 |
} |
134 |
|
135 |
/** |
136 |
* To read the pdb data (atom coordinates, residue serials, atom serials) from file. |
137 |
* chainCode gets set to same as pdbChainCode, except if input chain code NULL then chainCode will be 'A' |
138 |
* pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found |
139 |
* sequence gets set to the sequence read from ATOM lines (i.e. observed residues only) |
140 |
* No insertion codes are parsed or taken into account at the moment. Thus files with |
141 |
* insertion codes will be incorrectly read |
142 |
* @param pdbfile |
143 |
* @throws FileNotFoundException |
144 |
* @throws IOException |
145 |
* @throws PdbfileFormatError |
146 |
* @throws PdbChainCodeNotFoundError |
147 |
*/ |
148 |
private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError { |
149 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
150 |
resser2restype = new HashMap<Integer,String>(); |
151 |
atomser2coord = new HashMap<Integer,Point3d>(); |
152 |
atomser2resser = new HashMap<Integer,Integer>(); |
153 |
Pattern p; |
154 |
Matcher m; |
155 |
boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model |
156 |
// we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files) |
157 |
String chainCodeStr=pdbChainCode; |
158 |
if (pdbChainCode.equals(Pdb.NULL_CHAIN_CODE)) chainCodeStr=" "; |
159 |
|
160 |
int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines |
161 |
BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile))); |
162 |
int linecount=0; |
163 |
String line; |
164 |
// read first line |
165 |
if((line = fpdb.readLine()) != null ) { |
166 |
linecount = 1; |
167 |
// HEADER |
168 |
p = Pattern.compile("^HEADER"); |
169 |
m = p.matcher(line); |
170 |
if (m.find()){ |
171 |
Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})"); |
172 |
Matcher mh = ph.matcher(line); |
173 |
if (mh.find()) { |
174 |
pdbCode=mh.group(1).toLowerCase(); |
175 |
} |
176 |
} else { // header not found |
177 |
// check whether this is a Casp prediction file |
178 |
p = Pattern.compile("^PFRMAT\\s+TS"); |
179 |
m = p.matcher(line); |
180 |
if(m.find()) { |
181 |
// ok, it is |
182 |
pdbCode = "CASP"; |
183 |
} else { |
184 |
// a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception |
185 |
throw new PdbfileFormatError("The pdb file "+pdbfile+" does not have a HEADER record"); |
186 |
} |
187 |
} |
188 |
} else { |
189 |
throw new PdbfileFormatError("The file "+pdbfile+" is empty."); |
190 |
} |
191 |
// read further lines |
192 |
while ((line = fpdb.readLine() ) != null ) { |
193 |
linecount++; |
194 |
// SEQRES |
195 |
//SEQRES 1 A 348 VAL ASN ILE LYS THR ASN PRO PHE LYS ALA VAL SER PHE |
196 |
p = Pattern.compile("^SEQRES.{5}"+chainCodeStr); |
197 |
m = p.matcher(line); |
198 |
if (m.find()){ |
199 |
for (int i=19;i<=67;i+=4) { |
200 |
if (!line.substring(i, i+3).equals(" ")) { |
201 |
if (AAinfo.isValidAA(line.substring(i, i+3))) { // for non-standard aas |
202 |
sequence+= AAinfo.threeletter2oneletter(line.substring(i, i+3)); |
203 |
} else { |
204 |
sequence+= NONSTANDARD_AA_LETTER; |
205 |
} |
206 |
} |
207 |
} |
208 |
} |
209 |
// SECONDARY STRUCTURE |
210 |
// helix |
211 |
//HELIX 1 1 LYS A 17 LEU A 26 1 |
212 |
// helix ser beg res ser end res ser |
213 |
p = Pattern.compile("^HELIX..(...).{9}"+chainCodeStr+".(....).{6}"+chainCodeStr+".(....)"); |
214 |
m = p.matcher(line); |
215 |
if (m.find()){ |
216 |
int serial = Integer.valueOf(m.group(1).trim()); |
217 |
int beg = Integer.valueOf(m.group(2).trim()); |
218 |
int end = Integer.valueOf(m.group(3).trim()); |
219 |
String ssId = new Character(SecStrucElement.HELIX).toString()+serial; |
220 |
SecStrucElement ssElem = new SecStrucElement(SecStrucElement.HELIX,beg,end,ssId); |
221 |
secondaryStructure.add(ssElem); |
222 |
} |
223 |
// sheet |
224 |
//SHEET 2 A 5 ILE A 96 THR A 99 -1 N LYS A 98 O THR A 107 |
225 |
// strand ser sheet id beg res ser end res ser |
226 |
p = Pattern.compile("^SHEET..(...).(...).{7}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)"); |
227 |
m = p.matcher(line); |
228 |
if (m.find()){ |
229 |
int strandSerial = Integer.valueOf(m.group(1).trim()); |
230 |
String sheetId = m.group(2).trim(); |
231 |
int beg = Integer.valueOf(m.group(3).trim()); |
232 |
int end = Integer.valueOf(m.group(4).trim()); |
233 |
String ssId = new Character(SecStrucElement.STRAND).toString()+sheetId+strandSerial; |
234 |
SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND,beg,end,ssId); |
235 |
secondaryStructure.add(ssElem); |
236 |
} |
237 |
// we've stored the sec structure info in the strands2begEnd and sheets2strands maps. |
238 |
// the assignment to resser2secstruct is done when we reach the ATOM lines, see below |
239 |
// turn |
240 |
//TURN 1 S1A GLY A 16 GLN A 18 SURFACE |
241 |
// turn ser beg res ser end res ser |
242 |
p = Pattern.compile("^TURN...(...).{9}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)"); |
243 |
m = p.matcher(line); |
244 |
if (m.find()){ |
245 |
int serial = Integer.valueOf(m.group(1).trim()); |
246 |
int beg = Integer.valueOf(m.group(2).trim()); |
247 |
int end = Integer.valueOf(m.group(3).trim()); |
248 |
String ssId = new Character(SecStrucElement.TURN).toString()+serial; |
249 |
SecStrucElement ssElem = new SecStrucElement(SecStrucElement.TURN,beg,end,ssId); |
250 |
secondaryStructure.add(ssElem); |
251 |
} |
252 |
// MODEL |
253 |
p = Pattern.compile("^MODEL\\s+(\\d+)"); |
254 |
m = p.matcher(line); |
255 |
if (m.find()){ |
256 |
thismodel=Integer.parseInt(m.group(1)); |
257 |
} |
258 |
if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model |
259 |
// ATOM |
260 |
p = Pattern.compile("^ATOM"); |
261 |
m = p.matcher(line); |
262 |
if (m.find()){ |
263 |
// serial atom res_type chain res_ser x y z |
264 |
Pattern pl = Pattern.compile("^.{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE); |
265 |
Matcher ml = pl.matcher(line); |
266 |
if (ml.find()) { |
267 |
empty=false; |
268 |
int atomserial=Integer.parseInt(ml.group(1).trim()); |
269 |
String atom = ml.group(2).trim(); |
270 |
String res_type = ml.group(3).trim(); |
271 |
int res_serial = Integer.parseInt(ml.group(4).trim()); |
272 |
double x = Double.parseDouble(ml.group(5).trim()); |
273 |
double y = Double.parseDouble(ml.group(6).trim()); |
274 |
double z = Double.parseDouble(ml.group(7).trim()); |
275 |
Point3d coords = new Point3d(x,y,z); |
276 |
if (AAinfo.isValidAA(res_type)) { |
277 |
atomser2coord.put(atomserial, coords); |
278 |
atomser2resser.put(atomserial, res_serial); |
279 |
resser2restype.put(res_serial, res_type); |
280 |
if (AAinfo.isValidAtomWithOXT(res_type,atom)){ |
281 |
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
282 |
} |
283 |
} |
284 |
} |
285 |
} |
286 |
} |
287 |
fpdb.close(); |
288 |
if (empty) { |
289 |
//System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model); |
290 |
throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model); |
291 |
} |
292 |
if (sequence.equals("")){ |
293 |
// if we couldn't read anything from SEQRES then we read it from the resser2restype HashMap |
294 |
// NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here! |
295 |
ArrayList<Integer> ressers = new ArrayList<Integer>(); |
296 |
for (int resser:resser2restype.keySet()) { |
297 |
ressers.add(resser); |
298 |
} |
299 |
Collections.sort(ressers); |
300 |
for (int resser:ressers){ |
301 |
String oneletter = AAinfo.threeletter2oneletter(resser2restype.get(resser)); |
302 |
sequence += oneletter; |
303 |
} |
304 |
// not size but maximum: if residue numbering in pdb file is correct, then this takes care of non-observed except for possible non-observed at end of chain |
305 |
fullLength = Collections.max(resser2restype.keySet()); |
306 |
} else { // we read the sequence from SEQRES |
307 |
if( sequence.length() < Collections.max(resser2restype.keySet())) { |
308 |
throw new PdbfileFormatError("Last residue serial in ATOM lines is bigger than SEQRES length!"); |
309 |
} |
310 |
fullLength = sequence.length(); |
311 |
} |
312 |
} |
313 |
|
314 |
} |