1 |
package proteinstructure; |
2 |
|
3 |
import java.io.File; |
4 |
import java.io.FileInputStream; |
5 |
import java.io.FileOutputStream; |
6 |
import java.io.InputStream; |
7 |
import java.io.RandomAccessFile; |
8 |
|
9 |
import java.io.IOException; |
10 |
import java.net.URL; |
11 |
import java.net.URLConnection; |
12 |
import java.util.ArrayList; |
13 |
import java.util.Collections; |
14 |
import java.util.HashMap; |
15 |
import java.util.TreeMap; |
16 |
import java.util.TreeSet; |
17 |
import java.util.regex.Matcher; |
18 |
import java.util.regex.Pattern; |
19 |
import java.util.zip.GZIPInputStream; |
20 |
|
21 |
import javax.vecmath.Point3d; |
22 |
|
23 |
|
24 |
/** |
25 |
* A single chain pdb protein structure loaded from an mmCIF file or downloaded from the PDB FTP site |
26 |
* |
27 |
* @author Jose Duarte |
28 |
*/ |
29 |
public class CiffilePdb extends Pdb { |
30 |
|
31 |
/*------------------------------ constants ------------------------------*/ |
32 |
public static final String PDB_FTP_URL = "ftp://ftp.wwpdb.org/pub/pdb/data/structures/all/mmCIF/"; |
33 |
public static final String CIF_FILE_EXTENSION = ".cif.gz"; |
34 |
|
35 |
/*--------------------------- member variables --------------------------*/ |
36 |
|
37 |
// input file |
38 |
private File cifFile; |
39 |
|
40 |
// fields we will read |
41 |
private static final String entryId = "_entry"; |
42 |
private static final String atomSiteId = "_atom_site"; |
43 |
private static final String atomSitesAltId = "_atom_sites_alt"; |
44 |
private static final String pdbxPolySeqId = "_pdbx_poly_seq_scheme"; |
45 |
private static final String structConfId = "_struct_conf"; |
46 |
private static final String structSheetId = "_struct_sheet_range"; |
47 |
private static final String[] ids = {entryId,atomSitesAltId,atomSiteId,pdbxPolySeqId,structConfId,structSheetId}; |
48 |
|
49 |
private TreeMap<String,Integer> ids2elements; // map of ids to element serials |
50 |
private TreeMap<String,String> fields2values; // map of field names (id.field) to values (for non-loop elements) |
51 |
private TreeMap<String,Integer> fields2indices; // map of field names (id.field) to index (for loop elements) |
52 |
private TreeMap<String,Integer> ids2fieldsIdx; // map of element ids to field index counter (after parseCifFile method done it contains the total number of fields per element id) |
53 |
private TreeSet<Integer> loopElements; // contains list of elements that are of loop type |
54 |
private TreeMap<Integer,Long[]> loopelements2contentOffset; // begin and end line index of each loop element |
55 |
|
56 |
private String altLoc; |
57 |
|
58 |
private RandomAccessFile fcif; |
59 |
|
60 |
/*----------------------------- constructors ----------------------------*/ |
61 |
|
62 |
/** |
63 |
* Constructs Pdb object from online PDB given pdb code and pdb chain code. |
64 |
* The DEFAULT_MODEL (see superclass) and default PDB_FTP_URL are used. |
65 |
* @param pdbCode |
66 |
* @param pdbChainCode |
67 |
* @throws PdbChainCodeNotFoundError |
68 |
* @throws IOException |
69 |
* @throws CiffileFormatError |
70 |
*/ |
71 |
public CiffilePdb(String pdbCode, String pdbChainCode) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError { |
72 |
this(pdbCode, pdbChainCode, DEFAULT_MODEL, PDB_FTP_URL); |
73 |
} |
74 |
|
75 |
/** |
76 |
* Constructs Pdb object from online PDB given pdb code, pdb chain code and model serial. |
77 |
* The default PDB_FTP_URL is used. |
78 |
* @param pdbCode |
79 |
* @param pdbChainCode |
80 |
* @param model_serial |
81 |
* @throws PdbChainCodeNotFoundError |
82 |
* @throws IOException |
83 |
* @throws CiffileFormatError |
84 |
*/ |
85 |
public CiffilePdb(String pdbCode, String pdbChainCode, int model_serial) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError { |
86 |
this(pdbCode, pdbChainCode, model_serial, PDB_FTP_URL); |
87 |
} |
88 |
|
89 |
/** |
90 |
* Constructs Pdb object from online PDB given pdb code, pdb chain code and pdbFtpUrl. |
91 |
* Model will be DEFAULT_MODEL (see superclass). |
92 |
* @param pdbCode |
93 |
* @param pdbChainCode |
94 |
* @param pdbFtpUrl |
95 |
* @throws PdbChainCodeNotFoundError |
96 |
* @throws IOException |
97 |
* @throws CiffileFormatError |
98 |
*/ |
99 |
public CiffilePdb (String pdbCode, String pdbChainCode, String pdbFtpUrl) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError { |
100 |
this(pdbCode, pdbChainCode, DEFAULT_MODEL, pdbFtpUrl); |
101 |
} |
102 |
|
103 |
/** |
104 |
* Constructs Pdb object from online PDB given pdb code, pdb chain code, model serial and pdbFtpUrl |
105 |
* @param pdbCode |
106 |
* @param pdbChainCode |
107 |
* @param model_serial |
108 |
* @param pdbFtpUrl |
109 |
* @throws PdbChainCodeNotFoundError |
110 |
* @throws IOException |
111 |
* @throws CiffileFormatError |
112 |
*/ |
113 |
public CiffilePdb (String pdbCode, String pdbChainCode, int model_serial, String pdbFtpUrl) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError { |
114 |
String tempDir = System.getProperty("java.io.tmpdir"); // TODO: read from FTP directly |
115 |
String gzCifFileName = pdbCode+CIF_FILE_EXTENSION; |
116 |
File gzCifFile = new File(tempDir,gzCifFileName); |
117 |
gzCifFile.deleteOnExit(); |
118 |
this.cifFile = new File(tempDir,pdbCode + ".cif"); |
119 |
this.cifFile.deleteOnExit(); |
120 |
|
121 |
// getting gzipped cif file from ftp |
122 |
URL url = new URL(pdbFtpUrl+gzCifFileName); |
123 |
URLConnection urlc = url.openConnection(); |
124 |
InputStream is = urlc.getInputStream(); |
125 |
FileOutputStream os = new FileOutputStream(gzCifFile); |
126 |
int b; |
127 |
while ( (b=is.read())!=-1) { |
128 |
os.write(b); |
129 |
} |
130 |
is.close(); |
131 |
os.close(); |
132 |
|
133 |
// unzipping downloaded file |
134 |
GZIPInputStream zis = new GZIPInputStream(new FileInputStream(gzCifFile)); |
135 |
os = new FileOutputStream(cifFile); |
136 |
while ( (b=zis.read())!=-1) { |
137 |
os.write(b); |
138 |
} |
139 |
zis.close(); |
140 |
os.close(); |
141 |
|
142 |
// here we would like to call the constructor this(ciffile, pdbChainCode, model_serial); which does not work, so we use copy/paste: |
143 |
|
144 |
// load from temp file |
145 |
this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case |
146 |
this.model=model_serial; |
147 |
|
148 |
fcif = new RandomAccessFile(cifFile,"r"); |
149 |
parseCifFile(); |
150 |
fcif.close(); |
151 |
|
152 |
this.fullLength = sequence.length(); |
153 |
this.obsLength = resser2restype.size(); |
154 |
if(!secondaryStructure.isEmpty()) { |
155 |
secondaryStructure.setComment("CIFfile"); |
156 |
} |
157 |
|
158 |
// we initialise resser2pdbresser from the pdbresser2resser HashMap |
159 |
this.resser2pdbresser = new HashMap<Integer, String>(); |
160 |
for (String pdbresser:pdbresser2resser.keySet()){ |
161 |
resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser); |
162 |
} |
163 |
|
164 |
// initialising atomser2atom from resser_atom2atomserial |
165 |
atomser2atom = new HashMap<Integer, String>(); |
166 |
for (String resser_atom:resser_atom2atomserial.keySet()){ |
167 |
int atomserial = resser_atom2atomserial.get(resser_atom); |
168 |
String atom = resser_atom.split("_")[1]; |
169 |
atomser2atom.put(atomserial,atom); |
170 |
} |
171 |
} |
172 |
|
173 |
/** |
174 |
* Constructs Pdb object given cif file and pdb chain code. |
175 |
* Model will be DEFAULT_MODEL |
176 |
* @param ciffile |
177 |
* @param pdbChainCode |
178 |
* @throws PdbChainCodeNotFoundError |
179 |
* @throws IOException |
180 |
* @throws CiffileFormatError |
181 |
*/ |
182 |
public CiffilePdb (File ciffile, String pdbChainCode) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError { |
183 |
this(ciffile, pdbChainCode, DEFAULT_MODEL); |
184 |
} |
185 |
|
186 |
/** |
187 |
* Constructs Pdb object given cif file, pdb chain code and model serial |
188 |
* @param ciffile |
189 |
* @param pdbChainCode |
190 |
* @param model_serial |
191 |
* @throws PdbChainCodeNotFoundError |
192 |
* @throws IOException |
193 |
* @throws CiffileFormatError |
194 |
*/ |
195 |
public CiffilePdb (File ciffile, String pdbChainCode, int model_serial) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError { |
196 |
this.cifFile = ciffile; |
197 |
this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case |
198 |
this.model=model_serial; |
199 |
|
200 |
fcif = new RandomAccessFile(cifFile,"r"); |
201 |
parseCifFile(); |
202 |
fcif.close(); |
203 |
|
204 |
this.fullLength = sequence.length(); |
205 |
this.obsLength = resser2restype.size(); |
206 |
if(!secondaryStructure.isEmpty()) { |
207 |
secondaryStructure.setComment("CIFfile"); |
208 |
} |
209 |
|
210 |
// we initialise resser2pdbresser from the pdbresser2resser HashMap |
211 |
this.resser2pdbresser = new HashMap<Integer, String>(); |
212 |
for (String pdbresser:pdbresser2resser.keySet()){ |
213 |
resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser); |
214 |
} |
215 |
|
216 |
// initialising atomser2atom from resser_atom2atomserial |
217 |
atomser2atom = new HashMap<Integer, String>(); |
218 |
for (String resser_atom:resser_atom2atomserial.keySet()){ |
219 |
int atomserial = resser_atom2atomserial.get(resser_atom); |
220 |
String atom = resser_atom.split("_")[1]; |
221 |
atomser2atom.put(atomserial,atom); |
222 |
} |
223 |
} |
224 |
|
225 |
/*---------------------------- private methods --------------------------*/ |
226 |
|
227 |
private void parseCifFile() throws IOException, CiffileFormatError, PdbChainCodeNotFoundError{ |
228 |
// data structures to store the parsed fields |
229 |
ids2elements = new TreeMap<String, Integer>(); |
230 |
fields2indices = new TreeMap<String,Integer>(); |
231 |
fields2values = new TreeMap<String, String>(); |
232 |
loopElements = new TreeSet<Integer>(); // contains list of elements that are of loop type |
233 |
loopelements2contentOffset = new TreeMap<Integer,Long[]>(); |
234 |
ids2fieldsIdx = new TreeMap<String,Integer>(); // this map holds the field index counters for each element id |
235 |
|
236 |
int element = 0; |
237 |
String line; |
238 |
line = fcif.readLine(); // read first line |
239 |
Pattern p = Pattern.compile("^data_\\d\\w\\w\\w"); |
240 |
if (!p.matcher(line).find()){ |
241 |
throw new CiffileFormatError("The file doesn't seem to be a cif file"); |
242 |
} |
243 |
int linecount = 1; // we have read one line already, we initialise count to 1 |
244 |
// we need to store the last line's byte offset (which indicates the beginning of this line) |
245 |
long lastLineOffset=fcif.getFilePointer(); |
246 |
while((line = fcif.readLine()) != null ) { |
247 |
long currentOffset = fcif.getFilePointer(); //this gets byte offset at end of line |
248 |
linecount++; |
249 |
if (line.startsWith("#")) { |
250 |
element++; |
251 |
continue; |
252 |
} |
253 |
if (line.startsWith("loop_")) { |
254 |
loopElements.add(element); |
255 |
continue; |
256 |
} |
257 |
|
258 |
for (String id:ids){ |
259 |
if (!ids2fieldsIdx.containsKey(id)) ids2fieldsIdx.put(id,0); |
260 |
p = Pattern.compile("^"+id+"\\.(\\w+)(?:\\s+(.*))?$"); |
261 |
Matcher m = p.matcher(line); |
262 |
if (m.find()){ |
263 |
ids2elements.put(id,element); |
264 |
String field = id + "." + m.group(1); |
265 |
if (!loopElements.contains(element)) { // if not a loop element |
266 |
fields2values.put(field, m.group(2)); // 2nd capture group only matches for non-loops where the value of the field is in same line as field name |
267 |
} else { // for loop elements we fill the fields2indices TreeMap |
268 |
fields2indices.put(field,ids2fieldsIdx.get(id)); |
269 |
} |
270 |
ids2fieldsIdx.put(id,ids2fieldsIdx.get(id)+1); |
271 |
continue; |
272 |
} |
273 |
} |
274 |
if (!line.startsWith("_") && !line.startsWith("#")){ // not in field definition, we are in values of a loop element |
275 |
if (ids2elements.containsValue(element)) { // if this is one of the fields we want to parse (members of String[] ids) |
276 |
if (!loopelements2contentOffset.containsKey(element)) { |
277 |
//loopelements2content.put(element,line+"\n"); |
278 |
Long[] interval = {lastLineOffset, currentOffset}; |
279 |
loopelements2contentOffset.put(element,interval); |
280 |
} else { |
281 |
//loopelements2content.put(element,loopelements2content.get(element)+line+"\n"); |
282 |
loopelements2contentOffset.get(element)[1]=currentOffset; |
283 |
} |
284 |
} |
285 |
} |
286 |
lastLineOffset = currentOffset; //we store this line's offset to have it for next iteration |
287 |
} // end scanning lines |
288 |
|
289 |
// now reading separate elements separately using private methods |
290 |
// the order in the elements in the file is not guaranteed, that's why (among other reasons) we have to use RandomAccessFile |
291 |
this.pdbCode = readPdbCode(); |
292 |
readAtomAltLocs(); // sets altLoc String (needed in readAtomSite to get the right alt atom locations) |
293 |
readPdbxPolySeq(); // sets chainCode, sequence, pdbresser2resser |
294 |
readAtomSite(); // populates resser_atom2atomserial, resser2restype, atomser2coord, atomser2resser |
295 |
secondaryStructure = new SecondaryStructure(); // create empty secondary structure first to make sure object is not null |
296 |
readSecStructure(); // populates secondaryStructure |
297 |
|
298 |
} |
299 |
|
300 |
private String readPdbCode(){ |
301 |
return fields2values.get(entryId+".id").trim(); |
302 |
} |
303 |
|
304 |
private void readAtomAltLocs() throws IOException, CiffileFormatError { |
305 |
// The read of the atom_sites_alt element must be done previously to scanning the atom_site element |
306 |
// This is because the order of the different elements in the cif files is not guaranteed, so atom_sites_alt can come before or after atom_site |
307 |
// (and altLoc needs to be set before starting reading the atom_site element) |
308 |
|
309 |
ArrayList<String> altLocs = new ArrayList<String>(); |
310 |
// we initialise to ".", this is the default value in the cif files for the alt loc field. If no atom_sites_alt is present it's ok to stay with this value |
311 |
altLoc = "."; |
312 |
|
313 |
// atom_sites_alt element is optional |
314 |
Long[] intAtomSitesAlt = null; |
315 |
if (ids2elements.containsKey(atomSitesAltId)){ |
316 |
intAtomSitesAlt = loopelements2contentOffset.get(ids2elements.get(atomSitesAltId)); |
317 |
} |
318 |
|
319 |
int recordCount = 0; |
320 |
// atom_sites_alt (optional element) |
321 |
if (intAtomSitesAlt!=null) { |
322 |
|
323 |
fcif.seek(intAtomSitesAlt[0]); |
324 |
while(fcif.getFilePointer()<intAtomSitesAlt[1]) { |
325 |
recordCount++; |
326 |
|
327 |
int idIdx = fields2indices.get(atomSitesAltId+".id"); |
328 |
// id=0 |
329 |
// A ? |
330 |
int numberFields = ids2fieldsIdx.get(atomSitesAltId); |
331 |
String[] tokens = tokeniseFields(numberFields); |
332 |
if (tokens.length!=numberFields) { |
333 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSitesAltId); |
334 |
} |
335 |
if (!tokens[idIdx].equals(".")) { |
336 |
altLocs.add(tokens[idIdx]); |
337 |
} |
338 |
} |
339 |
} |
340 |
if (!altLocs.isEmpty()){ |
341 |
altLoc = Collections.min(altLocs); |
342 |
} |
343 |
} |
344 |
|
345 |
private void readAtomSite() throws IOException, PdbChainCodeNotFoundError, CiffileFormatError { |
346 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
347 |
resser2restype = new HashMap<Integer,String>(); |
348 |
atomser2coord = new HashMap<Integer,Point3d>(); |
349 |
atomser2resser = new HashMap<Integer,Integer>(); |
350 |
|
351 |
Long[] intAtomSite = loopelements2contentOffset.get(ids2elements.get(atomSiteId)); |
352 |
|
353 |
boolean empty = true; |
354 |
int recordCount = 0; |
355 |
|
356 |
fcif.seek(intAtomSite[0]); |
357 |
while(fcif.getFilePointer()<intAtomSite[1]) { |
358 |
recordCount++; |
359 |
|
360 |
int groupPdbIdx = fields2indices.get(atomSiteId+".group_PDB"); |
361 |
int idIdx = fields2indices.get(atomSiteId+".id"); |
362 |
int labelAtomIdIdx = fields2indices.get(atomSiteId+".label_atom_id"); |
363 |
int labelAltIdIdx = fields2indices.get(atomSiteId+".label_alt_id"); |
364 |
int labelCompIdIdx = fields2indices.get(atomSiteId+".label_comp_id"); |
365 |
int labelAsymIdIdx = fields2indices.get(atomSiteId+".label_asym_id"); |
366 |
int labelSeqIdIdx = fields2indices.get(atomSiteId+".label_seq_id"); |
367 |
int cartnXIdx = fields2indices.get(atomSiteId+".Cartn_x"); |
368 |
int cartnYIdx = fields2indices.get(atomSiteId+".Cartn_y"); |
369 |
int cartnZIdx = fields2indices.get(atomSiteId+".Cartn_z"); |
370 |
int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num"); |
371 |
// group_PDB=0, auth_asym_id=22, pdbx_PDB_model_num=24, label_alt_id=4, id=1, label_atom_id=3, label_comp_id=5, label_asym_id=6, label_seq_id=8, Cartn_x=10, Cartn_y=11, Cartn_z=12 |
372 |
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 151617181920 2122 23 24 |
373 |
//ATOM 2 C CA . MET A 1 1 ? 38.591 8.543 15.660 1.00 77.79 ? ? ? ? ? 1 MET A CA 1 |
374 |
int numberFields = ids2fieldsIdx.get(atomSiteId); |
375 |
String[] tokens = tokeniseFields(numberFields); |
376 |
if (tokens.length!=numberFields) { |
377 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSiteId); |
378 |
} |
379 |
if (tokens[groupPdbIdx].equals("ATOM") && tokens[labelAsymIdIdx].equals(chainCode) && Integer.parseInt(tokens[pdbxPDBModelNumIdx])==model) { // match our given chain and model |
380 |
empty = false; |
381 |
if (tokens[labelAltIdIdx].equals(".") || tokens[labelAltIdIdx].equals(altLoc)) { // don't read lines with something else as "." or altLoc |
382 |
int atomserial=Integer.parseInt(tokens[idIdx]); // id |
383 |
String atom = tokens[labelAtomIdIdx]; // label_atom_id |
384 |
String res_type = tokens[labelCompIdIdx]; // label_comp_id |
385 |
int res_serial = Integer.parseInt(tokens[labelSeqIdIdx]); // label_seq_id |
386 |
double x = Double.parseDouble(tokens[cartnXIdx]); // Cartn_x |
387 |
double y = Double.parseDouble(tokens[cartnYIdx]); // Cartn_y |
388 |
double z = Double.parseDouble(tokens[cartnZIdx]); // Cartn_z |
389 |
Point3d coords = new Point3d(x,y,z); |
390 |
if (AAinfo.isValidAA(res_type)) { |
391 |
atomser2coord.put(atomserial, coords); |
392 |
atomser2resser.put(atomserial, res_serial); |
393 |
resser2restype.put(res_serial, res_type); |
394 |
if (AAinfo.isValidAtomWithOXT(res_type,atom)){ |
395 |
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
396 |
} |
397 |
} |
398 |
} |
399 |
} |
400 |
} |
401 |
if (empty) { // no atom data was found for given pdb chain code and model |
402 |
throw new PdbChainCodeNotFoundError("Couldn't find _atom_site data for given pdbChainCode: "+pdbChainCode+", model: "+model); |
403 |
} |
404 |
} |
405 |
|
406 |
private void readPdbxPolySeq() throws IOException, CiffileFormatError { |
407 |
pdbresser2resser = new HashMap<String, Integer>(); |
408 |
sequence = ""; |
409 |
|
410 |
String chainCodeStr=pdbChainCode; |
411 |
if (pdbChainCode.equals(Pdb.NULL_CHAIN_CODE)) chainCodeStr="A"; |
412 |
|
413 |
Long[] intPdbxPoly = loopelements2contentOffset.get(ids2elements.get(pdbxPolySeqId)); |
414 |
|
415 |
int recordCount=0; |
416 |
|
417 |
fcif.seek(intPdbxPoly[0]); |
418 |
while(fcif.getFilePointer()<intPdbxPoly[1]) { |
419 |
recordCount++; |
420 |
|
421 |
int asymIdIdx = fields2indices.get(pdbxPolySeqId+".asym_id"); |
422 |
int seqIdIdx = fields2indices.get(pdbxPolySeqId+".seq_id"); |
423 |
int authSeqNumIdx = fields2indices.get(pdbxPolySeqId+".auth_seq_num"); |
424 |
int pdbInsCodeIdx = fields2indices.get(pdbxPolySeqId+".pdb_ins_code"); |
425 |
int monIdIdx = fields2indices.get(pdbxPolySeqId+".mon_id"); |
426 |
int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id"); |
427 |
// asym_id=0, seq_id=2, auth_seq_num=6, pdb_ins_code=10, mon_id=3 |
428 |
// 0 1 2 3 4 5 6 7 8 910 |
429 |
// A 1 1 ASP 1 1 1 ASP ASP A . |
430 |
int numberFields = ids2fieldsIdx.get(pdbxPolySeqId); |
431 |
String[] tokens = tokeniseFields(numberFields); |
432 |
if (tokens.length!=numberFields) { |
433 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+pdbxPolySeqId); |
434 |
} |
435 |
if (tokens[pdbStrandIdIdx].equals(chainCodeStr)) { // we can't rely on using chainCode, because the order of elements is not guranteed (pdbx_poly_seq_scheme doesn't always come after atom_site) |
436 |
int res_serial = Integer.parseInt(tokens[seqIdIdx]); // seq_id |
437 |
chainCode = tokens[asymIdIdx]; |
438 |
//TODO revise: do we want auth_seq_num or pdb_seq_num here?? |
439 |
String pdb_res_serial = tokens[authSeqNumIdx]; // auth_seq_num |
440 |
String pdb_ins_code = tokens[pdbInsCodeIdx]; // pdb_ins_code |
441 |
String pdb_res_serial_with_icode = pdb_res_serial; |
442 |
if (!pdb_ins_code.equals(".")) { |
443 |
pdb_res_serial_with_icode=pdb_res_serial+pdb_ins_code; |
444 |
} |
445 |
String res_type = tokens[monIdIdx]; // mon_id |
446 |
// sequence |
447 |
if (AAinfo.isValidAA(res_type)){ |
448 |
sequence+=AAinfo.threeletter2oneletter(res_type); |
449 |
} else { |
450 |
sequence+=NONSTANDARD_AA_LETTER; |
451 |
} |
452 |
// pdbresser2resser |
453 |
if (!pdb_res_serial_with_icode.startsWith("?")) { // question marks are author missing serials, we don't want them in the map |
454 |
pdbresser2resser.put(pdb_res_serial_with_icode,res_serial); |
455 |
} |
456 |
} |
457 |
} |
458 |
} |
459 |
|
460 |
private void readSecStructure() throws IOException, CiffileFormatError { |
461 |
secondaryStructure = new SecondaryStructure(); |
462 |
|
463 |
// struct_conf element is optional |
464 |
Long[] intStructConf = null; |
465 |
if (ids2elements.containsKey(structConfId)) { |
466 |
// if not a loop element then intStructConf stays null (because loopelements2contentIndex will return null) |
467 |
intStructConf = loopelements2contentOffset.get(ids2elements.get(structConfId)); |
468 |
} |
469 |
// taking care of cases where struct_conf is not a loop element but a one value field |
470 |
if (ids2elements.containsKey(structConfId) && !loopElements.contains(ids2elements.get(structConfId))){ |
471 |
String begChainCode = fields2values.get(structConfId+".beg_label_asym_id").trim(); |
472 |
if (begChainCode.equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme |
473 |
String id = fields2values.get(structConfId+".id").trim(); |
474 |
int beg = Integer.parseInt(fields2values.get(structConfId+".beg_label_seq_id").trim()); |
475 |
int end = Integer.parseInt(fields2values.get(structConfId+".end_label_seq_id").trim()); |
476 |
Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$"); |
477 |
Matcher m = p.matcher(id); |
478 |
String ssId="Unknown"; |
479 |
if (m.find()){ |
480 |
ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns) |
481 |
} |
482 |
char ssType = SecStrucElement.OTHER; |
483 |
if(id.startsWith("H")) { |
484 |
ssType = SecStrucElement.HELIX; |
485 |
} else if(id.startsWith("T")) { |
486 |
ssType = SecStrucElement.TURN; |
487 |
} else { |
488 |
System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping."); |
489 |
} |
490 |
if(ssType != SecStrucElement.OTHER) { |
491 |
SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId); |
492 |
secondaryStructure.add(ssElem); |
493 |
} |
494 |
} |
495 |
} |
496 |
// struct_sheet_range element is optional |
497 |
Long[] intStructSheet = null; |
498 |
if (ids2elements.containsKey(structSheetId)) { |
499 |
// if not a loop element intStructSheet stays null (because loopelements2contentIndex will return null) |
500 |
intStructSheet = loopelements2contentOffset.get(ids2elements.get(structSheetId)); |
501 |
} |
502 |
// taking care of cases where struct_sheet_range is not a loop element but a one value field |
503 |
if (ids2elements.containsKey(structSheetId) && !loopElements.contains(ids2elements.get(structSheetId))){ |
504 |
String begChainCode = fields2values.get(structSheetId+".beg_label_asym_id").trim(); |
505 |
if (begChainCode.equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme |
506 |
String sheetid = fields2values.get(structSheetId+".sheet_id").trim(); //tokens[sheetIdIdx]; |
507 |
int id = Integer.parseInt(fields2values.get(structSheetId+".id").trim()); //Integer.parseInt(tokens[idIdx]); |
508 |
int beg = Integer.parseInt(fields2values.get(structSheetId+".beg_label_seq_id").trim()); //tokens[begLabelSeqIdIdx]); |
509 |
int end = Integer.parseInt(fields2values.get(structSheetId+".end_label_seq_id").trim()); //tokens[endLabelSeqIdIdx]); |
510 |
String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,... |
511 |
SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId); |
512 |
secondaryStructure.add(ssElem); |
513 |
} |
514 |
|
515 |
} |
516 |
|
517 |
if (intStructConf!=null) { |
518 |
int recordCount=0; |
519 |
|
520 |
fcif.seek(intStructConf[0]); |
521 |
while(fcif.getFilePointer()<intStructConf[1]) { |
522 |
recordCount++; |
523 |
// struct_conf (optional element), HELIX and TURN secondary structure |
524 |
|
525 |
int idIdx = fields2indices.get(structConfId+".id"); |
526 |
int begLabelAsymIdIdx = fields2indices.get(structConfId+".beg_label_asym_id"); |
527 |
int begLabelSeqIdIdx = fields2indices.get(structConfId+".beg_label_seq_id"); |
528 |
int endLabelSeqIdIdx = fields2indices.get(structConfId+".end_label_seq_id"); |
529 |
//id=1, beg_label_seq_id=5, end_label_seq_id=9, beg_label_asym_id=4 |
530 |
// 0 1 2 3 4 5 6 7 8 9 10 111213 1415 16 1718 19 |
531 |
//HELX_P HELX_P1 1 ASN A 2 ? GLY A 12 ? ASN A 2 GLY A 12 1 ? 11 |
532 |
int numFields = ids2fieldsIdx.get(structConfId); |
533 |
String[] tokens = tokeniseFields(numFields); |
534 |
if (tokens.length!=numFields) { |
535 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+structConfId); |
536 |
} |
537 |
if (tokens[begLabelAsymIdIdx].equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme |
538 |
String id = tokens[idIdx]; |
539 |
Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$"); |
540 |
Matcher m = p.matcher(id); |
541 |
String ssId="Unknown"; |
542 |
if (m.find()){ |
543 |
ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns) |
544 |
} |
545 |
int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]); |
546 |
int end = Integer.parseInt(tokens[endLabelSeqIdIdx]); |
547 |
char ssType = SecStrucElement.OTHER; |
548 |
if(id.startsWith("H")) { |
549 |
ssType = SecStrucElement.HELIX; |
550 |
} else if(id.startsWith("T")) { |
551 |
ssType = SecStrucElement.TURN; |
552 |
} else { |
553 |
System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping."); |
554 |
} |
555 |
if(ssType != SecStrucElement.OTHER) { |
556 |
SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId); |
557 |
secondaryStructure.add(ssElem); |
558 |
} |
559 |
} |
560 |
} |
561 |
} |
562 |
if (intStructSheet!=null) { |
563 |
int recordCount=0; |
564 |
|
565 |
fcif.seek(intStructSheet[0]); |
566 |
while(fcif.getFilePointer()<intStructSheet[1]) { |
567 |
recordCount++; |
568 |
// struct_sheet_range (optional element), SHEETs |
569 |
int sheetIdIdx = fields2indices.get(structSheetId+".sheet_id"); |
570 |
int idIdx = fields2indices.get(structSheetId+".id"); |
571 |
int begLabelAsymIdIdx = fields2indices.get(structSheetId+".beg_label_asym_id"); |
572 |
int begLabelSeqIdIdx = fields2indices.get(structSheetId+".beg_label_seq_id"); |
573 |
int endLabelSeqIdIdx = fields2indices.get(structSheetId+".end_label_seq_id"); |
574 |
//sheet_id=0, id=1, beg_label_seq_id=4, end_label_seq_id=8, beg_label_asym_id=3 |
575 |
//0 1 2 3 4 5 6 7 8 910 1112 13 1415 16 |
576 |
//A 1 ARG A 14 ? LYS A 19 ? ? ARG A 14 LYS A 19 |
577 |
int numFields = ids2fieldsIdx.get(structSheetId); |
578 |
String[] tokens = tokeniseFields(numFields); |
579 |
if (tokens.length!=numFields) { |
580 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+structSheetId); |
581 |
} |
582 |
if (tokens[begLabelAsymIdIdx].equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme |
583 |
String sheetid = tokens[sheetIdIdx]; |
584 |
int id = Integer.parseInt(tokens[idIdx]); |
585 |
int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]); |
586 |
int end = Integer.parseInt(tokens[endLabelSeqIdIdx]); |
587 |
String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,... |
588 |
SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId); |
589 |
secondaryStructure.add(ssElem); |
590 |
} |
591 |
} |
592 |
} |
593 |
} |
594 |
|
595 |
/** |
596 |
* Splits a space separated line into its individual tokens returning an array with all tokens |
597 |
* Takes care of all particularities of the format of a record in the ciffiles: |
598 |
* - fields within records are separated by spaces |
599 |
* - spaces can be used within quoted strings (at the moment this only supports single quotes, not double) |
600 |
* - free style with all characters allowed if something is quoted with \n; ;\n |
601 |
* The java class StreamTokenizer could have done all this, but it was limited to do all that we needed to do |
602 |
* |
603 |
* |
604 |
* This method is black magic. I don't even try to understand as I write it. |
605 |
* If you need to come back to this and read it, good luck!! |
606 |
* |
607 |
* @param numberTokens |
608 |
* @return |
609 |
*/ |
610 |
private String[] tokeniseFields(int numberTokens) throws IOException { |
611 |
String[] tokens = new String[numberTokens]; |
612 |
// initialise tokens to empty strings |
613 |
for (int i=0; i<numberTokens;i++){ |
614 |
tokens[i]=""; |
615 |
} |
616 |
|
617 |
int i = 0; |
618 |
char lastChar=' '; |
619 |
boolean withinQuotes=false; |
620 |
while (true) { |
621 |
char currentChar = (char)fcif.readByte(); |
622 |
if ((currentChar=='\'' && lastChar==' ') || (!withinQuotes && currentChar==';' && lastChar=='\n')){ |
623 |
withinQuotes = true; |
624 |
} |
625 |
else if ((currentChar==' ' && lastChar=='\'') || (currentChar=='\n' && lastChar==';')){ |
626 |
withinQuotes = false; |
627 |
} |
628 |
|
629 |
if (!withinQuotes) { |
630 |
if (currentChar==' ' || currentChar=='\n') { |
631 |
if (currentChar!=lastChar && !(currentChar=='\n' && lastChar==' ')) i++; // we only increment when we move from a non-space to a space or from non-space to \n |
632 |
} else { |
633 |
tokens[i]+=currentChar; |
634 |
} |
635 |
} else { |
636 |
tokens[i]+=currentChar; |
637 |
// if string is surrounded by '' then strip them out (except when string is length 1 and thus beginning and end are \') |
638 |
if (tokens[i].length()!=1 && tokens[i].startsWith("'") && tokens[i].endsWith("'")) tokens[i]=tokens[i].replaceAll("'", ""); |
639 |
// if string is surrounded by ;; then strip them out including a \n before the last ; |
640 |
if (tokens[i].startsWith(";") && tokens[i].endsWith("\n;")) { |
641 |
tokens[i]=tokens[i].replaceFirst("^;", ""); |
642 |
tokens[i]=tokens[i].replaceFirst("\n;",""); |
643 |
} |
644 |
} |
645 |
|
646 |
lastChar = currentChar; |
647 |
|
648 |
if (i==numberTokens) { |
649 |
fcif.readByte();// read one more byte: the space at the end of the line that otherwise we miss |
650 |
return tokens; |
651 |
} |
652 |
} |
653 |
} |
654 |
|
655 |
} |