1 |
package proteinstructure; |
2 |
|
3 |
import java.io.File; |
4 |
import java.io.FileInputStream; |
5 |
import java.io.FileOutputStream; |
6 |
import java.io.InputStream; |
7 |
import java.io.RandomAccessFile; |
8 |
|
9 |
import java.io.IOException; |
10 |
import java.net.URL; |
11 |
import java.net.URLConnection; |
12 |
import java.util.ArrayList; |
13 |
import java.util.Collections; |
14 |
import java.util.HashMap; |
15 |
import java.util.TreeMap; |
16 |
import java.util.TreeSet; |
17 |
import java.util.regex.Matcher; |
18 |
import java.util.regex.Pattern; |
19 |
import java.util.zip.GZIPInputStream; |
20 |
|
21 |
import javax.vecmath.Point3d; |
22 |
|
23 |
|
24 |
/** |
25 |
* A single chain pdb protein structure loaded from an mmCIF file or downloaded from the PDB FTP site |
26 |
* |
27 |
* @author Jose Duarte |
28 |
*/ |
29 |
public class CiffilePdb extends Pdb { |
30 |
|
31 |
/*------------------------------ constants ------------------------------*/ |
32 |
public static final String PDB_FTP_URL = "ftp://ftp.wwpdb.org/pub/pdb/data/structures/all/mmCIF/"; |
33 |
public static final String CIF_FILE_EXTENSION = ".cif.gz"; |
34 |
|
35 |
/*--------------------------- member variables --------------------------*/ |
36 |
|
37 |
// input file |
38 |
private File cifFile; |
39 |
|
40 |
// fields we will read |
41 |
private static final String entryId = "_entry"; |
42 |
private static final String atomSiteId = "_atom_site"; |
43 |
private static final String atomSitesAltId = "_atom_sites_alt"; |
44 |
private static final String pdbxPolySeqId = "_pdbx_poly_seq_scheme"; |
45 |
private static final String structConfId = "_struct_conf"; |
46 |
private static final String structSheetId = "_struct_sheet_range"; |
47 |
private static final String[] ids = {entryId,atomSitesAltId,atomSiteId,pdbxPolySeqId,structConfId,structSheetId}; |
48 |
|
49 |
private TreeMap<String,Integer> ids2elements; // map of ids to element serials |
50 |
private TreeMap<String,String> fields2values; // map of field names (id.field) to values (for non-loop elements) |
51 |
private TreeMap<String,Integer> fields2indices; // map of field names (id.field) to index (for loop elements) |
52 |
private TreeMap<String,Integer> ids2fieldsIdx; // map of element ids to field index counter (after parseCifFile method done it contains the total number of fields per element id) |
53 |
private TreeSet<Integer> loopElements; // contains list of elements that are of loop type |
54 |
private TreeMap<Integer,Long[]> loopelements2contentOffset; // begin and end line index of each loop element |
55 |
|
56 |
private String altLoc; |
57 |
|
58 |
private RandomAccessFile fcif; |
59 |
|
60 |
private boolean fieldsTitlesRead; |
61 |
|
62 |
/*----------------------------- constructors ----------------------------*/ |
63 |
|
64 |
/**
 * Constructs an empty Pdb object for the given pdb code, taking the cif file
 * from the default location {@link #PDB_FTP_URL}.
 * The data is downloaded and stored in a local file, but it is only loaded
 * from that file when load(pdbChainCode, modelSerial) is called.
 * @param pdbCode the pdb code of the entry to download
 * @throws IOException if the file can't be downloaded or written locally
 */
public CiffilePdb(String pdbCode) throws IOException {
    this(pdbCode, PDB_FTP_URL);
}
75 |
|
76 |
/** |
77 |
* Constructs an empty Pdb object from online PDB given pdb code and pdbFtpUrl |
78 |
* Data will be downloaded an stored in local file |
79 |
* but will only be loaded from local file upon call of load(pdbChainCode, modelSerial) |
80 |
* @param pdbCode |
81 |
* @param pdbFtpUrl |
82 |
* @throws IOException |
83 |
*/ |
84 |
public CiffilePdb (String pdbCode, String pdbFtpUrl) throws IOException { |
85 |
this.dataLoaded = false; |
86 |
this.fieldsTitlesRead = false; |
87 |
|
88 |
// we store the file locally instead of reading directly from the ftp stream, so that the file can be cached locally in applications like CMView |
89 |
String tempDir = System.getProperty("java.io.tmpdir"); |
90 |
String gzCifFileName = pdbCode+CIF_FILE_EXTENSION; |
91 |
File gzCifFile = new File(tempDir,gzCifFileName); |
92 |
gzCifFile.deleteOnExit(); |
93 |
this.cifFile = new File(tempDir,pdbCode + ".cif"); |
94 |
this.cifFile.deleteOnExit(); |
95 |
|
96 |
// getting gzipped cif file from ftp |
97 |
URL url = new URL(pdbFtpUrl+gzCifFileName); |
98 |
URLConnection urlc = url.openConnection(); |
99 |
InputStream is = urlc.getInputStream(); |
100 |
FileOutputStream os = new FileOutputStream(gzCifFile); |
101 |
int b; |
102 |
while ( (b=is.read())!=-1) { |
103 |
os.write(b); |
104 |
} |
105 |
is.close(); |
106 |
os.close(); |
107 |
|
108 |
// unzipping downloaded file |
109 |
GZIPInputStream zis = new GZIPInputStream(new FileInputStream(gzCifFile)); |
110 |
os = new FileOutputStream(cifFile); |
111 |
while ( (b=zis.read())!=-1) { |
112 |
os.write(b); |
113 |
} |
114 |
zis.close(); |
115 |
os.close(); |
116 |
|
117 |
} |
118 |
|
119 |
/**
 * Constructs an empty Pdb object that will read from the given cif file.
 * Nothing is read here: the data is loaded from the file upon call of
 * load(pdbChainCode, modelSerial).
 * @param ciffile the cif file to read from
 */
public CiffilePdb (File ciffile) {
    this.fieldsTitlesRead = false;
    this.dataLoaded = false;
    this.cifFile = ciffile;
}
130 |
|
131 |
public File getCifFile() { |
132 |
return cifFile; |
133 |
} |
134 |
|
135 |
public void load(String pdbChainCode, int modelSerial) throws PdbLoadError{ |
136 |
try { |
137 |
this.model = modelSerial; |
138 |
this.pdbChainCode=pdbChainCode; // NOTE! pdb chain codes are case sensitive |
139 |
fcif = new RandomAccessFile(cifFile,"r"); |
140 |
parseCifFile(); |
141 |
fcif.close(); |
142 |
|
143 |
this.fullLength = sequence.length(); |
144 |
this.obsLength = resser2restype.size(); |
145 |
if(!secondaryStructure.isEmpty()) { |
146 |
secondaryStructure.setComment("CIFfile"); |
147 |
} |
148 |
|
149 |
// we initialise resser2pdbresser from the pdbresser2resser HashMap |
150 |
this.resser2pdbresser = new HashMap<Integer, String>(); |
151 |
for (String pdbresser:pdbresser2resser.keySet()){ |
152 |
resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser); |
153 |
} |
154 |
|
155 |
// initialising atomser2atom from resser_atom2atomserial |
156 |
atomser2atom = new HashMap<Integer, String>(); |
157 |
for (String resser_atom:resser_atom2atomserial.keySet()){ |
158 |
int atomserial = resser_atom2atomserial.get(resser_atom); |
159 |
String atom = resser_atom.split("_")[1]; |
160 |
atomser2atom.put(atomserial,atom); |
161 |
} |
162 |
|
163 |
dataLoaded = true; |
164 |
|
165 |
} catch (CiffileFormatError e) { |
166 |
throw new PdbLoadError(e); |
167 |
} catch (IOException e) { |
168 |
throw new PdbLoadError(e); |
169 |
} catch (PdbChainCodeNotFoundError e) { |
170 |
throw new PdbLoadError(e); |
171 |
} |
172 |
|
173 |
} |
174 |
|
175 |
public String[] getChains() throws PdbLoadError { |
176 |
TreeSet<String> chains = new TreeSet<String>(); |
177 |
|
178 |
try { |
179 |
fcif = new RandomAccessFile(cifFile,"r"); |
180 |
if (!fieldsTitlesRead) { |
181 |
readFieldsTitles(); |
182 |
} |
183 |
Long[] intPdbxPoly = loopelements2contentOffset.get(ids2elements.get(pdbxPolySeqId)); |
184 |
|
185 |
int recordCount=0; |
186 |
|
187 |
fcif.seek(intPdbxPoly[0]); |
188 |
while(fcif.getFilePointer()<intPdbxPoly[1]) { |
189 |
recordCount++; |
190 |
|
191 |
int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id"); |
192 |
int numberFields = ids2fieldsIdx.get(pdbxPolySeqId); |
193 |
String[] tokens = tokeniseFields(numberFields); |
194 |
if (tokens.length!=numberFields) { |
195 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+pdbxPolySeqId); |
196 |
} |
197 |
chains.add(tokens[pdbStrandIdIdx]); |
198 |
} |
199 |
fcif.close(); |
200 |
|
201 |
} catch (IOException e) { |
202 |
throw new PdbLoadError(e); |
203 |
} catch (CiffileFormatError e) { |
204 |
throw new PdbLoadError(e); |
205 |
} |
206 |
|
207 |
if (chains.isEmpty()) return null; |
208 |
|
209 |
String[] chainsArray = new String[chains.size()]; |
210 |
chains.toArray(chainsArray); |
211 |
return chainsArray; |
212 |
} |
213 |
|
214 |
public Integer[] getModels() throws PdbLoadError { |
215 |
TreeSet<Integer> models = new TreeSet<Integer>(); |
216 |
try { |
217 |
fcif = new RandomAccessFile(cifFile,"r"); |
218 |
if (!fieldsTitlesRead) { |
219 |
readFieldsTitles(); |
220 |
} |
221 |
Long[] intAtomSite = loopelements2contentOffset.get(ids2elements.get(atomSiteId)); |
222 |
|
223 |
int recordCount=0; |
224 |
|
225 |
fcif.seek(intAtomSite[0]); |
226 |
while(fcif.getFilePointer()<intAtomSite[1]) { |
227 |
recordCount++; |
228 |
|
229 |
int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num"); |
230 |
int numberFields = ids2fieldsIdx.get(atomSiteId); |
231 |
String[] tokens = tokeniseFields(numberFields); |
232 |
if (tokens.length!=numberFields) { |
233 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSiteId); |
234 |
} |
235 |
models.add(Integer.parseInt(tokens[pdbxPDBModelNumIdx])); |
236 |
} |
237 |
fcif.close(); |
238 |
|
239 |
} catch (IOException e) { |
240 |
throw new PdbLoadError(e); |
241 |
} catch (CiffileFormatError e) { |
242 |
throw new PdbLoadError(e); |
243 |
} |
244 |
|
245 |
if (models.isEmpty()) return null; |
246 |
|
247 |
Integer[] modelsArray = new Integer[models.size()]; |
248 |
models.toArray(modelsArray); |
249 |
return modelsArray; |
250 |
} |
251 |
|
252 |
/*---------------------------- private methods --------------------------*/ |
253 |
|
254 |
private void parseCifFile() throws IOException, CiffileFormatError, PdbChainCodeNotFoundError{ |
255 |
|
256 |
if (!fieldsTitlesRead) { |
257 |
readFieldsTitles(); |
258 |
} |
259 |
// now reading separate elements separately using private methods |
260 |
// the order in the elements in the file is not guaranteed, that's why (among other reasons) we have to use RandomAccessFile |
261 |
this.pdbCode = readPdbCode(); |
262 |
readAtomAltLocs(); // sets altLoc String (needed in readAtomSite to get the right alt atom locations) |
263 |
readPdbxPolySeq(); // sets chainCode, sequence, pdbresser2resser |
264 |
readAtomSite(); // populates resser_atom2atomserial, resser2restype, atomser2coord, atomser2resser |
265 |
secondaryStructure = new SecondaryStructure(this.sequence); // create empty secondary structure first to make sure object is not null |
266 |
readSecStructure(); // populates secondaryStructure |
267 |
|
268 |
} |
269 |
|
270 |
private void readFieldsTitles() throws IOException, CiffileFormatError { |
271 |
// data structures to store the parsed fields |
272 |
ids2elements = new TreeMap<String, Integer>(); |
273 |
fields2indices = new TreeMap<String,Integer>(); |
274 |
fields2values = new TreeMap<String, String>(); |
275 |
loopElements = new TreeSet<Integer>(); // contains list of elements that are of loop type |
276 |
loopelements2contentOffset = new TreeMap<Integer,Long[]>(); |
277 |
ids2fieldsIdx = new TreeMap<String,Integer>(); // this map holds the field index counters for each element id |
278 |
|
279 |
int element = 0; |
280 |
String line; |
281 |
line = fcif.readLine(); // read first line |
282 |
Pattern p = Pattern.compile("^data_\\d\\w\\w\\w"); |
283 |
if (!p.matcher(line).find()){ |
284 |
throw new CiffileFormatError("The file doesn't seem to be a cif file"); |
285 |
} |
286 |
int linecount = 1; // we have read one line already, we initialise count to 1 |
287 |
// we need to store the last line's byte offset (which indicates the beginning of this line) |
288 |
long lastLineOffset=fcif.getFilePointer(); |
289 |
while((line = fcif.readLine()) != null ) { |
290 |
long currentOffset = fcif.getFilePointer(); //this gets byte offset at end of line |
291 |
linecount++; |
292 |
if (line.startsWith("#")) { |
293 |
element++; |
294 |
continue; |
295 |
} |
296 |
if (line.startsWith("loop_")) { |
297 |
loopElements.add(element); |
298 |
continue; |
299 |
} |
300 |
|
301 |
for (String id:ids){ |
302 |
if (!ids2fieldsIdx.containsKey(id)) ids2fieldsIdx.put(id,0); |
303 |
p = Pattern.compile("^"+id+"\\.(\\w+)(?:\\s+(.*))?$"); |
304 |
Matcher m = p.matcher(line); |
305 |
if (m.find()){ |
306 |
ids2elements.put(id,element); |
307 |
String field = id + "." + m.group(1); |
308 |
if (!loopElements.contains(element)) { // if not a loop element |
309 |
fields2values.put(field, m.group(2)); // 2nd capture group only matches for non-loops where the value of the field is in same line as field name |
310 |
} else { // for loop elements we fill the fields2indices TreeMap |
311 |
fields2indices.put(field,ids2fieldsIdx.get(id)); |
312 |
} |
313 |
ids2fieldsIdx.put(id,ids2fieldsIdx.get(id)+1); |
314 |
continue; |
315 |
} |
316 |
} |
317 |
if (!line.startsWith("_") && !line.startsWith("#")){ // not in field definition, we are in values of a loop element |
318 |
if (ids2elements.containsValue(element)) { // if this is one of the fields we want to parse (members of String[] ids) |
319 |
if (!loopelements2contentOffset.containsKey(element)) { |
320 |
//loopelements2content.put(element,line+"\n"); |
321 |
Long[] interval = {lastLineOffset, currentOffset}; |
322 |
loopelements2contentOffset.put(element,interval); |
323 |
} else { |
324 |
//loopelements2content.put(element,loopelements2content.get(element)+line+"\n"); |
325 |
loopelements2contentOffset.get(element)[1]=currentOffset; |
326 |
} |
327 |
} |
328 |
} |
329 |
lastLineOffset = currentOffset; //we store this line's offset to have it for next iteration |
330 |
} // end scanning lines |
331 |
|
332 |
fieldsTitlesRead = true; |
333 |
} |
334 |
|
335 |
private String readPdbCode(){ |
336 |
return fields2values.get(entryId+".id").trim().toLowerCase(); |
337 |
} |
338 |
|
339 |
private void readAtomAltLocs() throws IOException, CiffileFormatError { |
340 |
// The read of the atom_sites_alt element must be done previously to scanning the atom_site element |
341 |
// This is because the order of the different elements in the cif files is not guaranteed, so atom_sites_alt can come before or after atom_site |
342 |
// (and altLoc needs to be set before starting reading the atom_site element) |
343 |
|
344 |
ArrayList<String> altLocs = new ArrayList<String>(); |
345 |
// we initialise to ".", this is the default value in the cif files for the alt loc field. If no atom_sites_alt is present it's ok to stay with this value |
346 |
altLoc = "."; |
347 |
|
348 |
// atom_sites_alt element is optional |
349 |
Long[] intAtomSitesAlt = null; |
350 |
if (ids2elements.containsKey(atomSitesAltId)){ |
351 |
intAtomSitesAlt = loopelements2contentOffset.get(ids2elements.get(atomSitesAltId)); |
352 |
} |
353 |
|
354 |
int recordCount = 0; |
355 |
// atom_sites_alt (optional element) |
356 |
if (intAtomSitesAlt!=null) { |
357 |
|
358 |
fcif.seek(intAtomSitesAlt[0]); |
359 |
while(fcif.getFilePointer()<intAtomSitesAlt[1]) { |
360 |
recordCount++; |
361 |
|
362 |
int idIdx = fields2indices.get(atomSitesAltId+".id"); |
363 |
// id=0 |
364 |
// A ? |
365 |
int numberFields = ids2fieldsIdx.get(atomSitesAltId); |
366 |
String[] tokens = tokeniseFields(numberFields); |
367 |
if (tokens.length!=numberFields) { |
368 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSitesAltId); |
369 |
} |
370 |
if (!tokens[idIdx].equals(".")) { |
371 |
altLocs.add(tokens[idIdx]); |
372 |
} |
373 |
} |
374 |
} |
375 |
if (!altLocs.isEmpty()){ |
376 |
altLoc = Collections.min(altLocs); |
377 |
} |
378 |
} |
379 |
|
380 |
private void readAtomSite() throws IOException, PdbChainCodeNotFoundError, CiffileFormatError { |
381 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
382 |
resser2restype = new HashMap<Integer,String>(); |
383 |
atomser2coord = new HashMap<Integer,Point3d>(); |
384 |
atomser2resser = new HashMap<Integer,Integer>(); |
385 |
|
386 |
Long[] intAtomSite = loopelements2contentOffset.get(ids2elements.get(atomSiteId)); |
387 |
|
388 |
boolean empty = true; |
389 |
int recordCount = 0; |
390 |
|
391 |
fcif.seek(intAtomSite[0]); |
392 |
while(fcif.getFilePointer()<intAtomSite[1]) { |
393 |
recordCount++; |
394 |
|
395 |
int groupPdbIdx = fields2indices.get(atomSiteId+".group_PDB"); |
396 |
int idIdx = fields2indices.get(atomSiteId+".id"); |
397 |
int labelAtomIdIdx = fields2indices.get(atomSiteId+".label_atom_id"); |
398 |
int labelAltIdIdx = fields2indices.get(atomSiteId+".label_alt_id"); |
399 |
int labelCompIdIdx = fields2indices.get(atomSiteId+".label_comp_id"); |
400 |
int labelAsymIdIdx = fields2indices.get(atomSiteId+".label_asym_id"); |
401 |
int labelSeqIdIdx = fields2indices.get(atomSiteId+".label_seq_id"); |
402 |
int cartnXIdx = fields2indices.get(atomSiteId+".Cartn_x"); |
403 |
int cartnYIdx = fields2indices.get(atomSiteId+".Cartn_y"); |
404 |
int cartnZIdx = fields2indices.get(atomSiteId+".Cartn_z"); |
405 |
int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num"); |
406 |
// group_PDB=0, auth_asym_id=22, pdbx_PDB_model_num=24, label_alt_id=4, id=1, label_atom_id=3, label_comp_id=5, label_asym_id=6, label_seq_id=8, Cartn_x=10, Cartn_y=11, Cartn_z=12 |
407 |
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 151617181920 2122 23 24 |
408 |
//ATOM 2 C CA . MET A 1 1 ? 38.591 8.543 15.660 1.00 77.79 ? ? ? ? ? 1 MET A CA 1 |
409 |
int numberFields = ids2fieldsIdx.get(atomSiteId); |
410 |
String[] tokens = tokeniseFields(numberFields); |
411 |
if (tokens.length!=numberFields) { |
412 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSiteId); |
413 |
} |
414 |
if (tokens[groupPdbIdx].equals("ATOM") && tokens[labelAsymIdIdx].equals(chainCode) && Integer.parseInt(tokens[pdbxPDBModelNumIdx])==model) { // match our given chain and model |
415 |
empty = false; |
416 |
if (tokens[labelAltIdIdx].equals(".") || tokens[labelAltIdIdx].equals(altLoc)) { // don't read lines with something else as "." or altLoc |
417 |
int atomserial=Integer.parseInt(tokens[idIdx]); // id |
418 |
String atom = tokens[labelAtomIdIdx]; // label_atom_id |
419 |
String res_type = tokens[labelCompIdIdx]; // label_comp_id |
420 |
int res_serial = Integer.parseInt(tokens[labelSeqIdIdx]); // label_seq_id |
421 |
double x = Double.parseDouble(tokens[cartnXIdx]); // Cartn_x |
422 |
double y = Double.parseDouble(tokens[cartnYIdx]); // Cartn_y |
423 |
double z = Double.parseDouble(tokens[cartnZIdx]); // Cartn_z |
424 |
Point3d coords = new Point3d(x,y,z); |
425 |
if (AAinfo.isValidAA(res_type)) { |
426 |
atomser2coord.put(atomserial, coords); |
427 |
atomser2resser.put(atomserial, res_serial); |
428 |
resser2restype.put(res_serial, res_type); |
429 |
if (AAinfo.isValidAtomWithOXT(res_type,atom)){ |
430 |
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
431 |
} |
432 |
} |
433 |
} |
434 |
} |
435 |
} |
436 |
if (empty) { // no atom data was found for given pdb chain code and model |
437 |
throw new PdbChainCodeNotFoundError("Couldn't find _atom_site data for given pdbChainCode: "+pdbChainCode+", model: "+model); |
438 |
} |
439 |
} |
440 |
|
441 |
private void readPdbxPolySeq() throws IOException, CiffileFormatError { |
442 |
pdbresser2resser = new HashMap<String, Integer>(); |
443 |
sequence = ""; |
444 |
|
445 |
String chainCodeStr=pdbChainCode; |
446 |
if (pdbChainCode.equals(Pdb.NULL_CHAIN_CODE)) chainCodeStr="A"; |
447 |
|
448 |
Long[] intPdbxPoly = loopelements2contentOffset.get(ids2elements.get(pdbxPolySeqId)); |
449 |
|
450 |
int recordCount=0; |
451 |
|
452 |
fcif.seek(intPdbxPoly[0]); |
453 |
while(fcif.getFilePointer()<intPdbxPoly[1]) { |
454 |
recordCount++; |
455 |
|
456 |
int asymIdIdx = fields2indices.get(pdbxPolySeqId+".asym_id"); |
457 |
int seqIdIdx = fields2indices.get(pdbxPolySeqId+".seq_id"); |
458 |
//int authSeqNumIdx = fields2indices.get(pdbxPolySeqId+".auth_seq_num"); |
459 |
int pdbSeqNumIdx = fields2indices.get(pdbxPolySeqId+".pdb_seq_num"); |
460 |
int pdbInsCodeIdx = fields2indices.get(pdbxPolySeqId+".pdb_ins_code"); |
461 |
int monIdIdx = fields2indices.get(pdbxPolySeqId+".mon_id"); |
462 |
int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id"); |
463 |
// asym_id=0, seq_id=2, auth_seq_num=6, pdb_ins_code=10, mon_id=3 |
464 |
// 0 1 2 3 4 5 6 7 8 910 |
465 |
// A 1 1 ASP 1 1 1 ASP ASP A . |
466 |
int numberFields = ids2fieldsIdx.get(pdbxPolySeqId); |
467 |
String[] tokens = tokeniseFields(numberFields); |
468 |
if (tokens.length!=numberFields) { |
469 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+pdbxPolySeqId); |
470 |
} |
471 |
if (tokens[pdbStrandIdIdx].equals(chainCodeStr)) { // we can't rely on using chainCode, because the order of elements is not guranteed (pdbx_poly_seq_scheme doesn't always come after atom_site) |
472 |
int res_serial = Integer.parseInt(tokens[seqIdIdx]); // seq_id |
473 |
chainCode = tokens[asymIdIdx]; |
474 |
String pdb_res_serial = tokens[pdbSeqNumIdx]; // pdb_seq_num |
475 |
String pdb_ins_code = tokens[pdbInsCodeIdx]; // pdb_ins_code |
476 |
String pdb_res_serial_with_icode = pdb_res_serial; |
477 |
if (!pdb_ins_code.equals(".")) { |
478 |
pdb_res_serial_with_icode=pdb_res_serial+pdb_ins_code; |
479 |
} |
480 |
String res_type = tokens[monIdIdx]; // mon_id |
481 |
// sequence |
482 |
if (AAinfo.isValidAA(res_type)){ |
483 |
sequence+=AAinfo.threeletter2oneletter(res_type); |
484 |
} else { |
485 |
sequence+=AAinfo.NONSTANDARD_AA_ONE_LETTER; |
486 |
} |
487 |
// pdbresser2resser |
488 |
pdbresser2resser.put(pdb_res_serial_with_icode,res_serial); |
489 |
|
490 |
} |
491 |
} |
492 |
} |
493 |
|
494 |
private void readSecStructure() throws IOException, CiffileFormatError { |
495 |
secondaryStructure = new SecondaryStructure(this.sequence); |
496 |
|
497 |
// struct_conf element is optional |
498 |
Long[] intStructConf = null; |
499 |
if (ids2elements.containsKey(structConfId)) { |
500 |
// if not a loop element then intStructConf stays null (because loopelements2contentIndex will return null) |
501 |
intStructConf = loopelements2contentOffset.get(ids2elements.get(structConfId)); |
502 |
} |
503 |
// taking care of cases where struct_conf is not a loop element but a one value field |
504 |
if (ids2elements.containsKey(structConfId) && !loopElements.contains(ids2elements.get(structConfId))){ |
505 |
String begChainCode = fields2values.get(structConfId+".beg_label_asym_id").trim(); |
506 |
if (begChainCode.equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme |
507 |
String id = fields2values.get(structConfId+".id").trim(); |
508 |
int beg = Integer.parseInt(fields2values.get(structConfId+".beg_label_seq_id").trim()); |
509 |
int end = Integer.parseInt(fields2values.get(structConfId+".end_label_seq_id").trim()); |
510 |
Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$"); |
511 |
Matcher m = p.matcher(id); |
512 |
String ssId="Unknown"; |
513 |
if (m.find()){ |
514 |
ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns) |
515 |
} |
516 |
char ssType = SecStrucElement.OTHER; |
517 |
if(id.startsWith("H")) { |
518 |
ssType = SecStrucElement.HELIX; |
519 |
} else if(id.startsWith("T")) { |
520 |
ssType = SecStrucElement.TURN; |
521 |
} else { |
522 |
System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping."); |
523 |
} |
524 |
if(ssType != SecStrucElement.OTHER) { |
525 |
SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId); |
526 |
secondaryStructure.add(ssElem); |
527 |
} |
528 |
} |
529 |
} |
530 |
// struct_sheet_range element is optional |
531 |
Long[] intStructSheet = null; |
532 |
if (ids2elements.containsKey(structSheetId)) { |
533 |
// if not a loop element intStructSheet stays null (because loopelements2contentIndex will return null) |
534 |
intStructSheet = loopelements2contentOffset.get(ids2elements.get(structSheetId)); |
535 |
} |
536 |
// taking care of cases where struct_sheet_range is not a loop element but a one value field |
537 |
if (ids2elements.containsKey(structSheetId) && !loopElements.contains(ids2elements.get(structSheetId))){ |
538 |
String begChainCode = fields2values.get(structSheetId+".beg_label_asym_id").trim(); |
539 |
if (begChainCode.equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme |
540 |
String sheetid = fields2values.get(structSheetId+".sheet_id").trim(); //tokens[sheetIdIdx]; |
541 |
int id = Integer.parseInt(fields2values.get(structSheetId+".id").trim()); //Integer.parseInt(tokens[idIdx]); |
542 |
int beg = Integer.parseInt(fields2values.get(structSheetId+".beg_label_seq_id").trim()); //tokens[begLabelSeqIdIdx]); |
543 |
int end = Integer.parseInt(fields2values.get(structSheetId+".end_label_seq_id").trim()); //tokens[endLabelSeqIdIdx]); |
544 |
String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,... |
545 |
SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId); |
546 |
secondaryStructure.add(ssElem); |
547 |
} |
548 |
|
549 |
} |
550 |
|
551 |
if (intStructConf!=null) { |
552 |
int recordCount=0; |
553 |
|
554 |
fcif.seek(intStructConf[0]); |
555 |
while(fcif.getFilePointer()<intStructConf[1]) { |
556 |
recordCount++; |
557 |
// struct_conf (optional element), HELIX and TURN secondary structure |
558 |
|
559 |
int idIdx = fields2indices.get(structConfId+".id"); |
560 |
int begLabelAsymIdIdx = fields2indices.get(structConfId+".beg_label_asym_id"); |
561 |
int begLabelSeqIdIdx = fields2indices.get(structConfId+".beg_label_seq_id"); |
562 |
int endLabelSeqIdIdx = fields2indices.get(structConfId+".end_label_seq_id"); |
563 |
//id=1, beg_label_seq_id=5, end_label_seq_id=9, beg_label_asym_id=4 |
564 |
// 0 1 2 3 4 5 6 7 8 9 10 111213 1415 16 1718 19 |
565 |
//HELX_P HELX_P1 1 ASN A 2 ? GLY A 12 ? ASN A 2 GLY A 12 1 ? 11 |
566 |
int numFields = ids2fieldsIdx.get(structConfId); |
567 |
String[] tokens = tokeniseFields(numFields); |
568 |
if (tokens.length!=numFields) { |
569 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+structConfId); |
570 |
} |
571 |
if (tokens[begLabelAsymIdIdx].equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme |
572 |
String id = tokens[idIdx]; |
573 |
Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$"); |
574 |
Matcher m = p.matcher(id); |
575 |
String ssId="Unknown"; |
576 |
if (m.find()){ |
577 |
ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns) |
578 |
} |
579 |
int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]); |
580 |
int end = Integer.parseInt(tokens[endLabelSeqIdIdx]); |
581 |
char ssType = SecStrucElement.OTHER; |
582 |
if(id.startsWith("H")) { |
583 |
ssType = SecStrucElement.HELIX; |
584 |
} else if(id.startsWith("T")) { |
585 |
ssType = SecStrucElement.TURN; |
586 |
} else { |
587 |
System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping."); |
588 |
} |
589 |
if(ssType != SecStrucElement.OTHER) { |
590 |
SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId); |
591 |
secondaryStructure.add(ssElem); |
592 |
} |
593 |
} |
594 |
} |
595 |
} |
596 |
if (intStructSheet!=null) { |
597 |
int recordCount=0; |
598 |
|
599 |
fcif.seek(intStructSheet[0]); |
600 |
while(fcif.getFilePointer()<intStructSheet[1]) { |
601 |
recordCount++; |
602 |
// struct_sheet_range (optional element), SHEETs |
603 |
int sheetIdIdx = fields2indices.get(structSheetId+".sheet_id"); |
604 |
int idIdx = fields2indices.get(structSheetId+".id"); |
605 |
int begLabelAsymIdIdx = fields2indices.get(structSheetId+".beg_label_asym_id"); |
606 |
int begLabelSeqIdIdx = fields2indices.get(structSheetId+".beg_label_seq_id"); |
607 |
int endLabelSeqIdIdx = fields2indices.get(structSheetId+".end_label_seq_id"); |
608 |
//sheet_id=0, id=1, beg_label_seq_id=4, end_label_seq_id=8, beg_label_asym_id=3 |
609 |
//0 1 2 3 4 5 6 7 8 910 1112 13 1415 16 |
610 |
//A 1 ARG A 14 ? LYS A 19 ? ? ARG A 14 LYS A 19 |
611 |
int numFields = ids2fieldsIdx.get(structSheetId); |
612 |
String[] tokens = tokeniseFields(numFields); |
613 |
if (tokens.length!=numFields) { |
614 |
throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+structSheetId); |
615 |
} |
616 |
if (tokens[begLabelAsymIdIdx].equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme |
617 |
String sheetid = tokens[sheetIdIdx]; |
618 |
int id = Integer.parseInt(tokens[idIdx]); |
619 |
int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]); |
620 |
int end = Integer.parseInt(tokens[endLabelSeqIdIdx]); |
621 |
String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,... |
622 |
SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId); |
623 |
secondaryStructure.add(ssElem); |
624 |
} |
625 |
} |
626 |
} |
627 |
} |
628 |
|
629 |
/** |
630 |
* Splits a space separated line into its individual tokens returning an array with all tokens |
631 |
* Takes care of all particularities of the format of a record in the ciffiles: |
632 |
* - fields within records are separated by spaces |
633 |
* - spaces can be used within quoted strings (at the moment this only supports single quotes, not double) |
634 |
* - free style with all characters allowed if something is quoted with \n; ;\n |
635 |
* The java class StreamTokenizer could have done all this, but it was limited to do all that we needed to do |
636 |
* |
637 |
* |
638 |
* This method is black magic. I don't fully understand it myself as I write it. |
639 |
* If you need to come back to this and read it, good luck!! |
640 |
* |
641 |
* @param numberTokens |
642 |
* @return |
643 |
*/ |
644 |
private String[] tokeniseFields(int numberTokens) throws IOException { |
645 |
String[] tokens = new String[numberTokens]; |
646 |
// initialise tokens to empty strings |
647 |
for (int i=0; i<numberTokens;i++){ |
648 |
tokens[i]=""; |
649 |
} |
650 |
|
651 |
int i = 0; |
652 |
char lastChar=' '; |
653 |
char quoteChar = 0; |
654 |
while (true) { |
655 |
char currentChar = (char)fcif.readByte(); |
656 |
|
657 |
// '' quoting |
658 |
if (quoteChar!=';' && currentChar=='\'' && (lastChar==' ' || lastChar=='\n')){ |
659 |
quoteChar = '\''; |
660 |
} |
661 |
else if (quoteChar!=';' && currentChar==' ' && lastChar=='\''){ |
662 |
quoteChar = 0; |
663 |
} |
664 |
// "" quoting |
665 |
if (quoteChar!=';' && currentChar=='"' && (lastChar==' ' || lastChar=='\n')){ |
666 |
quoteChar = '"'; |
667 |
} |
668 |
else if (quoteChar!=';' && currentChar==' ' && lastChar=='"'){ |
669 |
quoteChar = 0; |
670 |
} |
671 |
// ;; quoting (multi-line quoting) |
672 |
if (quoteChar!=';' && currentChar==';' && lastChar=='\n'){ |
673 |
quoteChar = ';'; |
674 |
} |
675 |
else if (quoteChar==';' && currentChar==';' && lastChar=='\n'){ |
676 |
quoteChar = 0; |
677 |
} |
678 |
|
679 |
// reading field |
680 |
if (quoteChar==0) { // not within quotes |
681 |
if (currentChar==' ' || currentChar=='\n') { |
682 |
if (currentChar!=lastChar && !(currentChar=='\n' && lastChar==' ')) i++; // we only increment when we move from a non-space to a space or from non-space to \n |
683 |
} else { |
684 |
tokens[i]+=currentChar; |
685 |
// if we are adding the last ; of a ;;-quoted string then strip the starting ';' and ending "\n;" out |
686 |
if (currentChar==';' && lastChar=='\n' && tokens[i].startsWith(";") && tokens[i].endsWith("\n;")) { |
687 |
tokens[i]=tokens[i].replaceFirst("^;", ""); |
688 |
tokens[i]=tokens[i].replaceFirst("\n;",""); |
689 |
} |
690 |
} |
691 |
} else { // within quotes (of type '', "" or ;;) |
692 |
tokens[i]+=currentChar; |
693 |
// if string is surrounded by '' or "" then strip them out (except when string is length 1 and thus beginning and end are quoteChar) |
694 |
if (tokens[i].length()!=1 && tokens[i].startsWith(Character.toString(quoteChar)) && tokens[i].endsWith(Character.toString(quoteChar))) tokens[i]=tokens[i].replaceAll(Character.toString(quoteChar), ""); |
695 |
|
696 |
} |
697 |
|
698 |
lastChar = currentChar; |
699 |
|
700 |
if (i==numberTokens) { |
701 |
// for the last record of an element it is important to have read up to the end of the line (including the '\n'), |
702 |
// otherwise the condition : "while (current_pointer<max_pointer_of_this_element)" won't work |
703 |
// we read one more character at a time: test whether it is a ' ' or a '\n', if not then we have overread so we need to rewind back |
704 |
while (true) { |
705 |
long currentPos = fcif.getFilePointer(); // get current position to rewind back to it if needed |
706 |
currentChar = (char) fcif.readByte(); |
707 |
if (currentChar!='\n' && currentChar!=' '){ |
708 |
fcif.seek(currentPos); |
709 |
break; |
710 |
} |
711 |
} |
712 |
return tokens; |
713 |
} |
714 |
} |
715 |
} |
716 |
|
717 |
} |