ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/CiffilePdb.java
Revision: 355
Committed: Fri Oct 12 18:42:37 2007 UTC (17 years ago) by duarte
File size: 28315 byte(s)
Log Message:
FIXED BUG: now doesn't fail with records that are delimited with \n; ;\n
Method tokeniseFields is now completely rewritten: it is what does all the magic of parsing all the oddities of the mmCIF format
Using RandomAccessFile to open the file only once and then seek to the positions we need to scan at each point. This might be slower because RandomAccessFile does no buffering, and maybe also because the new tokenisation is not very optimal
Now parseCifFile does the whole parsing calling also the submethods instead of calling them in the constructor
Line File contents
1 package proteinstructure;
2
3 import java.io.File;
4 import java.io.FileInputStream;
5 import java.io.FileOutputStream;
6 import java.io.InputStream;
7 import java.io.RandomAccessFile;
8
9 import java.io.IOException;
10 import java.net.URL;
11 import java.net.URLConnection;
12 import java.util.ArrayList;
13 import java.util.Collections;
14 import java.util.HashMap;
15 import java.util.TreeMap;
16 import java.util.TreeSet;
17 import java.util.regex.Matcher;
18 import java.util.regex.Pattern;
19 import java.util.zip.GZIPInputStream;
20
21 import javax.vecmath.Point3d;
22
23
24 /**
25 * A single chain pdb protein structure loaded from an mmCIF file or downloaded from the PDB FTP site
26 *
27 * @author Jose Duarte
28 */
29 public class CiffilePdb extends Pdb {
30
31 /*------------------------------ constants ------------------------------*/
32 public static final String PDB_FTP_URL = "ftp://ftp.wwpdb.org/pub/pdb/data/structures/all/mmCIF/";
33 public static final String CIF_FILE_EXTENSION = ".cif.gz";
34
35 /*--------------------------- member variables --------------------------*/
36
37 // input file
38 private File cifFile;
39
40 // fields we will read
41 private static final String entryId = "_entry";
42 private static final String atomSiteId = "_atom_site";
43 private static final String atomSitesAltId = "_atom_sites_alt";
44 private static final String pdbxPolySeqId = "_pdbx_poly_seq_scheme";
45 private static final String structConfId = "_struct_conf";
46 private static final String structSheetId = "_struct_sheet_range";
47 private static final String[] ids = {entryId,atomSitesAltId,atomSiteId,pdbxPolySeqId,structConfId,structSheetId};
48
49 private TreeMap<String,Integer> ids2elements; // map of ids to element serials
50 private TreeMap<String,String> fields2values; // map of field names (id.field) to values (for non-loop elements)
51 private TreeMap<String,Integer> fields2indices; // map of field names (id.field) to index (for loop elements)
52 private TreeMap<String,Integer> ids2fieldsIdx; // map of element ids to field index counter (after parseCifFile method done it contains the total number of fields per element id)
53 private TreeSet<Integer> loopElements; // contains list of elements that are of loop type
54 private TreeMap<Integer,Long[]> loopelements2contentOffset; // begin and end line index of each loop element
55
56 private String altLoc;
57
58 private RandomAccessFile fcif;
59
60 /*----------------------------- constructors ----------------------------*/
61
/**
 * Downloads the mmCIF file of the given pdb code from the default PDB_FTP_URL and
 * loads the given chain of it, using the DEFAULT_MODEL (see superclass).
 * @param pdbCode the pdb code, used to build the name of the file to download
 * @param pdbChainCode the pdb chain code (it is converted to upper case)
 * @throws PdbChainCodeNotFoundError if no _atom_site data is found for the given chain code and model
 * @throws IOException if downloading or reading the file fails
 * @throws CiffileFormatError if the downloaded file does not look like a cif file
 */
public CiffilePdb(String pdbCode, String pdbChainCode) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
    this(pdbCode, pdbChainCode, DEFAULT_MODEL, PDB_FTP_URL);
}
74
/**
 * Downloads the mmCIF file of the given pdb code from the default PDB_FTP_URL and
 * loads the given chain and model of it.
 * @param pdbCode the pdb code, used to build the name of the file to download
 * @param pdbChainCode the pdb chain code (it is converted to upper case)
 * @param model_serial the serial of the model to load
 * @throws PdbChainCodeNotFoundError if no _atom_site data is found for the given chain code and model
 * @throws IOException if downloading or reading the file fails
 * @throws CiffileFormatError if the downloaded file does not look like a cif file
 */
public CiffilePdb(String pdbCode, String pdbChainCode, int model_serial) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
    this(pdbCode, pdbChainCode, model_serial, PDB_FTP_URL);
}
88
/**
 * Downloads the mmCIF file of the given pdb code from the given URL and loads the
 * given chain of it, using the DEFAULT_MODEL (see superclass).
 * @param pdbCode the pdb code, used to build the name of the file to download
 * @param pdbChainCode the pdb chain code (it is converted to upper case)
 * @param pdbFtpUrl the base URL to which the file name is appended
 * @throws PdbChainCodeNotFoundError if no _atom_site data is found for the given chain code and model
 * @throws IOException if downloading or reading the file fails
 * @throws CiffileFormatError if the downloaded file does not look like a cif file
 */
public CiffilePdb (String pdbCode, String pdbChainCode, String pdbFtpUrl) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
    this(pdbCode, pdbChainCode, DEFAULT_MODEL, pdbFtpUrl);
}
102
103 /**
104 * Constructs Pdb object from online PDB given pdb code, pdb chain code, model serial and pdbFtpUrl
105 * @param pdbCode
106 * @param pdbChainCode
107 * @param model_serial
108 * @param pdbFtpUrl
109 * @throws PdbChainCodeNotFoundError
110 * @throws IOException
111 * @throws CiffileFormatError
112 */
113 public CiffilePdb (String pdbCode, String pdbChainCode, int model_serial, String pdbFtpUrl) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
114 String tempDir = System.getProperty("java.io.tmpdir"); // TODO: read from FTP directly
115 String gzCifFileName = pdbCode+CIF_FILE_EXTENSION;
116 File gzCifFile = new File(tempDir,gzCifFileName);
117 gzCifFile.deleteOnExit();
118 this.cifFile = new File(tempDir,pdbCode + ".cif");
119 this.cifFile.deleteOnExit();
120
121 // getting gzipped cif file from ftp
122 URL url = new URL(pdbFtpUrl+gzCifFileName);
123 URLConnection urlc = url.openConnection();
124 InputStream is = urlc.getInputStream();
125 FileOutputStream os = new FileOutputStream(gzCifFile);
126 int b;
127 while ( (b=is.read())!=-1) {
128 os.write(b);
129 }
130 is.close();
131 os.close();
132
133 // unzipping downloaded file
134 GZIPInputStream zis = new GZIPInputStream(new FileInputStream(gzCifFile));
135 os = new FileOutputStream(cifFile);
136 while ( (b=zis.read())!=-1) {
137 os.write(b);
138 }
139 zis.close();
140 os.close();
141
142 // here we would like to call the constructor this(ciffile, pdbChainCode, model_serial); which does not work, so we use copy/paste:
143
144 // load from temp file
145 this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
146 this.model=model_serial;
147
148 fcif = new RandomAccessFile(cifFile,"r");
149 parseCifFile();
150 fcif.close();
151
152 this.fullLength = sequence.length();
153 this.obsLength = resser2restype.size();
154 if(!secondaryStructure.isEmpty()) {
155 secondaryStructure.setComment("CIFfile");
156 }
157
158 // we initialise resser2pdbresser from the pdbresser2resser HashMap
159 this.resser2pdbresser = new HashMap<Integer, String>();
160 for (String pdbresser:pdbresser2resser.keySet()){
161 resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser);
162 }
163
164 // initialising atomser2atom from resser_atom2atomserial
165 atomser2atom = new HashMap<Integer, String>();
166 for (String resser_atom:resser_atom2atomserial.keySet()){
167 int atomserial = resser_atom2atomserial.get(resser_atom);
168 String atom = resser_atom.split("_")[1];
169 atomser2atom.put(atomserial,atom);
170 }
171 }
172
/**
 * Loads the given chain from a local (uncompressed) cif file,
 * using the DEFAULT_MODEL.
 * @param ciffile the cif file to parse
 * @param pdbChainCode the pdb chain code (it is converted to upper case)
 * @throws PdbChainCodeNotFoundError if no _atom_site data is found for the given chain code and model
 * @throws IOException if reading the file fails
 * @throws CiffileFormatError if the file does not look like a cif file
 */
public CiffilePdb (File ciffile, String pdbChainCode) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
    this(ciffile, pdbChainCode, DEFAULT_MODEL);
}
185
186 /**
187 * Constructs Pdb object given cif file, pdb chain code and model serial
188 * @param ciffile
189 * @param pdbChainCode
190 * @param model_serial
191 * @throws PdbChainCodeNotFoundError
192 * @throws IOException
193 * @throws CiffileFormatError
194 */
195 public CiffilePdb (File ciffile, String pdbChainCode, int model_serial) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
196 this.cifFile = ciffile;
197 this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
198 this.model=model_serial;
199
200 fcif = new RandomAccessFile(cifFile,"r");
201 parseCifFile();
202 fcif.close();
203
204 this.fullLength = sequence.length();
205 this.obsLength = resser2restype.size();
206 if(!secondaryStructure.isEmpty()) {
207 secondaryStructure.setComment("CIFfile");
208 }
209
210 // we initialise resser2pdbresser from the pdbresser2resser HashMap
211 this.resser2pdbresser = new HashMap<Integer, String>();
212 for (String pdbresser:pdbresser2resser.keySet()){
213 resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser);
214 }
215
216 // initialising atomser2atom from resser_atom2atomserial
217 atomser2atom = new HashMap<Integer, String>();
218 for (String resser_atom:resser_atom2atomserial.keySet()){
219 int atomserial = resser_atom2atomserial.get(resser_atom);
220 String atom = resser_atom.split("_")[1];
221 atomser2atom.put(atomserial,atom);
222 }
223 }
224
225 /*---------------------------- private methods --------------------------*/
226
227 private void parseCifFile() throws IOException, CiffileFormatError, PdbChainCodeNotFoundError{
228 // data structures to store the parsed fields
229 ids2elements = new TreeMap<String, Integer>();
230 fields2indices = new TreeMap<String,Integer>();
231 fields2values = new TreeMap<String, String>();
232 loopElements = new TreeSet<Integer>(); // contains list of elements that are of loop type
233 loopelements2contentOffset = new TreeMap<Integer,Long[]>();
234 ids2fieldsIdx = new TreeMap<String,Integer>(); // this map holds the field index counters for each element id
235
236 int element = 0;
237 String line;
238 line = fcif.readLine(); // read first line
239 Pattern p = Pattern.compile("^data_\\d\\w\\w\\w");
240 if (!p.matcher(line).find()){
241 throw new CiffileFormatError("The file doesn't seem to be a cif file");
242 }
243 int linecount = 1; // we have read one line already, we initialise count to 1
244 // we need to store the last line's byte offset (which indicates the beginning of this line)
245 long lastLineOffset=fcif.getFilePointer();
246 while((line = fcif.readLine()) != null ) {
247 long currentOffset = fcif.getFilePointer(); //this gets byte offset at end of line
248 linecount++;
249 if (line.startsWith("#")) {
250 element++;
251 continue;
252 }
253 if (line.startsWith("loop_")) {
254 loopElements.add(element);
255 continue;
256 }
257
258 for (String id:ids){
259 if (!ids2fieldsIdx.containsKey(id)) ids2fieldsIdx.put(id,0);
260 p = Pattern.compile("^"+id+"\\.(\\w+)(?:\\s+(.*))?$");
261 Matcher m = p.matcher(line);
262 if (m.find()){
263 ids2elements.put(id,element);
264 String field = id + "." + m.group(1);
265 if (!loopElements.contains(element)) { // if not a loop element
266 fields2values.put(field, m.group(2)); // 2nd capture group only matches for non-loops where the value of the field is in same line as field name
267 } else { // for loop elements we fill the fields2indices TreeMap
268 fields2indices.put(field,ids2fieldsIdx.get(id));
269 }
270 ids2fieldsIdx.put(id,ids2fieldsIdx.get(id)+1);
271 continue;
272 }
273 }
274 if (!line.startsWith("_") && !line.startsWith("#")){ // not in field definition, we are in values of a loop element
275 if (ids2elements.containsValue(element)) { // if this is one of the fields we want to parse (members of String[] ids)
276 if (!loopelements2contentOffset.containsKey(element)) {
277 //loopelements2content.put(element,line+"\n");
278 Long[] interval = {lastLineOffset, currentOffset};
279 loopelements2contentOffset.put(element,interval);
280 } else {
281 //loopelements2content.put(element,loopelements2content.get(element)+line+"\n");
282 loopelements2contentOffset.get(element)[1]=currentOffset;
283 }
284 }
285 }
286 lastLineOffset = currentOffset; //we store this line's offset to have it for next iteration
287 } // end scanning lines
288
289 // now reading separate elements separately using private methods
290 // the order in the elements in the file is not guaranteed, that's why (among other reasons) we have to use RandomAccessFile
291 this.pdbCode = readPdbCode();
292 readAtomAltLocs(); // sets altLoc String (needed in readAtomSite to get the right alt atom locations)
293 readPdbxPolySeq(); // sets chainCode, sequence, pdbresser2resser
294 readAtomSite(); // populates resser_atom2atomserial, resser2restype, atomser2coord, atomser2resser
295 secondaryStructure = new SecondaryStructure(); // create empty secondary structure first to make sure object is not null
296 readSecStructure(); // populates secondaryStructure
297
298 }
299
300 private String readPdbCode(){
301 return fields2values.get(entryId+".id").trim();
302 }
303
304 private void readAtomAltLocs() throws IOException, CiffileFormatError {
305 // The read of the atom_sites_alt element must be done previously to scanning the atom_site element
306 // This is because the order of the different elements in the cif files is not guaranteed, so atom_sites_alt can come before or after atom_site
307 // (and altLoc needs to be set before starting reading the atom_site element)
308
309 ArrayList<String> altLocs = new ArrayList<String>();
310 // we initialise to ".", this is the default value in the cif files for the alt loc field. If no atom_sites_alt is present it's ok to stay with this value
311 altLoc = ".";
312
313 // atom_sites_alt element is optional
314 Long[] intAtomSitesAlt = null;
315 if (ids2elements.containsKey(atomSitesAltId)){
316 intAtomSitesAlt = loopelements2contentOffset.get(ids2elements.get(atomSitesAltId));
317 }
318
319 int recordCount = 0;
320 // atom_sites_alt (optional element)
321 if (intAtomSitesAlt!=null) {
322
323 fcif.seek(intAtomSitesAlt[0]);
324 while(fcif.getFilePointer()<intAtomSitesAlt[1]) {
325 recordCount++;
326
327 int idIdx = fields2indices.get(atomSitesAltId+".id");
328 // id=0
329 // A ?
330 int numberFields = ids2fieldsIdx.get(atomSitesAltId);
331 String[] tokens = tokeniseFields(numberFields);
332 if (tokens.length!=numberFields) {
333 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSitesAltId);
334 }
335 if (!tokens[idIdx].equals(".")) {
336 altLocs.add(tokens[idIdx]);
337 }
338 }
339 }
340 if (!altLocs.isEmpty()){
341 altLoc = Collections.min(altLocs);
342 }
343 }
344
345 private void readAtomSite() throws IOException, PdbChainCodeNotFoundError, CiffileFormatError {
346 resser_atom2atomserial = new HashMap<String,Integer>();
347 resser2restype = new HashMap<Integer,String>();
348 atomser2coord = new HashMap<Integer,Point3d>();
349 atomser2resser = new HashMap<Integer,Integer>();
350
351 Long[] intAtomSite = loopelements2contentOffset.get(ids2elements.get(atomSiteId));
352
353 boolean empty = true;
354 int recordCount = 0;
355
356 fcif.seek(intAtomSite[0]);
357 while(fcif.getFilePointer()<intAtomSite[1]) {
358 recordCount++;
359
360 int groupPdbIdx = fields2indices.get(atomSiteId+".group_PDB");
361 int idIdx = fields2indices.get(atomSiteId+".id");
362 int labelAtomIdIdx = fields2indices.get(atomSiteId+".label_atom_id");
363 int labelAltIdIdx = fields2indices.get(atomSiteId+".label_alt_id");
364 int labelCompIdIdx = fields2indices.get(atomSiteId+".label_comp_id");
365 int labelAsymIdIdx = fields2indices.get(atomSiteId+".label_asym_id");
366 int labelSeqIdIdx = fields2indices.get(atomSiteId+".label_seq_id");
367 int cartnXIdx = fields2indices.get(atomSiteId+".Cartn_x");
368 int cartnYIdx = fields2indices.get(atomSiteId+".Cartn_y");
369 int cartnZIdx = fields2indices.get(atomSiteId+".Cartn_z");
370 int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num");
371 // group_PDB=0, auth_asym_id=22, pdbx_PDB_model_num=24, label_alt_id=4, id=1, label_atom_id=3, label_comp_id=5, label_asym_id=6, label_seq_id=8, Cartn_x=10, Cartn_y=11, Cartn_z=12
372 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 151617181920 2122 23 24
373 //ATOM 2 C CA . MET A 1 1 ? 38.591 8.543 15.660 1.00 77.79 ? ? ? ? ? 1 MET A CA 1
374 int numberFields = ids2fieldsIdx.get(atomSiteId);
375 String[] tokens = tokeniseFields(numberFields);
376 if (tokens.length!=numberFields) {
377 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSiteId);
378 }
379 if (tokens[groupPdbIdx].equals("ATOM") && tokens[labelAsymIdIdx].equals(chainCode) && Integer.parseInt(tokens[pdbxPDBModelNumIdx])==model) { // match our given chain and model
380 empty = false;
381 if (tokens[labelAltIdIdx].equals(".") || tokens[labelAltIdIdx].equals(altLoc)) { // don't read lines with something else as "." or altLoc
382 int atomserial=Integer.parseInt(tokens[idIdx]); // id
383 String atom = tokens[labelAtomIdIdx]; // label_atom_id
384 String res_type = tokens[labelCompIdIdx]; // label_comp_id
385 int res_serial = Integer.parseInt(tokens[labelSeqIdIdx]); // label_seq_id
386 double x = Double.parseDouble(tokens[cartnXIdx]); // Cartn_x
387 double y = Double.parseDouble(tokens[cartnYIdx]); // Cartn_y
388 double z = Double.parseDouble(tokens[cartnZIdx]); // Cartn_z
389 Point3d coords = new Point3d(x,y,z);
390 if (AAinfo.isValidAA(res_type)) {
391 atomser2coord.put(atomserial, coords);
392 atomser2resser.put(atomserial, res_serial);
393 resser2restype.put(res_serial, res_type);
394 if (AAinfo.isValidAtomWithOXT(res_type,atom)){
395 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
396 }
397 }
398 }
399 }
400 }
401 if (empty) { // no atom data was found for given pdb chain code and model
402 throw new PdbChainCodeNotFoundError("Couldn't find _atom_site data for given pdbChainCode: "+pdbChainCode+", model: "+model);
403 }
404 }
405
406 private void readPdbxPolySeq() throws IOException, CiffileFormatError {
407 pdbresser2resser = new HashMap<String, Integer>();
408 sequence = "";
409
410 String chainCodeStr=pdbChainCode;
411 if (pdbChainCode.equals(Pdb.NULL_CHAIN_CODE)) chainCodeStr="A";
412
413 Long[] intPdbxPoly = loopelements2contentOffset.get(ids2elements.get(pdbxPolySeqId));
414
415 int recordCount=0;
416
417 fcif.seek(intPdbxPoly[0]);
418 while(fcif.getFilePointer()<intPdbxPoly[1]) {
419 recordCount++;
420
421 int asymIdIdx = fields2indices.get(pdbxPolySeqId+".asym_id");
422 int seqIdIdx = fields2indices.get(pdbxPolySeqId+".seq_id");
423 int authSeqNumIdx = fields2indices.get(pdbxPolySeqId+".auth_seq_num");
424 int pdbInsCodeIdx = fields2indices.get(pdbxPolySeqId+".pdb_ins_code");
425 int monIdIdx = fields2indices.get(pdbxPolySeqId+".mon_id");
426 int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id");
427 // asym_id=0, seq_id=2, auth_seq_num=6, pdb_ins_code=10, mon_id=3
428 // 0 1 2 3 4 5 6 7 8 910
429 // A 1 1 ASP 1 1 1 ASP ASP A .
430 int numberFields = ids2fieldsIdx.get(pdbxPolySeqId);
431 String[] tokens = tokeniseFields(numberFields);
432 if (tokens.length!=numberFields) {
433 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+pdbxPolySeqId);
434 }
435 if (tokens[pdbStrandIdIdx].equals(chainCodeStr)) { // we can't rely on using chainCode, because the order of elements is not guranteed (pdbx_poly_seq_scheme doesn't always come after atom_site)
436 int res_serial = Integer.parseInt(tokens[seqIdIdx]); // seq_id
437 chainCode = tokens[asymIdIdx];
438 //TODO revise: do we want auth_seq_num or pdb_seq_num here??
439 String pdb_res_serial = tokens[authSeqNumIdx]; // auth_seq_num
440 String pdb_ins_code = tokens[pdbInsCodeIdx]; // pdb_ins_code
441 String pdb_res_serial_with_icode = pdb_res_serial;
442 if (!pdb_ins_code.equals(".")) {
443 pdb_res_serial_with_icode=pdb_res_serial+pdb_ins_code;
444 }
445 String res_type = tokens[monIdIdx]; // mon_id
446 // sequence
447 if (AAinfo.isValidAA(res_type)){
448 sequence+=AAinfo.threeletter2oneletter(res_type);
449 } else {
450 sequence+=NONSTANDARD_AA_LETTER;
451 }
452 // pdbresser2resser
453 if (!pdb_res_serial_with_icode.startsWith("?")) { // question marks are author missing serials, we don't want them in the map
454 pdbresser2resser.put(pdb_res_serial_with_icode,res_serial);
455 }
456 }
457 }
458 }
459
460 private void readSecStructure() throws IOException, CiffileFormatError {
461 secondaryStructure = new SecondaryStructure();
462
463 // struct_conf element is optional
464 Long[] intStructConf = null;
465 if (ids2elements.containsKey(structConfId)) {
466 // if not a loop element then intStructConf stays null (because loopelements2contentIndex will return null)
467 intStructConf = loopelements2contentOffset.get(ids2elements.get(structConfId));
468 }
469 // taking care of cases where struct_conf is not a loop element but a one value field
470 if (ids2elements.containsKey(structConfId) && !loopElements.contains(ids2elements.get(structConfId))){
471 String begChainCode = fields2values.get(structConfId+".beg_label_asym_id").trim();
472 if (begChainCode.equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme
473 String id = fields2values.get(structConfId+".id").trim();
474 int beg = Integer.parseInt(fields2values.get(structConfId+".beg_label_seq_id").trim());
475 int end = Integer.parseInt(fields2values.get(structConfId+".end_label_seq_id").trim());
476 Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
477 Matcher m = p.matcher(id);
478 String ssId="Unknown";
479 if (m.find()){
480 ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
481 }
482 char ssType = SecStrucElement.OTHER;
483 if(id.startsWith("H")) {
484 ssType = SecStrucElement.HELIX;
485 } else if(id.startsWith("T")) {
486 ssType = SecStrucElement.TURN;
487 } else {
488 System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
489 }
490 if(ssType != SecStrucElement.OTHER) {
491 SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
492 secondaryStructure.add(ssElem);
493 }
494 }
495 }
496 // struct_sheet_range element is optional
497 Long[] intStructSheet = null;
498 if (ids2elements.containsKey(structSheetId)) {
499 // if not a loop element intStructSheet stays null (because loopelements2contentIndex will return null)
500 intStructSheet = loopelements2contentOffset.get(ids2elements.get(structSheetId));
501 }
502 // taking care of cases where struct_sheet_range is not a loop element but a one value field
503 if (ids2elements.containsKey(structSheetId) && !loopElements.contains(ids2elements.get(structSheetId))){
504 String begChainCode = fields2values.get(structSheetId+".beg_label_asym_id").trim();
505 if (begChainCode.equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme
506 String sheetid = fields2values.get(structSheetId+".sheet_id").trim(); //tokens[sheetIdIdx];
507 int id = Integer.parseInt(fields2values.get(structSheetId+".id").trim()); //Integer.parseInt(tokens[idIdx]);
508 int beg = Integer.parseInt(fields2values.get(structSheetId+".beg_label_seq_id").trim()); //tokens[begLabelSeqIdIdx]);
509 int end = Integer.parseInt(fields2values.get(structSheetId+".end_label_seq_id").trim()); //tokens[endLabelSeqIdIdx]);
510 String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
511 SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
512 secondaryStructure.add(ssElem);
513 }
514
515 }
516
517 if (intStructConf!=null) {
518 int recordCount=0;
519
520 fcif.seek(intStructConf[0]);
521 while(fcif.getFilePointer()<intStructConf[1]) {
522 recordCount++;
523 // struct_conf (optional element), HELIX and TURN secondary structure
524
525 int idIdx = fields2indices.get(structConfId+".id");
526 int begLabelAsymIdIdx = fields2indices.get(structConfId+".beg_label_asym_id");
527 int begLabelSeqIdIdx = fields2indices.get(structConfId+".beg_label_seq_id");
528 int endLabelSeqIdIdx = fields2indices.get(structConfId+".end_label_seq_id");
529 //id=1, beg_label_seq_id=5, end_label_seq_id=9, beg_label_asym_id=4
530 // 0 1 2 3 4 5 6 7 8 9 10 111213 1415 16 1718 19
531 //HELX_P HELX_P1 1 ASN A 2 ? GLY A 12 ? ASN A 2 GLY A 12 1 ? 11
532 int numFields = ids2fieldsIdx.get(structConfId);
533 String[] tokens = tokeniseFields(numFields);
534 if (tokens.length!=numFields) {
535 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+structConfId);
536 }
537 if (tokens[begLabelAsymIdIdx].equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme
538 String id = tokens[idIdx];
539 Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
540 Matcher m = p.matcher(id);
541 String ssId="Unknown";
542 if (m.find()){
543 ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
544 }
545 int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
546 int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
547 char ssType = SecStrucElement.OTHER;
548 if(id.startsWith("H")) {
549 ssType = SecStrucElement.HELIX;
550 } else if(id.startsWith("T")) {
551 ssType = SecStrucElement.TURN;
552 } else {
553 System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
554 }
555 if(ssType != SecStrucElement.OTHER) {
556 SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
557 secondaryStructure.add(ssElem);
558 }
559 }
560 }
561 }
562 if (intStructSheet!=null) {
563 int recordCount=0;
564
565 fcif.seek(intStructSheet[0]);
566 while(fcif.getFilePointer()<intStructSheet[1]) {
567 recordCount++;
568 // struct_sheet_range (optional element), SHEETs
569 int sheetIdIdx = fields2indices.get(structSheetId+".sheet_id");
570 int idIdx = fields2indices.get(structSheetId+".id");
571 int begLabelAsymIdIdx = fields2indices.get(structSheetId+".beg_label_asym_id");
572 int begLabelSeqIdIdx = fields2indices.get(structSheetId+".beg_label_seq_id");
573 int endLabelSeqIdIdx = fields2indices.get(structSheetId+".end_label_seq_id");
574 //sheet_id=0, id=1, beg_label_seq_id=4, end_label_seq_id=8, beg_label_asym_id=3
575 //0 1 2 3 4 5 6 7 8 910 1112 13 1415 16
576 //A 1 ARG A 14 ? LYS A 19 ? ? ARG A 14 LYS A 19
577 int numFields = ids2fieldsIdx.get(structSheetId);
578 String[] tokens = tokeniseFields(numFields);
579 if (tokens.length!=numFields) {
580 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+structSheetId);
581 }
582 if (tokens[begLabelAsymIdIdx].equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme
583 String sheetid = tokens[sheetIdIdx];
584 int id = Integer.parseInt(tokens[idIdx]);
585 int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
586 int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
587 String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
588 SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
589 secondaryStructure.add(ssElem);
590 }
591 }
592 }
593 }
594
595 /**
596 * Splits a space separated line into its individual tokens returning an array with all tokens
597 * Takes care of all particularities of the format of a record in the ciffiles:
598 * - fields within records are separated by spaces
599 * - spaces can be used within quoted strings (at the moment this only supports single quotes, not double)
600 * - free style with all characters allowed if something is quoted with \n; ;\n
601 * The java class StreamTokenizer could have done all this, but it was limited to do all that we needed to do
602 *
603 *
604 * This method is black magic. I don't even try to understand as I write it.
605 * If you need to come back to this and read it, good luck!!
606 *
607 * @param numberTokens
608 * @return
609 */
610 private String[] tokeniseFields(int numberTokens) throws IOException {
611 String[] tokens = new String[numberTokens];
612 // initialise tokens to empty strings
613 for (int i=0; i<numberTokens;i++){
614 tokens[i]="";
615 }
616
617 int i = 0;
618 char lastChar=' ';
619 boolean withinQuotes=false;
620 while (true) {
621 char currentChar = (char)fcif.readByte();
622 if ((currentChar=='\'' && lastChar==' ') || (!withinQuotes && currentChar==';' && lastChar=='\n')){
623 withinQuotes = true;
624 }
625 else if ((currentChar==' ' && lastChar=='\'') || (currentChar=='\n' && lastChar==';')){
626 withinQuotes = false;
627 }
628
629 if (!withinQuotes) {
630 if (currentChar==' ' || currentChar=='\n') {
631 if (currentChar!=lastChar && !(currentChar=='\n' && lastChar==' ')) i++; // we only increment when we move from a non-space to a space or from non-space to \n
632 } else {
633 tokens[i]+=currentChar;
634 }
635 } else {
636 tokens[i]+=currentChar;
637 // if string is surrounded by '' then strip them out (except when string is length 1 and thus beginning and end are \')
638 if (tokens[i].length()!=1 && tokens[i].startsWith("'") && tokens[i].endsWith("'")) tokens[i]=tokens[i].replaceAll("'", "");
639 // if string is surrounded by ;; then strip them out including a \n before the last ;
640 if (tokens[i].startsWith(";") && tokens[i].endsWith("\n;")) {
641 tokens[i]=tokens[i].replaceFirst("^;", "");
642 tokens[i]=tokens[i].replaceFirst("\n;","");
643 }
644 }
645
646 lastChar = currentChar;
647
648 if (i==numberTokens) {
649 fcif.readByte();// read one more byte: the space at the end of the line that otherwise we miss
650 return tokens;
651 }
652 }
653 }
654
655 }