ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/CiffilePdb.java
Revision: 639
Committed: Thu May 15 16:19:58 2008 UTC (16 years, 5 months ago) by duarte
File size: 29594 byte(s)
Log Message:
Implemented parsing of PsiPred secondary structure prediction files in SecondaryStructure class. 
New member in SecondaryStructure: sequence. Constructor must pass the sequence now, changed all references.
New method in Alignment to print a "graphical" matching of secondary structures.
Line File contents
1 package proteinstructure;
2
3 import java.io.File;
4 import java.io.FileInputStream;
5 import java.io.FileOutputStream;
6 import java.io.InputStream;
7 import java.io.RandomAccessFile;
8
9 import java.io.IOException;
10 import java.net.URL;
11 import java.net.URLConnection;
12 import java.util.ArrayList;
13 import java.util.Collections;
14 import java.util.HashMap;
15 import java.util.TreeMap;
16 import java.util.TreeSet;
17 import java.util.regex.Matcher;
18 import java.util.regex.Pattern;
19 import java.util.zip.GZIPInputStream;
20
21 import javax.vecmath.Point3d;
22
23
24 /**
25 * A single chain pdb protein structure loaded from an mmCIF file or downloaded from the PDB FTP site
26 *
27 * @author Jose Duarte
28 */
29 public class CiffilePdb extends Pdb {
30
31 /*------------------------------ constants ------------------------------*/
32 public static final String PDB_FTP_URL = "ftp://ftp.wwpdb.org/pub/pdb/data/structures/all/mmCIF/";
33 public static final String CIF_FILE_EXTENSION = ".cif.gz";
34
35 /*--------------------------- member variables --------------------------*/
36
37 // input file
38 private File cifFile;
39
40 // fields we will read
41 private static final String entryId = "_entry";
42 private static final String atomSiteId = "_atom_site";
43 private static final String atomSitesAltId = "_atom_sites_alt";
44 private static final String pdbxPolySeqId = "_pdbx_poly_seq_scheme";
45 private static final String structConfId = "_struct_conf";
46 private static final String structSheetId = "_struct_sheet_range";
47 private static final String[] ids = {entryId,atomSitesAltId,atomSiteId,pdbxPolySeqId,structConfId,structSheetId};
48
49 private TreeMap<String,Integer> ids2elements; // map of ids to element serials
50 private TreeMap<String,String> fields2values; // map of field names (id.field) to values (for non-loop elements)
51 private TreeMap<String,Integer> fields2indices; // map of field names (id.field) to index (for loop elements)
52 private TreeMap<String,Integer> ids2fieldsIdx; // map of element ids to field index counter (after parseCifFile method done it contains the total number of fields per element id)
53 private TreeSet<Integer> loopElements; // contains list of elements that are of loop type
54 private TreeMap<Integer,Long[]> loopelements2contentOffset; // begin and end line index of each loop element
55
56 private String altLoc;
57
58 private RandomAccessFile fcif;
59
60 private boolean fieldsTitlesRead;
61
62 /*----------------------------- constructors ----------------------------*/
63
/**
 * Creates an empty Pdb object for the given pdb code, taking the data from the
 * default PDB FTP site (PDB_FTP_URL).
 * The data is downloaded and stored in a local file, but it is only loaded from
 * that file upon call of load(pdbChainCode, modelSerial).
 * @param pdbCode the pdb code of the entry to download
 * @throws IOException if downloading or storing the file fails
 */
public CiffilePdb(String pdbCode) throws IOException {
	this(pdbCode, CiffilePdb.PDB_FTP_URL);
}
75
76 /**
77 * Constructs an empty Pdb object from online PDB given pdb code and pdbFtpUrl
78 * Data will be downloaded an stored in local file
79 * but will only be loaded from local file upon call of load(pdbChainCode, modelSerial)
80 * @param pdbCode
81 * @param pdbFtpUrl
82 * @throws IOException
83 */
84 public CiffilePdb (String pdbCode, String pdbFtpUrl) throws IOException {
85 this.dataLoaded = false;
86 this.fieldsTitlesRead = false;
87
88 // we store the file locally instead of reading directly from the ftp stream, so that the file can be cached locally in applications like CMView
89 String tempDir = System.getProperty("java.io.tmpdir");
90 String gzCifFileName = pdbCode+CIF_FILE_EXTENSION;
91 File gzCifFile = new File(tempDir,gzCifFileName);
92 gzCifFile.deleteOnExit();
93 this.cifFile = new File(tempDir,pdbCode + ".cif");
94 this.cifFile.deleteOnExit();
95
96 // getting gzipped cif file from ftp
97 URL url = new URL(pdbFtpUrl+gzCifFileName);
98 URLConnection urlc = url.openConnection();
99 InputStream is = urlc.getInputStream();
100 FileOutputStream os = new FileOutputStream(gzCifFile);
101 int b;
102 while ( (b=is.read())!=-1) {
103 os.write(b);
104 }
105 is.close();
106 os.close();
107
108 // unzipping downloaded file
109 GZIPInputStream zis = new GZIPInputStream(new FileInputStream(gzCifFile));
110 os = new FileOutputStream(cifFile);
111 while ( (b=zis.read())!=-1) {
112 os.write(b);
113 }
114 zis.close();
115 os.close();
116
117 }
118
/**
 * Creates an empty Pdb object that will read its data from the given cif file.
 * Nothing is read here: the data is loaded upon call of load(pdbChainCode, modelSerial).
 * @param ciffile the cif file to read from
 */
public CiffilePdb (File ciffile) {
	this.fieldsTitlesRead = false;
	this.dataLoaded = false;
	this.cifFile = ciffile;
}
130
131 public File getCifFile() {
132 return cifFile;
133 }
134
135 public void load(String pdbChainCode, int modelSerial) throws PdbLoadError{
136 try {
137 this.model = modelSerial;
138 this.pdbChainCode=pdbChainCode; // NOTE! pdb chain codes are case sensitive
139 fcif = new RandomAccessFile(cifFile,"r");
140 parseCifFile();
141 fcif.close();
142
143 this.fullLength = sequence.length();
144 this.obsLength = resser2restype.size();
145 if(!secondaryStructure.isEmpty()) {
146 secondaryStructure.setComment("CIFfile");
147 }
148
149 // we initialise resser2pdbresser from the pdbresser2resser HashMap
150 this.resser2pdbresser = new HashMap<Integer, String>();
151 for (String pdbresser:pdbresser2resser.keySet()){
152 resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser);
153 }
154
155 // initialising atomser2atom from resser_atom2atomserial
156 atomser2atom = new HashMap<Integer, String>();
157 for (String resser_atom:resser_atom2atomserial.keySet()){
158 int atomserial = resser_atom2atomserial.get(resser_atom);
159 String atom = resser_atom.split("_")[1];
160 atomser2atom.put(atomserial,atom);
161 }
162
163 dataLoaded = true;
164
165 } catch (CiffileFormatError e) {
166 throw new PdbLoadError(e);
167 } catch (IOException e) {
168 throw new PdbLoadError(e);
169 } catch (PdbChainCodeNotFoundError e) {
170 throw new PdbLoadError(e);
171 }
172
173 }
174
175 public String[] getChains() throws PdbLoadError {
176 TreeSet<String> chains = new TreeSet<String>();
177
178 try {
179 fcif = new RandomAccessFile(cifFile,"r");
180 if (!fieldsTitlesRead) {
181 readFieldsTitles();
182 }
183 Long[] intPdbxPoly = loopelements2contentOffset.get(ids2elements.get(pdbxPolySeqId));
184
185 int recordCount=0;
186
187 fcif.seek(intPdbxPoly[0]);
188 while(fcif.getFilePointer()<intPdbxPoly[1]) {
189 recordCount++;
190
191 int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id");
192 int numberFields = ids2fieldsIdx.get(pdbxPolySeqId);
193 String[] tokens = tokeniseFields(numberFields);
194 if (tokens.length!=numberFields) {
195 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+pdbxPolySeqId);
196 }
197 chains.add(tokens[pdbStrandIdIdx]);
198 }
199 fcif.close();
200
201 } catch (IOException e) {
202 throw new PdbLoadError(e);
203 } catch (CiffileFormatError e) {
204 throw new PdbLoadError(e);
205 }
206
207 if (chains.isEmpty()) return null;
208
209 String[] chainsArray = new String[chains.size()];
210 chains.toArray(chainsArray);
211 return chainsArray;
212 }
213
214 public Integer[] getModels() throws PdbLoadError {
215 TreeSet<Integer> models = new TreeSet<Integer>();
216 try {
217 fcif = new RandomAccessFile(cifFile,"r");
218 if (!fieldsTitlesRead) {
219 readFieldsTitles();
220 }
221 Long[] intAtomSite = loopelements2contentOffset.get(ids2elements.get(atomSiteId));
222
223 int recordCount=0;
224
225 fcif.seek(intAtomSite[0]);
226 while(fcif.getFilePointer()<intAtomSite[1]) {
227 recordCount++;
228
229 int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num");
230 int numberFields = ids2fieldsIdx.get(atomSiteId);
231 String[] tokens = tokeniseFields(numberFields);
232 if (tokens.length!=numberFields) {
233 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSiteId);
234 }
235 models.add(Integer.parseInt(tokens[pdbxPDBModelNumIdx]));
236 }
237 fcif.close();
238
239 } catch (IOException e) {
240 throw new PdbLoadError(e);
241 } catch (CiffileFormatError e) {
242 throw new PdbLoadError(e);
243 }
244
245 if (models.isEmpty()) return null;
246
247 Integer[] modelsArray = new Integer[models.size()];
248 models.toArray(modelsArray);
249 return modelsArray;
250 }
251
252 /*---------------------------- private methods --------------------------*/
253
254 private void parseCifFile() throws IOException, CiffileFormatError, PdbChainCodeNotFoundError{
255
256 if (!fieldsTitlesRead) {
257 readFieldsTitles();
258 }
259 // now reading separate elements separately using private methods
260 // the order in the elements in the file is not guaranteed, that's why (among other reasons) we have to use RandomAccessFile
261 this.pdbCode = readPdbCode();
262 readAtomAltLocs(); // sets altLoc String (needed in readAtomSite to get the right alt atom locations)
263 readPdbxPolySeq(); // sets chainCode, sequence, pdbresser2resser
264 readAtomSite(); // populates resser_atom2atomserial, resser2restype, atomser2coord, atomser2resser
265 secondaryStructure = new SecondaryStructure(this.sequence); // create empty secondary structure first to make sure object is not null
266 readSecStructure(); // populates secondaryStructure
267
268 }
269
270 private void readFieldsTitles() throws IOException, CiffileFormatError {
271 // data structures to store the parsed fields
272 ids2elements = new TreeMap<String, Integer>();
273 fields2indices = new TreeMap<String,Integer>();
274 fields2values = new TreeMap<String, String>();
275 loopElements = new TreeSet<Integer>(); // contains list of elements that are of loop type
276 loopelements2contentOffset = new TreeMap<Integer,Long[]>();
277 ids2fieldsIdx = new TreeMap<String,Integer>(); // this map holds the field index counters for each element id
278
279 int element = 0;
280 String line;
281 line = fcif.readLine(); // read first line
282 Pattern p = Pattern.compile("^data_\\d\\w\\w\\w");
283 if (!p.matcher(line).find()){
284 throw new CiffileFormatError("The file doesn't seem to be a cif file");
285 }
286 int linecount = 1; // we have read one line already, we initialise count to 1
287 // we need to store the last line's byte offset (which indicates the beginning of this line)
288 long lastLineOffset=fcif.getFilePointer();
289 while((line = fcif.readLine()) != null ) {
290 long currentOffset = fcif.getFilePointer(); //this gets byte offset at end of line
291 linecount++;
292 if (line.startsWith("#")) {
293 element++;
294 continue;
295 }
296 if (line.startsWith("loop_")) {
297 loopElements.add(element);
298 continue;
299 }
300
301 for (String id:ids){
302 if (!ids2fieldsIdx.containsKey(id)) ids2fieldsIdx.put(id,0);
303 p = Pattern.compile("^"+id+"\\.(\\w+)(?:\\s+(.*))?$");
304 Matcher m = p.matcher(line);
305 if (m.find()){
306 ids2elements.put(id,element);
307 String field = id + "." + m.group(1);
308 if (!loopElements.contains(element)) { // if not a loop element
309 fields2values.put(field, m.group(2)); // 2nd capture group only matches for non-loops where the value of the field is in same line as field name
310 } else { // for loop elements we fill the fields2indices TreeMap
311 fields2indices.put(field,ids2fieldsIdx.get(id));
312 }
313 ids2fieldsIdx.put(id,ids2fieldsIdx.get(id)+1);
314 continue;
315 }
316 }
317 if (!line.startsWith("_") && !line.startsWith("#")){ // not in field definition, we are in values of a loop element
318 if (ids2elements.containsValue(element)) { // if this is one of the fields we want to parse (members of String[] ids)
319 if (!loopelements2contentOffset.containsKey(element)) {
320 //loopelements2content.put(element,line+"\n");
321 Long[] interval = {lastLineOffset, currentOffset};
322 loopelements2contentOffset.put(element,interval);
323 } else {
324 //loopelements2content.put(element,loopelements2content.get(element)+line+"\n");
325 loopelements2contentOffset.get(element)[1]=currentOffset;
326 }
327 }
328 }
329 lastLineOffset = currentOffset; //we store this line's offset to have it for next iteration
330 } // end scanning lines
331
332 fieldsTitlesRead = true;
333 }
334
335 private String readPdbCode(){
336 return fields2values.get(entryId+".id").trim().toLowerCase();
337 }
338
339 private void readAtomAltLocs() throws IOException, CiffileFormatError {
340 // The read of the atom_sites_alt element must be done previously to scanning the atom_site element
341 // This is because the order of the different elements in the cif files is not guaranteed, so atom_sites_alt can come before or after atom_site
342 // (and altLoc needs to be set before starting reading the atom_site element)
343
344 ArrayList<String> altLocs = new ArrayList<String>();
345 // we initialise to ".", this is the default value in the cif files for the alt loc field. If no atom_sites_alt is present it's ok to stay with this value
346 altLoc = ".";
347
348 // atom_sites_alt element is optional
349 Long[] intAtomSitesAlt = null;
350 if (ids2elements.containsKey(atomSitesAltId)){
351 intAtomSitesAlt = loopelements2contentOffset.get(ids2elements.get(atomSitesAltId));
352 }
353
354 int recordCount = 0;
355 // atom_sites_alt (optional element)
356 if (intAtomSitesAlt!=null) {
357
358 fcif.seek(intAtomSitesAlt[0]);
359 while(fcif.getFilePointer()<intAtomSitesAlt[1]) {
360 recordCount++;
361
362 int idIdx = fields2indices.get(atomSitesAltId+".id");
363 // id=0
364 // A ?
365 int numberFields = ids2fieldsIdx.get(atomSitesAltId);
366 String[] tokens = tokeniseFields(numberFields);
367 if (tokens.length!=numberFields) {
368 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSitesAltId);
369 }
370 if (!tokens[idIdx].equals(".")) {
371 altLocs.add(tokens[idIdx]);
372 }
373 }
374 }
375 if (!altLocs.isEmpty()){
376 altLoc = Collections.min(altLocs);
377 }
378 }
379
380 private void readAtomSite() throws IOException, PdbChainCodeNotFoundError, CiffileFormatError {
381 resser_atom2atomserial = new HashMap<String,Integer>();
382 resser2restype = new HashMap<Integer,String>();
383 atomser2coord = new HashMap<Integer,Point3d>();
384 atomser2resser = new HashMap<Integer,Integer>();
385
386 Long[] intAtomSite = loopelements2contentOffset.get(ids2elements.get(atomSiteId));
387
388 boolean empty = true;
389 int recordCount = 0;
390
391 fcif.seek(intAtomSite[0]);
392 while(fcif.getFilePointer()<intAtomSite[1]) {
393 recordCount++;
394
395 int groupPdbIdx = fields2indices.get(atomSiteId+".group_PDB");
396 int idIdx = fields2indices.get(atomSiteId+".id");
397 int labelAtomIdIdx = fields2indices.get(atomSiteId+".label_atom_id");
398 int labelAltIdIdx = fields2indices.get(atomSiteId+".label_alt_id");
399 int labelCompIdIdx = fields2indices.get(atomSiteId+".label_comp_id");
400 int labelAsymIdIdx = fields2indices.get(atomSiteId+".label_asym_id");
401 int labelSeqIdIdx = fields2indices.get(atomSiteId+".label_seq_id");
402 int cartnXIdx = fields2indices.get(atomSiteId+".Cartn_x");
403 int cartnYIdx = fields2indices.get(atomSiteId+".Cartn_y");
404 int cartnZIdx = fields2indices.get(atomSiteId+".Cartn_z");
405 int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num");
406 // group_PDB=0, auth_asym_id=22, pdbx_PDB_model_num=24, label_alt_id=4, id=1, label_atom_id=3, label_comp_id=5, label_asym_id=6, label_seq_id=8, Cartn_x=10, Cartn_y=11, Cartn_z=12
407 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 151617181920 2122 23 24
408 //ATOM 2 C CA . MET A 1 1 ? 38.591 8.543 15.660 1.00 77.79 ? ? ? ? ? 1 MET A CA 1
409 int numberFields = ids2fieldsIdx.get(atomSiteId);
410 String[] tokens = tokeniseFields(numberFields);
411 if (tokens.length!=numberFields) {
412 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+atomSiteId);
413 }
414 if (tokens[groupPdbIdx].equals("ATOM") && tokens[labelAsymIdIdx].equals(chainCode) && Integer.parseInt(tokens[pdbxPDBModelNumIdx])==model) { // match our given chain and model
415 empty = false;
416 if (tokens[labelAltIdIdx].equals(".") || tokens[labelAltIdIdx].equals(altLoc)) { // don't read lines with something else as "." or altLoc
417 int atomserial=Integer.parseInt(tokens[idIdx]); // id
418 String atom = tokens[labelAtomIdIdx]; // label_atom_id
419 String res_type = tokens[labelCompIdIdx]; // label_comp_id
420 int res_serial = Integer.parseInt(tokens[labelSeqIdIdx]); // label_seq_id
421 double x = Double.parseDouble(tokens[cartnXIdx]); // Cartn_x
422 double y = Double.parseDouble(tokens[cartnYIdx]); // Cartn_y
423 double z = Double.parseDouble(tokens[cartnZIdx]); // Cartn_z
424 Point3d coords = new Point3d(x,y,z);
425 if (AAinfo.isValidAA(res_type)) {
426 atomser2coord.put(atomserial, coords);
427 atomser2resser.put(atomserial, res_serial);
428 resser2restype.put(res_serial, res_type);
429 if (AAinfo.isValidAtomWithOXT(res_type,atom)){
430 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
431 }
432 }
433 }
434 }
435 }
436 if (empty) { // no atom data was found for given pdb chain code and model
437 throw new PdbChainCodeNotFoundError("Couldn't find _atom_site data for given pdbChainCode: "+pdbChainCode+", model: "+model);
438 }
439 }
440
441 private void readPdbxPolySeq() throws IOException, CiffileFormatError {
442 pdbresser2resser = new HashMap<String, Integer>();
443 sequence = "";
444
445 String chainCodeStr=pdbChainCode;
446 if (pdbChainCode.equals(Pdb.NULL_CHAIN_CODE)) chainCodeStr="A";
447
448 Long[] intPdbxPoly = loopelements2contentOffset.get(ids2elements.get(pdbxPolySeqId));
449
450 int recordCount=0;
451
452 fcif.seek(intPdbxPoly[0]);
453 while(fcif.getFilePointer()<intPdbxPoly[1]) {
454 recordCount++;
455
456 int asymIdIdx = fields2indices.get(pdbxPolySeqId+".asym_id");
457 int seqIdIdx = fields2indices.get(pdbxPolySeqId+".seq_id");
458 //int authSeqNumIdx = fields2indices.get(pdbxPolySeqId+".auth_seq_num");
459 int pdbSeqNumIdx = fields2indices.get(pdbxPolySeqId+".pdb_seq_num");
460 int pdbInsCodeIdx = fields2indices.get(pdbxPolySeqId+".pdb_ins_code");
461 int monIdIdx = fields2indices.get(pdbxPolySeqId+".mon_id");
462 int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id");
463 // asym_id=0, seq_id=2, auth_seq_num=6, pdb_ins_code=10, mon_id=3
464 // 0 1 2 3 4 5 6 7 8 910
465 // A 1 1 ASP 1 1 1 ASP ASP A .
466 int numberFields = ids2fieldsIdx.get(pdbxPolySeqId);
467 String[] tokens = tokeniseFields(numberFields);
468 if (tokens.length!=numberFields) {
469 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+pdbxPolySeqId);
470 }
471 if (tokens[pdbStrandIdIdx].equals(chainCodeStr)) { // we can't rely on using chainCode, because the order of elements is not guranteed (pdbx_poly_seq_scheme doesn't always come after atom_site)
472 int res_serial = Integer.parseInt(tokens[seqIdIdx]); // seq_id
473 chainCode = tokens[asymIdIdx];
474 String pdb_res_serial = tokens[pdbSeqNumIdx]; // pdb_seq_num
475 String pdb_ins_code = tokens[pdbInsCodeIdx]; // pdb_ins_code
476 String pdb_res_serial_with_icode = pdb_res_serial;
477 if (!pdb_ins_code.equals(".")) {
478 pdb_res_serial_with_icode=pdb_res_serial+pdb_ins_code;
479 }
480 String res_type = tokens[monIdIdx]; // mon_id
481 // sequence
482 if (AAinfo.isValidAA(res_type)){
483 sequence+=AAinfo.threeletter2oneletter(res_type);
484 } else {
485 sequence+=AAinfo.NONSTANDARD_AA_ONE_LETTER;
486 }
487 // pdbresser2resser
488 pdbresser2resser.put(pdb_res_serial_with_icode,res_serial);
489
490 }
491 }
492 }
493
494 private void readSecStructure() throws IOException, CiffileFormatError {
495 secondaryStructure = new SecondaryStructure(this.sequence);
496
497 // struct_conf element is optional
498 Long[] intStructConf = null;
499 if (ids2elements.containsKey(structConfId)) {
500 // if not a loop element then intStructConf stays null (because loopelements2contentIndex will return null)
501 intStructConf = loopelements2contentOffset.get(ids2elements.get(structConfId));
502 }
503 // taking care of cases where struct_conf is not a loop element but a one value field
504 if (ids2elements.containsKey(structConfId) && !loopElements.contains(ids2elements.get(structConfId))){
505 String begChainCode = fields2values.get(structConfId+".beg_label_asym_id").trim();
506 if (begChainCode.equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme
507 String id = fields2values.get(structConfId+".id").trim();
508 int beg = Integer.parseInt(fields2values.get(structConfId+".beg_label_seq_id").trim());
509 int end = Integer.parseInt(fields2values.get(structConfId+".end_label_seq_id").trim());
510 Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
511 Matcher m = p.matcher(id);
512 String ssId="Unknown";
513 if (m.find()){
514 ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
515 }
516 char ssType = SecStrucElement.OTHER;
517 if(id.startsWith("H")) {
518 ssType = SecStrucElement.HELIX;
519 } else if(id.startsWith("T")) {
520 ssType = SecStrucElement.TURN;
521 } else {
522 System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
523 }
524 if(ssType != SecStrucElement.OTHER) {
525 SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
526 secondaryStructure.add(ssElem);
527 }
528 }
529 }
530 // struct_sheet_range element is optional
531 Long[] intStructSheet = null;
532 if (ids2elements.containsKey(structSheetId)) {
533 // if not a loop element intStructSheet stays null (because loopelements2contentIndex will return null)
534 intStructSheet = loopelements2contentOffset.get(ids2elements.get(structSheetId));
535 }
536 // taking care of cases where struct_sheet_range is not a loop element but a one value field
537 if (ids2elements.containsKey(structSheetId) && !loopElements.contains(ids2elements.get(structSheetId))){
538 String begChainCode = fields2values.get(structSheetId+".beg_label_asym_id").trim();
539 if (begChainCode.equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme
540 String sheetid = fields2values.get(structSheetId+".sheet_id").trim(); //tokens[sheetIdIdx];
541 int id = Integer.parseInt(fields2values.get(structSheetId+".id").trim()); //Integer.parseInt(tokens[idIdx]);
542 int beg = Integer.parseInt(fields2values.get(structSheetId+".beg_label_seq_id").trim()); //tokens[begLabelSeqIdIdx]);
543 int end = Integer.parseInt(fields2values.get(structSheetId+".end_label_seq_id").trim()); //tokens[endLabelSeqIdIdx]);
544 String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
545 SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
546 secondaryStructure.add(ssElem);
547 }
548
549 }
550
551 if (intStructConf!=null) {
552 int recordCount=0;
553
554 fcif.seek(intStructConf[0]);
555 while(fcif.getFilePointer()<intStructConf[1]) {
556 recordCount++;
557 // struct_conf (optional element), HELIX and TURN secondary structure
558
559 int idIdx = fields2indices.get(structConfId+".id");
560 int begLabelAsymIdIdx = fields2indices.get(structConfId+".beg_label_asym_id");
561 int begLabelSeqIdIdx = fields2indices.get(structConfId+".beg_label_seq_id");
562 int endLabelSeqIdIdx = fields2indices.get(structConfId+".end_label_seq_id");
563 //id=1, beg_label_seq_id=5, end_label_seq_id=9, beg_label_asym_id=4
564 // 0 1 2 3 4 5 6 7 8 9 10 111213 1415 16 1718 19
565 //HELX_P HELX_P1 1 ASN A 2 ? GLY A 12 ? ASN A 2 GLY A 12 1 ? 11
566 int numFields = ids2fieldsIdx.get(structConfId);
567 String[] tokens = tokeniseFields(numFields);
568 if (tokens.length!=numFields) {
569 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+structConfId);
570 }
571 if (tokens[begLabelAsymIdIdx].equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme
572 String id = tokens[idIdx];
573 Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
574 Matcher m = p.matcher(id);
575 String ssId="Unknown";
576 if (m.find()){
577 ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
578 }
579 int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
580 int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
581 char ssType = SecStrucElement.OTHER;
582 if(id.startsWith("H")) {
583 ssType = SecStrucElement.HELIX;
584 } else if(id.startsWith("T")) {
585 ssType = SecStrucElement.TURN;
586 } else {
587 System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
588 }
589 if(ssType != SecStrucElement.OTHER) {
590 SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
591 secondaryStructure.add(ssElem);
592 }
593 }
594 }
595 }
596 if (intStructSheet!=null) {
597 int recordCount=0;
598
599 fcif.seek(intStructSheet[0]);
600 while(fcif.getFilePointer()<intStructSheet[1]) {
601 recordCount++;
602 // struct_sheet_range (optional element), SHEETs
603 int sheetIdIdx = fields2indices.get(structSheetId+".sheet_id");
604 int idIdx = fields2indices.get(structSheetId+".id");
605 int begLabelAsymIdIdx = fields2indices.get(structSheetId+".beg_label_asym_id");
606 int begLabelSeqIdIdx = fields2indices.get(structSheetId+".beg_label_seq_id");
607 int endLabelSeqIdIdx = fields2indices.get(structSheetId+".end_label_seq_id");
608 //sheet_id=0, id=1, beg_label_seq_id=4, end_label_seq_id=8, beg_label_asym_id=3
609 //0 1 2 3 4 5 6 7 8 910 1112 13 1415 16
610 //A 1 ARG A 14 ? LYS A 19 ? ? ARG A 14 LYS A 19
611 int numFields = ids2fieldsIdx.get(structSheetId);
612 String[] tokens = tokeniseFields(numFields);
613 if (tokens.length!=numFields) {
614 throw new CiffileFormatError("Incorrect number of fields for record "+recordCount+" in loop element "+structSheetId);
615 }
616 if (tokens[begLabelAsymIdIdx].equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme
617 String sheetid = tokens[sheetIdIdx];
618 int id = Integer.parseInt(tokens[idIdx]);
619 int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
620 int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
621 String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
622 SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
623 secondaryStructure.add(ssElem);
624 }
625 }
626 }
627 }
628
629 /**
630 * Splits a space separated line into its individual tokens returning an array with all tokens
631 * Takes care of all particularities of the format of a record in the ciffiles:
632 * - fields within records are separated by spaces
633 * - spaces can be used within quoted strings (at the moment this only supports single quotes, not double)
634 * - free style with all characters allowed if something is quoted with \n; ;\n
635 * The java class StreamTokenizer could have done all this, but it was limited to do all that we needed to do
636 *
637 *
638 * This method is black magic. I don't fully understand it myself as I write it.
639 * If you need to come back to this and read it, good luck!!
640 *
641 * @param numberTokens
642 * @return
643 */
644 private String[] tokeniseFields(int numberTokens) throws IOException {
645 String[] tokens = new String[numberTokens];
646 // initialise tokens to empty strings
647 for (int i=0; i<numberTokens;i++){
648 tokens[i]="";
649 }
650
651 int i = 0;
652 char lastChar=' ';
653 char quoteChar = 0;
654 while (true) {
655 char currentChar = (char)fcif.readByte();
656
657 // '' quoting
658 if (quoteChar!=';' && currentChar=='\'' && (lastChar==' ' || lastChar=='\n')){
659 quoteChar = '\'';
660 }
661 else if (quoteChar!=';' && currentChar==' ' && lastChar=='\''){
662 quoteChar = 0;
663 }
664 // "" quoting
665 if (quoteChar!=';' && currentChar=='"' && (lastChar==' ' || lastChar=='\n')){
666 quoteChar = '"';
667 }
668 else if (quoteChar!=';' && currentChar==' ' && lastChar=='"'){
669 quoteChar = 0;
670 }
671 // ;; quoting (multi-line quoting)
672 if (quoteChar!=';' && currentChar==';' && lastChar=='\n'){
673 quoteChar = ';';
674 }
675 else if (quoteChar==';' && currentChar==';' && lastChar=='\n'){
676 quoteChar = 0;
677 }
678
679 // reading field
680 if (quoteChar==0) { // not within quotes
681 if (currentChar==' ' || currentChar=='\n') {
682 if (currentChar!=lastChar && !(currentChar=='\n' && lastChar==' ')) i++; // we only increment when we move from a non-space to a space or from non-space to \n
683 } else {
684 tokens[i]+=currentChar;
685 // if we are adding the last ; of a ;;-quoted string then strip the starting ';' and ending "\n;" out
686 if (currentChar==';' && lastChar=='\n' && tokens[i].startsWith(";") && tokens[i].endsWith("\n;")) {
687 tokens[i]=tokens[i].replaceFirst("^;", "");
688 tokens[i]=tokens[i].replaceFirst("\n;","");
689 }
690 }
691 } else { // within quotes (of type '', "" or ;;)
692 tokens[i]+=currentChar;
693 // if string is surrounded by '' or "" then strip them out (except when string is length 1 and thus beginning and end are quoteChar)
694 if (tokens[i].length()!=1 && tokens[i].startsWith(Character.toString(quoteChar)) && tokens[i].endsWith(Character.toString(quoteChar))) tokens[i]=tokens[i].replaceAll(Character.toString(quoteChar), "");
695
696 }
697
698 lastChar = currentChar;
699
700 if (i==numberTokens) {
701 // for the last record of an element it is important to have read up to the end of the line (including the '\n'),
702 // otherwise the condition : "while (current_pointer<max_pointer_of_this_element)" won't work
703 // we read one more character at a time: test whether it is a ' ' or a '\n', if not then we have overread so we need to rewind back
704 while (true) {
705 long currentPos = fcif.getFilePointer(); // get current position to rewind back to it if needed
706 currentChar = (char) fcif.readByte();
707 if (currentChar!='\n' && currentChar!=' '){
708 fcif.seek(currentPos);
709 break;
710 }
711 }
712 return tokens;
713 }
714 }
715 }
716
717 }