ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/CiffilePdb.java
Revision: 315
Committed: Thu Sep 13 08:13:40 2007 UTC (17 years ago) by duarte
File size: 22939 byte(s)
Log Message:
Now parsing each element in different methods (re-opening the file). Parsing first pdbx_poly_seq_scheme so we get the chainCode that we can use for reading the rest
Now taking care of cases where struct_sheet_range is not a loop element
In tokeniseFields now also unquoting double-quoted strings
Tested on a set of 12000 entries
Line User Rev File contents
1 duarte 306 package proteinstructure;
2    
3     import java.io.BufferedReader;
4     import java.io.File;
5    
6     import java.io.FileReader;
7     import java.io.IOException;
8     import java.util.ArrayList;
9     import java.util.Collections;
10     import java.util.HashMap;
11     import java.util.TreeMap;
12     import java.util.TreeSet;
13     import java.util.regex.Matcher;
14     import java.util.regex.Pattern;
15    
16     import javax.vecmath.Point3d;
17    
18    
19     /**
20     * A single chain pdb protein structure loaded from a mmCIF file
21     *
22     * @author Jose Duarte
23     * Class: CiffilePdb
24     * Package: proteinstructure
25     */
26     public class CiffilePdb extends Pdb {
27    
28    
29     private String ciffile;
30 duarte 307
31     // fields we will read
32     private static final String entryId = "_entry";
33     private static final String atomSiteId = "_atom_site";
34     private static final String atomSitesAltId = "_atom_sites_alt";
35     private static final String pdbxPolySeqId = "_pdbx_poly_seq_scheme";
36     private static final String structConfId = "_struct_conf";
37     private static final String structSheetId = "_struct_sheet_range";
38     private static final String[] ids = {entryId,atomSitesAltId,atomSiteId,pdbxPolySeqId,structConfId,structSheetId};
39 duarte 306
40     private TreeMap<String,Integer> ids2elements; // map of ids to element serials
41 duarte 307 private TreeMap<String,String> fields2values; // map of field names (id.field) to values (for non-loop elements)
42     private TreeMap<String,Integer> fields2indices; // map of field names (id.field) to index (for loop elements)
43 duarte 314 private TreeMap<String,Integer> ids2fieldsIdx; // map of element ids to field index counter (after parseCifFile method done it contains the total number of fields per element id)
44 duarte 306 private TreeSet<Integer> loopElements; // contains list of elements that are of loop type
45     private TreeMap<Integer,Interval> loopelements2contentIndex; // begin and end line index of each loop element
46 duarte 309
47     private String altLoc;
48 duarte 307
49 duarte 306 /**
50     * Constructs Pdb object given pdb code and pdb chain code.
51     * Model will be DEFAULT_MODEL
52     * @param ciffile
53     * @param pdbChainCode
54     * @throws PdbChainCodeNotFoundError
55     * @throws IOException
56 duarte 314 * @throws CiffileFormatError
57 duarte 306 */
58 duarte 314 public CiffilePdb (String ciffile, String pdbChainCode) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
59 duarte 306 this(ciffile, pdbChainCode, DEFAULT_MODEL);
60     }
61    
62     /**
63     * Constructs Pdb object given pdb code, pdb chain code, model serial, source db and a MySQLConnection.
64     * The db must be a pdbase database
65     * @param ciffile
66     * @param pdbChainCode
67     * @param model_serial
68     * @throws PdbChainCodeNotFoundError
69     * @throws IOException
70 duarte 314 * @throws CiffileFormatError
71 duarte 306 */
72 duarte 314 public CiffilePdb (String ciffile, String pdbChainCode, int model_serial) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
73 duarte 306 this.ciffile = ciffile;
74     this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
75     this.model=model_serial;
76    
77     parseCifFile();
78    
79     this.pdbCode = readPdbCode();
80    
81 duarte 315 readAtomAltLocs(); // sets altLoc String (needed in readAtomSite to get the right alt atom locations)
82 duarte 309
83 duarte 315 readPdbxPolySeq(); // sets chainCode, sequence, pdbresser2resser
84    
85     readAtomSite(); // populates resser_atom2atomserial, resser2restype, atomser2coord, atomser2resser
86    
87 duarte 306 secondaryStructure = new SecondaryStructure(); // create empty secondary structure first to make sure object is not null
88    
89 duarte 315 readSecStructure(); // populates secondaryStructure
90 duarte 306
91     this.fullLength = sequence.length();
92    
93     this.obsLength = resser2restype.size();
94    
95     if(!secondaryStructure.isEmpty()) {
96     secondaryStructure.setComment("CIFfile");
97     }
98    
99     // we initialise resser2pdbresser from the pdbresser2resser HashMap
100     this.resser2pdbresser = new HashMap<Integer, String>();
101     for (String pdbresser:pdbresser2resser.keySet()){
102     resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser);
103     }
104    
105     // initialising atomser2atom from resser_atom2atomserial
106     atomser2atom = new HashMap<Integer, String>();
107     for (String resser_atom:resser_atom2atomserial.keySet()){
108     int atomserial = resser_atom2atomserial.get(resser_atom);
109     String atom = resser_atom.split("_")[1];
110     atomser2atom.put(atomserial,atom);
111     }
112     }
113    
114 duarte 314 private void parseCifFile() throws IOException, CiffileFormatError{
115 duarte 306 // data structures to store the parsed fields
116     ids2elements = new TreeMap<String, Integer>();
117 duarte 307 fields2indices = new TreeMap<String,Integer>();
118 duarte 306 fields2values = new TreeMap<String, String>();
119     loopElements = new TreeSet<Integer>(); // contains list of elements that are of loop type
120     loopelements2contentIndex = new TreeMap<Integer,Interval>();
121 duarte 314 ids2fieldsIdx = new TreeMap<String,Integer>(); // this map holds the field index counters for each element id
122 duarte 306
123     BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
124     int element = 0;
125     String line;
126 duarte 314 line = fcif.readLine(); // read first line
127     Pattern p = Pattern.compile("^data_\\d\\w\\w\\w");
128     if (!p.matcher(line).find()){
129     throw new CiffileFormatError("The file doesn't seem to be a cif file");
130     }
131 duarte 306 int linecount = 1; // we have read one line already, we initialise count to 1
132     while((line = fcif.readLine()) != null ) {
133     linecount++;
134     if (line.startsWith("#")) {
135     element++;
136     continue;
137     }
138     if (line.startsWith("loop_")) {
139     loopElements.add(element);
140     continue;
141     }
142    
143     for (String id:ids){
144 duarte 314 if (!ids2fieldsIdx.containsKey(id)) ids2fieldsIdx.put(id,0);
145     p = Pattern.compile("^"+id+"\\.(\\w+)(?:\\s+(.*))?$");
146 duarte 306 Matcher m = p.matcher(line);
147     if (m.find()){
148     ids2elements.put(id,element);
149 duarte 307 String field = id + "." + m.group(1);
150 duarte 306 if (!loopElements.contains(element)) { // if not a loop element
151 duarte 307 fields2values.put(field, m.group(2)); // 2nd capture group only matches for non-loops where the value of the field is in same line as field name
152     } else { // for loop elements we fill the fields2indices TreeMap
153 duarte 314 fields2indices.put(field,ids2fieldsIdx.get(id));
154 duarte 306 }
155 duarte 314 ids2fieldsIdx.put(id,ids2fieldsIdx.get(id)+1);
156 duarte 306 continue;
157     }
158     }
159     if (!line.startsWith("_") && !line.startsWith("#")){ // not in field definition, we are in values of a loop element
160     if (ids2elements.containsValue(element)) { // if this is one of the fields we want to parse (members of String[] ids)
161     if (!loopelements2contentIndex.containsKey(element)) {
162     //loopelements2content.put(element,line+"\n");
163     Interval interval = new Interval(linecount,linecount);
164     loopelements2contentIndex.put(element,interval);
165     } else {
166     //loopelements2content.put(element,loopelements2content.get(element)+line+"\n");
167     loopelements2contentIndex.get(element).end=linecount;
168     }
169     }
170     }
171     } // end scanning lines
172    
173     fcif.close();
174     }
175    
176     private String readPdbCode(){
177 duarte 311 return fields2values.get(entryId+".id").trim();
178 duarte 306 }
179    
180 duarte 314 private void readAtomAltLocs() throws IOException, CiffileFormatError {
181 duarte 309 // The read of the atom_sites_alt element must be done in a separate scan of the file, previous to scanning the atom_site element
182     // This is because the order of the different elements in the cif files is not guaranteed, so atom_sites_alt can come before or after atom_site
183     // (and altLoc needs to be set before starting reading the atom_site element)
184    
185     ArrayList<String> altLocs = new ArrayList<String>();
186     // we initialise to ".", this is the default value in the cif files for the alt loc field. If no atom_sites_alt is present it's ok to stay with this value
187     altLoc = ".";
188    
189     // atom_sites_alt element is optional
190     Interval intAtomSitesAlt = null;
191     if (ids2elements.containsKey(atomSitesAltId)){
192     intAtomSitesAlt = loopelements2contentIndex.get(ids2elements.get(atomSitesAltId));
193     }
194    
195     BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
196     String line;
197     int linecount=0;
198     while((line = fcif.readLine()) != null ) {
199     linecount++;
200     // atom_sites_alt (optional element)
201     if (intAtomSitesAlt!=null && linecount>=intAtomSitesAlt.beg && linecount<=intAtomSitesAlt.end){
202     int idIdx = fields2indices.get(atomSitesAltId+".id");
203     // id=0
204     // A ?
205 duarte 314 String[] tokens = tokeniseFields(line);
206     if (tokens.length!=ids2fieldsIdx.get(atomSitesAltId)) {
207     throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+atomSitesAltId);
208     }
209 duarte 309 if (!tokens[idIdx].equals(".")) {
210     altLocs.add(tokens[idIdx]);
211     }
212     }
213     }
214     fcif.close();
215     if (!altLocs.isEmpty()){
216     altLoc = Collections.min(altLocs);
217     }
218     }
219    
220 duarte 315 private void readAtomSite() throws IOException, PdbChainCodeNotFoundError, CiffileFormatError {
221 duarte 306 resser_atom2atomserial = new HashMap<String,Integer>();
222     resser2restype = new HashMap<Integer,String>();
223     atomser2coord = new HashMap<Integer,Point3d>();
224     atomser2resser = new HashMap<Integer,Integer>();
225    
226     ArrayList<String> aalist=AA.aas(); // list of standard 3 letter code aminoacids
227    
228 duarte 307 Interval intAtomSite = loopelements2contentIndex.get(ids2elements.get(atomSiteId));
229 duarte 306
230     boolean empty = true;
231     BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
232     String line;
233     int linecount=0;
234     while((line = fcif.readLine()) != null ) {
235     linecount++;
236     // atom_site
237 duarte 307 if (linecount>=intAtomSite.beg && linecount<=intAtomSite.end){
238 duarte 308 int groupPdbIdx = fields2indices.get(atomSiteId+".group_PDB");
239 duarte 307 int idIdx = fields2indices.get(atomSiteId+".id");
240     int labelAtomIdIdx = fields2indices.get(atomSiteId+".label_atom_id");
241     int labelAltIdIdx = fields2indices.get(atomSiteId+".label_alt_id");
242     int labelCompIdIdx = fields2indices.get(atomSiteId+".label_comp_id");
243     int labelAsymIdIdx = fields2indices.get(atomSiteId+".label_asym_id");
244     int labelSeqIdIdx = fields2indices.get(atomSiteId+".label_seq_id");
245     int cartnXIdx = fields2indices.get(atomSiteId+".Cartn_x");
246     int cartnYIdx = fields2indices.get(atomSiteId+".Cartn_y");
247     int cartnZIdx = fields2indices.get(atomSiteId+".Cartn_z");
248     int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num");
249 duarte 308 // group_PDB=0, auth_asym_id=22, pdbx_PDB_model_num=24, label_alt_id=4, id=1, label_atom_id=3, label_comp_id=5, label_asym_id=6, label_seq_id=8, Cartn_x=10, Cartn_y=11, Cartn_z=12
250 duarte 306 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 151617181920 2122 23 24
251     //ATOM 2 C CA . MET A 1 1 ? 38.591 8.543 15.660 1.00 77.79 ? ? ? ? ? 1 MET A CA 1
252 duarte 314 String[] tokens = tokeniseFields(line);
253     if (tokens.length!=ids2fieldsIdx.get(atomSiteId)) {
254     throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+atomSiteId);
255     }
256 duarte 315 if (tokens[groupPdbIdx].equals("ATOM") && tokens[labelAsymIdIdx].equals(chainCode) && Integer.parseInt(tokens[pdbxPDBModelNumIdx])==model) { // match our given chain and model
257 duarte 306 empty = false;
258 duarte 307 if (tokens[labelAltIdIdx].equals(".") || tokens[labelAltIdIdx].equals(altLoc)) { // don't read lines with something else as "." or altLoc
259     int atomserial=Integer.parseInt(tokens[idIdx]); // id
260     String atom = tokens[labelAtomIdIdx]; // label_atom_id
261     String res_type = tokens[labelCompIdIdx]; // label_comp_id
262     int res_serial = Integer.parseInt(tokens[labelSeqIdIdx]); // label_seq_id
263     double x = Double.parseDouble(tokens[cartnXIdx]); // Cartn_x
264     double y = Double.parseDouble(tokens[cartnYIdx]); // Cartn_y
265     double z = Double.parseDouble(tokens[cartnZIdx]); // Cartn_z
266 duarte 306 Point3d coords = new Point3d(x,y,z);
267     if (aalist.contains(res_type)) {
268     atomser2coord.put(atomserial, coords);
269     atomser2resser.put(atomserial, res_serial);
270     resser2restype.put(res_serial, res_type);
271     ArrayList<String> atomlist = aas2atoms.get(res_type);
272     atomlist.add("OXT"); // the extra atom OXT is there in the last residue of the chain
273     if (atomlist.contains(atom)){
274     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
275     }
276     }
277     }
278     }
279     continue;
280     }
281 duarte 315 }
282     fcif.close();
283     if (empty) { // no atom data was found for given pdb chain code and model
284     throw new PdbChainCodeNotFoundError("Couldn't find _atom_site data for given pdbChainCode: "+pdbChainCode+", model: "+model);
285     }
286     }
287    
288     private void readPdbxPolySeq() throws IOException, CiffileFormatError {
289     pdbresser2resser = new HashMap<String, Integer>();
290     sequence = "";
291    
292     ArrayList<String> aalist=AA.aas(); // list of standard 3 letter code aminoacids
293    
294     String chainCodeStr=pdbChainCode;
295     if (pdbChainCode.equals("NULL")) chainCodeStr="A";
296    
297     Interval intPdbxPoly = loopelements2contentIndex.get(ids2elements.get(pdbxPolySeqId));
298    
299     BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
300     String line;
301     int linecount=0;
302     while((line = fcif.readLine()) != null ) {
303     linecount++;
304 duarte 306 // pdbx_poly_seq_scheme
305     if (linecount>=intPdbxPoly.beg && linecount<=intPdbxPoly.end){
306 duarte 315 int asymIdIdx = fields2indices.get(pdbxPolySeqId+".asym_id");
307 duarte 307 int seqIdIdx = fields2indices.get(pdbxPolySeqId+".seq_id");
308     int authSeqNumIdx = fields2indices.get(pdbxPolySeqId+".auth_seq_num");
309     int pdbInsCodeIdx = fields2indices.get(pdbxPolySeqId+".pdb_ins_code");
310     int monIdIdx = fields2indices.get(pdbxPolySeqId+".mon_id");
311 duarte 308 int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id");
312 duarte 306 // asym_id=0, seq_id=2, auth_seq_num=6, pdb_ins_code=10, mon_id=3
313     // 0 1 2 3 4 5 6 7 8 910
314     // A 1 1 ASP 1 1 1 ASP ASP A .
315 duarte 314 String[] tokens = tokeniseFields(line);
316     if (tokens.length!=ids2fieldsIdx.get(pdbxPolySeqId)) {
317     throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+pdbxPolySeqId);
318     }
319 duarte 308 if (tokens[pdbStrandIdIdx].equals(chainCodeStr)) { // we can't rely on using chainCode, because the order of elements is not guranteed (pdbx_poly_seq_scheme doesn't always come after atom_site)
320 duarte 307 int res_serial = Integer.parseInt(tokens[seqIdIdx]); // seq_id
321 duarte 315 chainCode = tokens[asymIdIdx];
322 duarte 311 //TODO revise: do we want auth_seq_num or pdb_seq_num here??
323 duarte 307 String pdb_res_serial = tokens[authSeqNumIdx]; // auth_seq_num
324     String pdb_ins_code = tokens[pdbInsCodeIdx]; // pdb_ins_code
325 duarte 306 String pdb_res_serial_with_icode = pdb_res_serial;
326     if (!pdb_ins_code.equals(".")) {
327     pdb_res_serial_with_icode=pdb_res_serial+pdb_ins_code;
328     }
329 duarte 307 String res_type = tokens[monIdIdx]; // mon_id
330 duarte 306 // sequence
331     if (aalist.contains(res_type)){
332     sequence+=AA.threeletter2oneletter(res_type);
333     } else {
334     sequence+=NONSTANDARD_AA_LETTER;
335     }
336     // pdbresser2resser
337 duarte 310 if (!pdb_res_serial_with_icode.startsWith("?")) { // question marks are author missing serials, we don't want them in the map
338 duarte 308 pdbresser2resser.put(pdb_res_serial_with_icode,res_serial);
339     }
340 duarte 306 }
341     continue;
342     }
343 duarte 315
344     }
345     fcif.close();
346     }
347    
348     private void readSecStructure() throws IOException, CiffileFormatError {
349     secondaryStructure = new SecondaryStructure();
350    
351     // struct_conf element is optional
352     Interval intStructConf = null;
353     if (ids2elements.containsKey(structConfId)) {
354     // if not a loop element then intStructConf stays null (because loopelements2contentIndex will return null)
355     intStructConf = loopelements2contentIndex.get(ids2elements.get(structConfId));
356     }
357     // taking care of cases where struct_conf is not a loop element but a one value field
358     if (ids2elements.containsKey(structConfId) && !loopElements.contains(ids2elements.get(structConfId))){
359     String begChainCode = fields2values.get(structConfId+".beg_label_asym_id").trim();
360     if (begChainCode.equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme
361     String id = fields2values.get(structConfId+".id").trim();
362     int beg = Integer.parseInt(fields2values.get(structConfId+".beg_label_seq_id").trim());
363     int end = Integer.parseInt(fields2values.get(structConfId+".end_label_seq_id").trim());
364     Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
365     Matcher m = p.matcher(id);
366     String ssId="Unknown";
367     if (m.find()){
368     ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
369     }
370     char ssType = SecStrucElement.OTHER;
371     if(id.startsWith("H")) {
372     ssType = SecStrucElement.HELIX;
373     } else if(id.startsWith("T")) {
374     ssType = SecStrucElement.TURN;
375     } else {
376     System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
377     }
378     if(ssType != SecStrucElement.OTHER) {
379     SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
380     secondaryStructure.add(ssElem);
381     }
382     }
383     }
384     // struct_sheet_range element is optional
385     Interval intStructSheet = null;
386     if (ids2elements.containsKey(structSheetId)) {
387     // if not a loop element intStructSheet stays null (because loopelements2contentIndex will return null)
388     intStructSheet = loopelements2contentIndex.get(ids2elements.get(structSheetId));
389     }
390     // taking care of cases where struct_sheet_range is not a loop element but a one value field
391     if (ids2elements.containsKey(structSheetId) && !loopElements.contains(ids2elements.get(structSheetId))){
392     String begChainCode = fields2values.get(structSheetId+".beg_label_asym_id").trim();
393     if (begChainCode.equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme
394     String sheetid = fields2values.get(structSheetId+".sheet_id").trim(); //tokens[sheetIdIdx];
395     int id = Integer.parseInt(fields2values.get(structSheetId+".id").trim()); //Integer.parseInt(tokens[idIdx]);
396     int beg = Integer.parseInt(fields2values.get(structSheetId+".beg_label_seq_id").trim()); //tokens[begLabelSeqIdIdx]);
397     int end = Integer.parseInt(fields2values.get(structSheetId+".end_label_seq_id").trim()); //tokens[endLabelSeqIdIdx]);
398     String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
399     SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
400     secondaryStructure.add(ssElem);
401     }
402    
403     }
404    
405     BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
406     String line;
407     int linecount=0;
408     while((line = fcif.readLine()) != null ) {
409     linecount++;
410 duarte 306 // struct_conf (optional element), HELIX and TURN secondary structure
411     if (intStructConf!=null && linecount>=intStructConf.beg && linecount<=intStructConf.end){
412 duarte 307 int idIdx = fields2indices.get(structConfId+".id");
413 duarte 315 int begLabelAsymIdIdx = fields2indices.get(structConfId+".beg_label_asym_id");
414 duarte 307 int begLabelSeqIdIdx = fields2indices.get(structConfId+".beg_label_seq_id");
415     int endLabelSeqIdIdx = fields2indices.get(structConfId+".end_label_seq_id");
416 duarte 306 //id=1, beg_label_seq_id=5, end_label_seq_id=9, beg_label_asym_id=4
417     // 0 1 2 3 4 5 6 7 8 9 10 111213 1415 16 1718 19
418     //HELX_P HELX_P1 1 ASN A 2 ? GLY A 12 ? ASN A 2 GLY A 12 1 ? 11
419 duarte 314 String[] tokens = tokeniseFields(line);
420     if (tokens.length!=ids2fieldsIdx.get(structConfId)) {
421     throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+structConfId);
422     }
423 duarte 315 if (tokens[begLabelAsymIdIdx].equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme
424 duarte 307 String id = tokens[idIdx];
425 duarte 306 Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
426     Matcher m = p.matcher(id);
427     String ssId="Unknown";
428     if (m.find()){
429     ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
430     }
431 duarte 307 int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
432     int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
433 duarte 306 char ssType = SecStrucElement.OTHER;
434     if(id.startsWith("H")) {
435     ssType = SecStrucElement.HELIX;
436     } else if(id.startsWith("T")) {
437     ssType = SecStrucElement.TURN;
438     } else {
439     System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
440     }
441     if(ssType != SecStrucElement.OTHER) {
442     SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
443     secondaryStructure.add(ssElem);
444     }
445     }
446     continue;
447     }
448     // struct_sheet_range (optional element), SHEETs
449     if (intStructSheet!=null && linecount>=intStructSheet.beg && linecount<=intStructSheet.end){
450 duarte 307 int sheetIdIdx = fields2indices.get(structSheetId+".sheet_id");
451     int idIdx = fields2indices.get(structSheetId+".id");
452 duarte 315 int begLabelAsymIdIdx = fields2indices.get(structSheetId+".beg_label_asym_id");
453 duarte 307 int begLabelSeqIdIdx = fields2indices.get(structSheetId+".beg_label_seq_id");
454     int endLabelSeqIdIdx = fields2indices.get(structSheetId+".end_label_seq_id");
455 duarte 306 //sheet_id=0, id=1, beg_label_seq_id=4, end_label_seq_id=8, beg_label_asym_id=3
456     //0 1 2 3 4 5 6 7 8 910 1112 13 1415 16
457     //A 1 ARG A 14 ? LYS A 19 ? ? ARG A 14 LYS A 19
458 duarte 314 String[] tokens = tokeniseFields(line);
459     if (tokens.length!=ids2fieldsIdx.get(structSheetId)) {
460     throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+structSheetId);
461     }
462 duarte 315 if (tokens[begLabelAsymIdIdx].equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme
463 duarte 307 String sheetid = tokens[sheetIdIdx];
464     int id = Integer.parseInt(tokens[idIdx]);
465     int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
466     int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
467 duarte 306 String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
468     SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
469     secondaryStructure.add(ssElem);
470     }
471     continue;
472     }
473 duarte 315
474 duarte 306 }
475     fcif.close();
476     }
477 duarte 314
478     /**
479     * Splits a space separated line into its individual tokens returning an array with all tokens
480     * Takes care of quoted fields that contain spaces
481     * e.g. HELX_P HELX_P2 H4 GLY A 111 ? GLU A 127 ? GLY A 112 GLU A 128 1 'SEE REMARK 650' 17
482     * @param line
483     * @return
484     */
485     private String[] tokeniseFields(String line) {
486     String[] tokens;
487 duarte 315 if (line.contains("'")) { // if there are single quotes in the line
488 duarte 314 ArrayList<String> tokensAL = new ArrayList<String>();
489     Pattern p = Pattern.compile("'[^']*'|[^ \\t]+"); // note: regex doesn't work inverting the order of expressions in the 'OR' (in python it does!)
490     Matcher m = p.matcher(line);
491     while (m.find()){
492     tokensAL.add(m.group());
493     }
494     tokens = new String[tokensAL.size()];
495     tokensAL.toArray(tokens);
496     } else { // if no quotes we simply split by columns using spaces as delimiters
497     tokens = line.split("\\s+");
498     }
499 duarte 315 if (line.contains("\"")){ // in some rare cases some fields are quoted with double quotes, this seems to be to escape single quotes within them (used normally as a "prime" symbol)
500     for (int i=0;i<tokens.length;i++){ // we get rid of the double quoting
501     tokens[i] = tokens[i].replaceAll("\"", "");
502     }
503     }
504 duarte 314 return tokens;
505     }
506    
507 duarte 306 }