1 |
duarte |
306 |
package proteinstructure; |
2 |
|
|
|
3 |
|
|
import java.io.BufferedReader; |
4 |
|
|
import java.io.File; |
5 |
|
|
|
6 |
|
|
import java.io.FileReader; |
7 |
|
|
import java.io.IOException; |
8 |
|
|
import java.util.ArrayList; |
9 |
|
|
import java.util.Collections; |
10 |
|
|
import java.util.HashMap; |
11 |
|
|
import java.util.TreeMap; |
12 |
|
|
import java.util.TreeSet; |
13 |
|
|
import java.util.regex.Matcher; |
14 |
|
|
import java.util.regex.Pattern; |
15 |
|
|
|
16 |
|
|
import javax.vecmath.Point3d; |
17 |
|
|
|
18 |
|
|
|
19 |
|
|
/** |
20 |
|
|
* A single chain pdb protein structure loaded from a mmCIF file |
21 |
|
|
* |
22 |
|
|
* @author Jose Duarte |
23 |
|
|
* Class: CiffilePdb |
24 |
|
|
* Package: proteinstructure |
25 |
|
|
*/ |
26 |
|
|
public class CiffilePdb extends Pdb {

    /** Path of the mmCIF file this object was loaded from. */
    private String ciffile;

    // mmCIF element (category) ids we will read from the file
    private static final String entryId = "_entry";
    private static final String atomSiteId = "_atom_site";
    private static final String atomSitesAltId = "_atom_sites_alt";
    private static final String pdbxPolySeqId = "_pdbx_poly_seq_scheme";
    private static final String structConfId = "_struct_conf";
    private static final String structSheetId = "_struct_sheet_range";
    // all ids above, iterated over while scanning the file in parseCifFile
    private static final String[] ids = {entryId,atomSitesAltId,atomSiteId,pdbxPolySeqId,structConfId,structSheetId};

    private TreeMap<String,Integer> ids2elements;                // map of ids to element serials
    private TreeMap<String,String> fields2values;                // map of field names (id.field) to values (for non-loop elements)
    private TreeMap<String,Integer> fields2indices;              // map of field names (id.field) to index (for loop elements)
    private TreeMap<String,Integer> ids2fieldsIdx;               // map of element ids to field index counter (after parseCifFile method done it contains the total number of fields per element id)
    private TreeSet<Integer> loopElements;                       // contains list of elements that are of loop type
    private TreeMap<Integer,Interval> loopelements2contentIndex; // begin and end line index of each loop element

    // alternative atom location code chosen for this structure ("." when no
    // _atom_sites_alt element is present); set by readAtomAltLocs
    private String altLoc;
48 |
duarte |
307 |
|
49 |
duarte |
306 |
/**
 * Constructs Pdb object given pdb code and pdb chain code.
 * Model will be DEFAULT_MODEL
 * @param ciffile path of the mmCIF file to read
 * @param pdbChainCode pdb chain code of the chain to load
 * @throws PdbChainCodeNotFoundError if no atom data is found for the given chain
 * @throws IOException if the file can't be read
 * @throws CiffileFormatError if the file is not a valid mmCIF file
 */
public CiffilePdb (String ciffile, String pdbChainCode) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
    // delegate to the full constructor using the default model serial
    this(ciffile, pdbChainCode, DEFAULT_MODEL);
}
61 |
|
|
|
62 |
|
|
/**
 * Constructs Pdb object given pdb code, pdb chain code and model serial.
 * Parses the given mmCIF file and populates all inherited Pdb data structures.
 * @param ciffile path of the mmCIF file to read
 * @param pdbChainCode pdb chain code of the chain to load
 * @param model_serial model number to load
 * @throws PdbChainCodeNotFoundError if no atom data is found for the given chain and model
 * @throws IOException if the file can't be read
 * @throws CiffileFormatError if the file is not a valid mmCIF file
 */
public CiffilePdb (String ciffile, String pdbChainCode, int model_serial) throws PdbChainCodeNotFoundError, IOException, CiffileFormatError {
    this.ciffile = ciffile;
    this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
    this.model=model_serial;

    // NOTE: the read* calls below are order-dependent; each one consumes the
    // element index built by parseCifFile and some set state needed by later ones
    parseCifFile();

    this.pdbCode = readPdbCode();

    readAtomAltLocs(); // sets altLoc String (needed in readAtomSite to get the right alt atom locations)

    readPdbxPolySeq(); // sets chainCode, sequence, pdbresser2resser

    readAtomSite(); // populates resser_atom2atomserial, resser2restype, atomser2coord, atomser2resser

    secondaryStructure = new SecondaryStructure(); // create empty secondary structure first to make sure object is not null

    readSecStructure(); // populates secondaryStructure

    this.fullLength = sequence.length();

    this.obsLength = resser2restype.size();

    if(!secondaryStructure.isEmpty()) {
        secondaryStructure.setComment("CIFfile");
    }

    // we initialise resser2pdbresser from the pdbresser2resser HashMap
    this.resser2pdbresser = new HashMap<Integer, String>();
    for (String pdbresser:pdbresser2resser.keySet()){
        resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser);
    }

    // initialising atomser2atom from resser_atom2atomserial
    // (keys of resser_atom2atomserial have the form "<resser>_<atom>")
    atomser2atom = new HashMap<Integer, String>();
    for (String resser_atom:resser_atom2atomserial.keySet()){
        int atomserial = resser_atom2atomserial.get(resser_atom);
        String atom = resser_atom.split("_")[1];
        atomser2atom.put(atomserial,atom);
    }
}
113 |
|
|
|
114 |
duarte |
314 |
/**
 * Scans the whole cif file once, building an index of where each element
 * (category) of interest lives: which element serial each id maps to, which
 * elements are loops, the field name to column index mapping for loop
 * elements, the field name to value mapping for non-loop elements, and the
 * begin/end line numbers of each loop element's data block.
 * The actual data is read later in separate passes by the read* methods.
 * @throws IOException if the file can't be read
 * @throws CiffileFormatError if the first line doesn't look like a cif data header
 */
private void parseCifFile() throws IOException, CiffileFormatError{
    // data structures to store the parsed fields
    ids2elements = new TreeMap<String, Integer>();
    fields2indices = new TreeMap<String,Integer>();
    fields2values = new TreeMap<String, String>();
    loopElements = new TreeSet<Integer>(); // contains list of elements that are of loop type
    loopelements2contentIndex = new TreeMap<Integer,Interval>();
    ids2fieldsIdx = new TreeMap<String,Integer>(); // this map holds the field index counters for each element id

    BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
    int element = 0; // current element serial; incremented at every "#" separator line
    String line;
    line = fcif.readLine(); // read first line
    // a cif file must start with "data_" followed by a pdb code (digit + 3 word chars)
    Pattern p = Pattern.compile("^data_\\d\\w\\w\\w");
    if (!p.matcher(line).find()){
        throw new CiffileFormatError("The file doesn't seem to be a cif file");
    }
    int linecount = 1; // we have read one line already, we initialise count to 1
    while((line = fcif.readLine()) != null ) {
        linecount++;
        if (line.startsWith("#")) {
            // "#" lines separate elements: move on to the next element serial
            element++;
            continue;
        }
        if (line.startsWith("loop_")) {
            // the element that follows a "loop_" keyword is a loop (table) element
            loopElements.add(element);
            continue;
        }

        for (String id:ids){
            if (!ids2fieldsIdx.containsKey(id)) ids2fieldsIdx.put(id,0);
            // matches "<id>.<field>" optionally followed by an inline value (non-loop case)
            p = Pattern.compile("^"+id+"\\.(\\w+)(?:\\s+(.*))?$");
            Matcher m = p.matcher(line);
            if (m.find()){
                ids2elements.put(id,element);
                String field = id + "." + m.group(1);
                if (!loopElements.contains(element)) { // if not a loop element
                    fields2values.put(field, m.group(2)); // 2nd capture group only matches for non-loops where the value of the field is in same line as field name
                } else { // for loop elements we fill the fields2indices TreeMap
                    fields2indices.put(field,ids2fieldsIdx.get(id));
                }
                ids2fieldsIdx.put(id,ids2fieldsIdx.get(id)+1); // count fields seen so far for this id
                continue;
            }
        }
        if (!line.startsWith("_") && !line.startsWith("#")){ // not in field definition, we are in values of a loop element
            if (ids2elements.containsValue(element)) { // if this is one of the fields we want to parse (members of String[] ids)
                if (!loopelements2contentIndex.containsKey(element)) {
                    // first data line of this loop: open the interval
                    Interval interval = new Interval(linecount,linecount);
                    loopelements2contentIndex.put(element,interval);
                } else {
                    // extend the interval to cover every subsequent data line
                    loopelements2contentIndex.get(element).end=linecount;
                }
            }
        }
    } // end scanning lines

    fcif.close();
}
175 |
|
|
|
176 |
|
|
private String readPdbCode(){ |
177 |
duarte |
311 |
return fields2values.get(entryId+".id").trim(); |
178 |
duarte |
306 |
} |
179 |
|
|
|
180 |
duarte |
314 |
/**
 * Reads the optional _atom_sites_alt element and sets the altLoc field to the
 * lowest (lexicographically smallest) alternative location code found, or "."
 * if none are present.
 * @throws IOException if the file can't be read
 * @throws CiffileFormatError if a data line has the wrong number of fields
 */
private void readAtomAltLocs() throws IOException, CiffileFormatError {
    // The read of the atom_sites_alt element must be done in a separate scan of the file, previous to scanning the atom_site element
    // This is because the order of the different elements in the cif files is not guaranteed, so atom_sites_alt can come before or after atom_site
    // (and altLoc needs to be set before starting reading the atom_site element)

    ArrayList<String> altLocs = new ArrayList<String>();
    // we initialise to ".", this is the default value in the cif files for the alt loc field. If no atom_sites_alt is present it's ok to stay with this value
    altLoc = ".";

    // atom_sites_alt element is optional
    Interval intAtomSitesAlt = null;
    if (ids2elements.containsKey(atomSitesAltId)){
        intAtomSitesAlt = loopelements2contentIndex.get(ids2elements.get(atomSitesAltId));
    }

    BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
    String line;
    int linecount=0;
    while((line = fcif.readLine()) != null ) {
        linecount++;
        // only lines within the atom_sites_alt data interval are of interest
        if (intAtomSitesAlt!=null && linecount>=intAtomSitesAlt.beg && linecount<=intAtomSitesAlt.end){
            int idIdx = fields2indices.get(atomSitesAltId+".id");
            // example: id=0
            // A ?
            String[] tokens = tokeniseFields(line);
            if (tokens.length!=ids2fieldsIdx.get(atomSitesAltId)) {
                throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+atomSitesAltId);
            }
            if (!tokens[idIdx].equals(".")) { // "." is the no-alt-loc marker, skip it
                altLocs.add(tokens[idIdx]);
            }
        }
    }
    fcif.close();
    if (!altLocs.isEmpty()){
        // pick the smallest code (e.g. "A" when codes are A, B, ...)
        altLoc = Collections.min(altLocs);
    }
}
219 |
|
|
|
220 |
duarte |
315 |
private void readAtomSite() throws IOException, PdbChainCodeNotFoundError, CiffileFormatError { |
221 |
duarte |
306 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
222 |
|
|
resser2restype = new HashMap<Integer,String>(); |
223 |
|
|
atomser2coord = new HashMap<Integer,Point3d>(); |
224 |
|
|
atomser2resser = new HashMap<Integer,Integer>(); |
225 |
|
|
|
226 |
|
|
ArrayList<String> aalist=AA.aas(); // list of standard 3 letter code aminoacids |
227 |
|
|
|
228 |
duarte |
307 |
Interval intAtomSite = loopelements2contentIndex.get(ids2elements.get(atomSiteId)); |
229 |
duarte |
306 |
|
230 |
|
|
boolean empty = true; |
231 |
|
|
BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile))); |
232 |
|
|
String line; |
233 |
|
|
int linecount=0; |
234 |
|
|
while((line = fcif.readLine()) != null ) { |
235 |
|
|
linecount++; |
236 |
|
|
// atom_site |
237 |
duarte |
307 |
if (linecount>=intAtomSite.beg && linecount<=intAtomSite.end){ |
238 |
duarte |
308 |
int groupPdbIdx = fields2indices.get(atomSiteId+".group_PDB"); |
239 |
duarte |
307 |
int idIdx = fields2indices.get(atomSiteId+".id"); |
240 |
|
|
int labelAtomIdIdx = fields2indices.get(atomSiteId+".label_atom_id"); |
241 |
|
|
int labelAltIdIdx = fields2indices.get(atomSiteId+".label_alt_id"); |
242 |
|
|
int labelCompIdIdx = fields2indices.get(atomSiteId+".label_comp_id"); |
243 |
|
|
int labelAsymIdIdx = fields2indices.get(atomSiteId+".label_asym_id"); |
244 |
|
|
int labelSeqIdIdx = fields2indices.get(atomSiteId+".label_seq_id"); |
245 |
|
|
int cartnXIdx = fields2indices.get(atomSiteId+".Cartn_x"); |
246 |
|
|
int cartnYIdx = fields2indices.get(atomSiteId+".Cartn_y"); |
247 |
|
|
int cartnZIdx = fields2indices.get(atomSiteId+".Cartn_z"); |
248 |
|
|
int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num"); |
249 |
duarte |
308 |
// group_PDB=0, auth_asym_id=22, pdbx_PDB_model_num=24, label_alt_id=4, id=1, label_atom_id=3, label_comp_id=5, label_asym_id=6, label_seq_id=8, Cartn_x=10, Cartn_y=11, Cartn_z=12 |
250 |
duarte |
306 |
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 151617181920 2122 23 24 |
251 |
|
|
//ATOM 2 C CA . MET A 1 1 ? 38.591 8.543 15.660 1.00 77.79 ? ? ? ? ? 1 MET A CA 1 |
252 |
duarte |
314 |
String[] tokens = tokeniseFields(line); |
253 |
|
|
if (tokens.length!=ids2fieldsIdx.get(atomSiteId)) { |
254 |
|
|
throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+atomSiteId); |
255 |
|
|
} |
256 |
duarte |
315 |
if (tokens[groupPdbIdx].equals("ATOM") && tokens[labelAsymIdIdx].equals(chainCode) && Integer.parseInt(tokens[pdbxPDBModelNumIdx])==model) { // match our given chain and model |
257 |
duarte |
306 |
empty = false; |
258 |
duarte |
307 |
if (tokens[labelAltIdIdx].equals(".") || tokens[labelAltIdIdx].equals(altLoc)) { // don't read lines with something else as "." or altLoc |
259 |
|
|
int atomserial=Integer.parseInt(tokens[idIdx]); // id |
260 |
|
|
String atom = tokens[labelAtomIdIdx]; // label_atom_id |
261 |
|
|
String res_type = tokens[labelCompIdIdx]; // label_comp_id |
262 |
|
|
int res_serial = Integer.parseInt(tokens[labelSeqIdIdx]); // label_seq_id |
263 |
|
|
double x = Double.parseDouble(tokens[cartnXIdx]); // Cartn_x |
264 |
|
|
double y = Double.parseDouble(tokens[cartnYIdx]); // Cartn_y |
265 |
|
|
double z = Double.parseDouble(tokens[cartnZIdx]); // Cartn_z |
266 |
duarte |
306 |
Point3d coords = new Point3d(x,y,z); |
267 |
|
|
if (aalist.contains(res_type)) { |
268 |
|
|
atomser2coord.put(atomserial, coords); |
269 |
|
|
atomser2resser.put(atomserial, res_serial); |
270 |
|
|
resser2restype.put(res_serial, res_type); |
271 |
|
|
ArrayList<String> atomlist = aas2atoms.get(res_type); |
272 |
|
|
atomlist.add("OXT"); // the extra atom OXT is there in the last residue of the chain |
273 |
|
|
if (atomlist.contains(atom)){ |
274 |
|
|
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
275 |
|
|
} |
276 |
|
|
} |
277 |
|
|
} |
278 |
|
|
} |
279 |
|
|
continue; |
280 |
|
|
} |
281 |
duarte |
315 |
} |
282 |
|
|
fcif.close(); |
283 |
|
|
if (empty) { // no atom data was found for given pdb chain code and model |
284 |
|
|
throw new PdbChainCodeNotFoundError("Couldn't find _atom_site data for given pdbChainCode: "+pdbChainCode+", model: "+model); |
285 |
|
|
} |
286 |
|
|
} |
287 |
|
|
|
288 |
|
|
private void readPdbxPolySeq() throws IOException, CiffileFormatError { |
289 |
|
|
pdbresser2resser = new HashMap<String, Integer>(); |
290 |
|
|
sequence = ""; |
291 |
|
|
|
292 |
|
|
ArrayList<String> aalist=AA.aas(); // list of standard 3 letter code aminoacids |
293 |
|
|
|
294 |
|
|
String chainCodeStr=pdbChainCode; |
295 |
|
|
if (pdbChainCode.equals("NULL")) chainCodeStr="A"; |
296 |
|
|
|
297 |
|
|
Interval intPdbxPoly = loopelements2contentIndex.get(ids2elements.get(pdbxPolySeqId)); |
298 |
|
|
|
299 |
|
|
BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile))); |
300 |
|
|
String line; |
301 |
|
|
int linecount=0; |
302 |
|
|
while((line = fcif.readLine()) != null ) { |
303 |
|
|
linecount++; |
304 |
duarte |
306 |
// pdbx_poly_seq_scheme |
305 |
|
|
if (linecount>=intPdbxPoly.beg && linecount<=intPdbxPoly.end){ |
306 |
duarte |
315 |
int asymIdIdx = fields2indices.get(pdbxPolySeqId+".asym_id"); |
307 |
duarte |
307 |
int seqIdIdx = fields2indices.get(pdbxPolySeqId+".seq_id"); |
308 |
|
|
int authSeqNumIdx = fields2indices.get(pdbxPolySeqId+".auth_seq_num"); |
309 |
|
|
int pdbInsCodeIdx = fields2indices.get(pdbxPolySeqId+".pdb_ins_code"); |
310 |
|
|
int monIdIdx = fields2indices.get(pdbxPolySeqId+".mon_id"); |
311 |
duarte |
308 |
int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id"); |
312 |
duarte |
306 |
// asym_id=0, seq_id=2, auth_seq_num=6, pdb_ins_code=10, mon_id=3 |
313 |
|
|
// 0 1 2 3 4 5 6 7 8 910 |
314 |
|
|
// A 1 1 ASP 1 1 1 ASP ASP A . |
315 |
duarte |
314 |
String[] tokens = tokeniseFields(line); |
316 |
|
|
if (tokens.length!=ids2fieldsIdx.get(pdbxPolySeqId)) { |
317 |
|
|
throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+pdbxPolySeqId); |
318 |
|
|
} |
319 |
duarte |
308 |
if (tokens[pdbStrandIdIdx].equals(chainCodeStr)) { // we can't rely on using chainCode, because the order of elements is not guranteed (pdbx_poly_seq_scheme doesn't always come after atom_site) |
320 |
duarte |
307 |
int res_serial = Integer.parseInt(tokens[seqIdIdx]); // seq_id |
321 |
duarte |
315 |
chainCode = tokens[asymIdIdx]; |
322 |
duarte |
311 |
//TODO revise: do we want auth_seq_num or pdb_seq_num here?? |
323 |
duarte |
307 |
String pdb_res_serial = tokens[authSeqNumIdx]; // auth_seq_num |
324 |
|
|
String pdb_ins_code = tokens[pdbInsCodeIdx]; // pdb_ins_code |
325 |
duarte |
306 |
String pdb_res_serial_with_icode = pdb_res_serial; |
326 |
|
|
if (!pdb_ins_code.equals(".")) { |
327 |
|
|
pdb_res_serial_with_icode=pdb_res_serial+pdb_ins_code; |
328 |
|
|
} |
329 |
duarte |
307 |
String res_type = tokens[monIdIdx]; // mon_id |
330 |
duarte |
306 |
// sequence |
331 |
|
|
if (aalist.contains(res_type)){ |
332 |
|
|
sequence+=AA.threeletter2oneletter(res_type); |
333 |
|
|
} else { |
334 |
|
|
sequence+=NONSTANDARD_AA_LETTER; |
335 |
|
|
} |
336 |
|
|
// pdbresser2resser |
337 |
duarte |
310 |
if (!pdb_res_serial_with_icode.startsWith("?")) { // question marks are author missing serials, we don't want them in the map |
338 |
duarte |
308 |
pdbresser2resser.put(pdb_res_serial_with_icode,res_serial); |
339 |
|
|
} |
340 |
duarte |
306 |
} |
341 |
|
|
continue; |
342 |
|
|
} |
343 |
duarte |
315 |
|
344 |
|
|
} |
345 |
|
|
fcif.close(); |
346 |
|
|
} |
347 |
|
|
|
348 |
|
|
/**
 * Reads the optional _struct_conf (helices/turns) and _struct_sheet_range
 * (strands) elements, populating secondaryStructure for the current chain
 * (chainCode, set previously by readPdbxPolySeq).
 * Both elements can appear either as loop elements or as single-value
 * (non-loop) elements; both forms are handled here.
 * @throws IOException if the file can't be read
 * @throws CiffileFormatError if a data line has the wrong number of fields
 */
private void readSecStructure() throws IOException, CiffileFormatError {
    secondaryStructure = new SecondaryStructure();

    // struct_conf element is optional
    Interval intStructConf = null;
    if (ids2elements.containsKey(structConfId)) {
        // if not a loop element then intStructConf stays null (because loopelements2contentIndex will return null)
        intStructConf = loopelements2contentIndex.get(ids2elements.get(structConfId));
    }
    // taking care of cases where struct_conf is not a loop element but a one value field
    if (ids2elements.containsKey(structConfId) && !loopElements.contains(ids2elements.get(structConfId))){
        String begChainCode = fields2values.get(structConfId+".beg_label_asym_id").trim();
        if (begChainCode.equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme
            String id = fields2values.get(structConfId+".id").trim();
            int beg = Integer.parseInt(fields2values.get(structConfId+".beg_label_seq_id").trim());
            int end = Integer.parseInt(fields2values.get(structConfId+".end_label_seq_id").trim());
            // ids look like e.g. "HELX_P1": first letter + trailing number form our ss id
            Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
            Matcher m = p.matcher(id);
            String ssId="Unknown";
            if (m.find()){
                ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
            }
            char ssType = SecStrucElement.OTHER;
            if(id.startsWith("H")) {
                ssType = SecStrucElement.HELIX;
            } else if(id.startsWith("T")) {
                ssType = SecStrucElement.TURN;
            } else {
                System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
            }
            if(ssType != SecStrucElement.OTHER) {
                SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
                secondaryStructure.add(ssElem);
            }
        }
    }
    // struct_sheet_range element is optional
    Interval intStructSheet = null;
    if (ids2elements.containsKey(structSheetId)) {
        // if not a loop element intStructSheet stays null (because loopelements2contentIndex will return null)
        intStructSheet = loopelements2contentIndex.get(ids2elements.get(structSheetId));
    }
    // taking care of cases where struct_sheet_range is not a loop element but a one value field
    if (ids2elements.containsKey(structSheetId) && !loopElements.contains(ids2elements.get(structSheetId))){
        String begChainCode = fields2values.get(structSheetId+".beg_label_asym_id").trim();
        if (begChainCode.equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme
            String sheetid = fields2values.get(structSheetId+".sheet_id").trim();
            int id = Integer.parseInt(fields2values.get(structSheetId+".id").trim());
            int beg = Integer.parseInt(fields2values.get(structSheetId+".beg_label_seq_id").trim());
            int end = Integer.parseInt(fields2values.get(structSheetId+".end_label_seq_id").trim());
            String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
            SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
            secondaryStructure.add(ssElem);
        }
    }

    // now scan the file for the loop-element form of both categories
    BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
    String line;
    int linecount=0;
    while((line = fcif.readLine()) != null ) {
        linecount++;
        // struct_conf (optional element), HELIX and TURN secondary structure
        if (intStructConf!=null && linecount>=intStructConf.beg && linecount<=intStructConf.end){
            int idIdx = fields2indices.get(structConfId+".id");
            int begLabelAsymIdIdx = fields2indices.get(structConfId+".beg_label_asym_id");
            int begLabelSeqIdIdx = fields2indices.get(structConfId+".beg_label_seq_id");
            int endLabelSeqIdIdx = fields2indices.get(structConfId+".end_label_seq_id");
            // example data line (id=1, beg_label_seq_id=5, end_label_seq_id=9, beg_label_asym_id=4):
            //HELX_P HELX_P1 1 ASN A 2 ? GLY A 12 ? ASN A 2 GLY A 12 1 ? 11
            String[] tokens = tokeniseFields(line);
            if (tokens.length!=ids2fieldsIdx.get(structConfId)) {
                throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+structConfId);
            }
            if (tokens[begLabelAsymIdIdx].equals(chainCode)) { // chainCode has been set already in reading pdbx_poly_seq_scheme
                String id = tokens[idIdx];
                Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
                Matcher m = p.matcher(id);
                String ssId="Unknown";
                if (m.find()){
                    ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
                }
                int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
                int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
                char ssType = SecStrucElement.OTHER;
                if(id.startsWith("H")) {
                    ssType = SecStrucElement.HELIX;
                } else if(id.startsWith("T")) {
                    ssType = SecStrucElement.TURN;
                } else {
                    System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
                }
                if(ssType != SecStrucElement.OTHER) {
                    SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
                    secondaryStructure.add(ssElem);
                }
            }
            continue;
        }
        // struct_sheet_range (optional element), SHEETs
        if (intStructSheet!=null && linecount>=intStructSheet.beg && linecount<=intStructSheet.end){
            int sheetIdIdx = fields2indices.get(structSheetId+".sheet_id");
            int idIdx = fields2indices.get(structSheetId+".id");
            int begLabelAsymIdIdx = fields2indices.get(structSheetId+".beg_label_asym_id");
            int begLabelSeqIdIdx = fields2indices.get(structSheetId+".beg_label_seq_id");
            int endLabelSeqIdIdx = fields2indices.get(structSheetId+".end_label_seq_id");
            // example data line (sheet_id=0, id=1, beg_label_seq_id=4, end_label_seq_id=8, beg_label_asym_id=3):
            //A 1 ARG A 14 ? LYS A 19 ? ? ARG A 14 LYS A 19
            String[] tokens = tokeniseFields(line);
            if (tokens.length!=ids2fieldsIdx.get(structSheetId)) {
                throw new CiffileFormatError("Line "+linecount+" doesn't have the right number of fields for loop element "+structSheetId);
            }
            if (tokens[begLabelAsymIdIdx].equals(chainCode)){ // chainCode has been set already in reading pdbx_poly_seq_scheme
                String sheetid = tokens[sheetIdIdx];
                int id = Integer.parseInt(tokens[idIdx]);
                int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
                int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
                String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
                SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
                secondaryStructure.add(ssElem);
            }
            continue;
        }
    }
    fcif.close();
}
477 |
duarte |
314 |
|
478 |
|
|
/** |
479 |
|
|
* Splits a space separated line into its individual tokens returning an array with all tokens |
480 |
|
|
* Takes care of quoted fields that contain spaces |
481 |
|
|
* e.g. HELX_P HELX_P2 H4 GLY A 111 ? GLU A 127 ? GLY A 112 GLU A 128 1 'SEE REMARK 650' 17 |
482 |
|
|
* @param line |
483 |
|
|
* @return |
484 |
|
|
*/ |
485 |
|
|
private String[] tokeniseFields(String line) { |
486 |
|
|
String[] tokens; |
487 |
duarte |
315 |
if (line.contains("'")) { // if there are single quotes in the line |
488 |
duarte |
314 |
ArrayList<String> tokensAL = new ArrayList<String>(); |
489 |
|
|
Pattern p = Pattern.compile("'[^']*'|[^ \\t]+"); // note: regex doesn't work inverting the order of expressions in the 'OR' (in python it does!) |
490 |
|
|
Matcher m = p.matcher(line); |
491 |
|
|
while (m.find()){ |
492 |
|
|
tokensAL.add(m.group()); |
493 |
|
|
} |
494 |
|
|
tokens = new String[tokensAL.size()]; |
495 |
|
|
tokensAL.toArray(tokens); |
496 |
|
|
} else { // if no quotes we simply split by columns using spaces as delimiters |
497 |
|
|
tokens = line.split("\\s+"); |
498 |
|
|
} |
499 |
duarte |
315 |
if (line.contains("\"")){ // in some rare cases some fields are quoted with double quotes, this seems to be to escape single quotes within them (used normally as a "prime" symbol) |
500 |
|
|
for (int i=0;i<tokens.length;i++){ // we get rid of the double quoting |
501 |
|
|
tokens[i] = tokens[i].replaceAll("\"", ""); |
502 |
|
|
} |
503 |
|
|
} |
504 |
duarte |
314 |
return tokens; |
505 |
|
|
} |
506 |
|
|
|
507 |
duarte |
306 |
} |