1 |
package proteinstructure; |
2 |
|
3 |
import java.io.BufferedReader; |
4 |
import java.io.File; |
5 |
|
6 |
import java.io.FileReader; |
7 |
import java.io.IOException; |
8 |
import java.util.ArrayList; |
9 |
import java.util.Collections; |
10 |
import java.util.HashMap; |
11 |
import java.util.TreeMap; |
12 |
import java.util.TreeSet; |
13 |
import java.util.regex.Matcher; |
14 |
import java.util.regex.Pattern; |
15 |
|
16 |
import javax.vecmath.Point3d; |
17 |
|
18 |
|
19 |
/** |
20 |
* A single chain pdb protein structure loaded from a mmCIF file |
21 |
* |
22 |
* @author Jose Duarte |
23 |
* Class: CiffilePdb |
24 |
* Package: proteinstructure |
25 |
*/ |
26 |
public class CiffilePdb extends Pdb { |
27 |
|
28 |
|
29 |
private String ciffile; |
30 |
|
31 |
// fields we will read |
32 |
private static final String entryId = "_entry"; |
33 |
private static final String atomSiteId = "_atom_site"; |
34 |
private static final String atomSitesAltId = "_atom_sites_alt"; |
35 |
private static final String pdbxPolySeqId = "_pdbx_poly_seq_scheme"; |
36 |
private static final String structConfId = "_struct_conf"; |
37 |
private static final String structSheetId = "_struct_sheet_range"; |
38 |
private static final String[] ids = {entryId,atomSitesAltId,atomSiteId,pdbxPolySeqId,structConfId,structSheetId}; |
39 |
|
40 |
private TreeMap<String,Integer> ids2elements; // map of ids to element serials |
41 |
private TreeMap<String,String> fields2values; // map of field names (id.field) to values (for non-loop elements) |
42 |
private TreeMap<String,Integer> fields2indices; // map of field names (id.field) to index (for loop elements) |
43 |
private TreeSet<Integer> loopElements; // contains list of elements that are of loop type |
44 |
private TreeMap<Integer,Interval> loopelements2contentIndex; // begin and end line index of each loop element |
45 |
|
46 |
/** |
47 |
* Constructs Pdb object given pdb code and pdb chain code. |
48 |
* Model will be DEFAULT_MODEL |
49 |
* @param ciffile |
50 |
* @param pdbChainCode |
51 |
* @throws PdbChainCodeNotFoundError |
52 |
* @throws IOException |
53 |
*/ |
54 |
public CiffilePdb (String ciffile, String pdbChainCode) throws PdbChainCodeNotFoundError, IOException { |
55 |
this(ciffile, pdbChainCode, DEFAULT_MODEL); |
56 |
} |
57 |
|
58 |
/** |
59 |
* Constructs Pdb object given pdb code, pdb chain code, model serial, source db and a MySQLConnection. |
60 |
* The db must be a pdbase database |
61 |
* @param ciffile |
62 |
* @param pdbChainCode |
63 |
* @param model_serial |
64 |
* @throws PdbChainCodeNotFoundError |
65 |
* @throws IOException |
66 |
*/ |
67 |
public CiffilePdb (String ciffile, String pdbChainCode, int model_serial) throws PdbChainCodeNotFoundError, IOException { |
68 |
this.ciffile = ciffile; |
69 |
this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case |
70 |
this.model=model_serial; |
71 |
|
72 |
parseCifFile(); |
73 |
|
74 |
this.pdbCode = readPdbCode(); |
75 |
|
76 |
secondaryStructure = new SecondaryStructure(); // create empty secondary structure first to make sure object is not null |
77 |
|
78 |
parseLoopElements(); // populates resser_atom2atomserial, resser2restype, atomser2coord, atomser2resser, sequence, pdbresser2resser, secondaryStructure |
79 |
|
80 |
this.fullLength = sequence.length(); |
81 |
|
82 |
this.obsLength = resser2restype.size(); |
83 |
|
84 |
if(!secondaryStructure.isEmpty()) { |
85 |
secondaryStructure.setComment("CIFfile"); |
86 |
} |
87 |
|
88 |
// we initialise resser2pdbresser from the pdbresser2resser HashMap |
89 |
this.resser2pdbresser = new HashMap<Integer, String>(); |
90 |
for (String pdbresser:pdbresser2resser.keySet()){ |
91 |
resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser); |
92 |
} |
93 |
|
94 |
// initialising atomser2atom from resser_atom2atomserial |
95 |
atomser2atom = new HashMap<Integer, String>(); |
96 |
for (String resser_atom:resser_atom2atomserial.keySet()){ |
97 |
int atomserial = resser_atom2atomserial.get(resser_atom); |
98 |
String atom = resser_atom.split("_")[1]; |
99 |
atomser2atom.put(atomserial,atom); |
100 |
} |
101 |
} |
102 |
|
103 |
private void parseCifFile() throws IOException{ |
104 |
// data structures to store the parsed fields |
105 |
ids2elements = new TreeMap<String, Integer>(); |
106 |
fields2indices = new TreeMap<String,Integer>(); |
107 |
fields2values = new TreeMap<String, String>(); |
108 |
loopElements = new TreeSet<Integer>(); // contains list of elements that are of loop type |
109 |
loopelements2contentIndex = new TreeMap<Integer,Interval>(); |
110 |
TreeMap<String,Integer> fieldsIdx = new TreeMap<String,Integer>(); // this map holds the field index counters for each element id |
111 |
|
112 |
BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile))); |
113 |
int element = 0; |
114 |
String line; |
115 |
line = fcif.readLine(); // skip first line |
116 |
int linecount = 1; // we have read one line already, we initialise count to 1 |
117 |
while((line = fcif.readLine()) != null ) { |
118 |
linecount++; |
119 |
if (line.startsWith("#")) { |
120 |
element++; |
121 |
continue; |
122 |
} |
123 |
if (line.startsWith("loop_")) { |
124 |
loopElements.add(element); |
125 |
continue; |
126 |
} |
127 |
|
128 |
for (String id:ids){ |
129 |
if (!fieldsIdx.containsKey(id)) fieldsIdx.put(id,0); |
130 |
Pattern p = Pattern.compile("^"+id+"\\.(\\w+)(?:\\s+(.*))?$"); |
131 |
Matcher m = p.matcher(line); |
132 |
if (m.find()){ |
133 |
ids2elements.put(id,element); |
134 |
String field = id + "." + m.group(1); |
135 |
if (!loopElements.contains(element)) { // if not a loop element |
136 |
fields2values.put(field, m.group(2)); // 2nd capture group only matches for non-loops where the value of the field is in same line as field name |
137 |
} else { // for loop elements we fill the fields2indices TreeMap |
138 |
fields2indices.put(field,fieldsIdx.get(id)); |
139 |
} |
140 |
fieldsIdx.put(id,fieldsIdx.get(id)+1); |
141 |
continue; |
142 |
} |
143 |
} |
144 |
if (!line.startsWith("_") && !line.startsWith("#")){ // not in field definition, we are in values of a loop element |
145 |
if (ids2elements.containsValue(element)) { // if this is one of the fields we want to parse (members of String[] ids) |
146 |
if (!loopelements2contentIndex.containsKey(element)) { |
147 |
//loopelements2content.put(element,line+"\n"); |
148 |
Interval interval = new Interval(linecount,linecount); |
149 |
loopelements2contentIndex.put(element,interval); |
150 |
} else { |
151 |
//loopelements2content.put(element,loopelements2content.get(element)+line+"\n"); |
152 |
loopelements2contentIndex.get(element).end=linecount; |
153 |
} |
154 |
} |
155 |
} |
156 |
} // end scanning lines |
157 |
|
158 |
fcif.close(); |
159 |
} |
160 |
|
161 |
private String readPdbCode(){ |
162 |
return fields2values.get("_entry.id").trim(); |
163 |
} |
164 |
|
165 |
private void parseLoopElements() throws IOException, PdbChainCodeNotFoundError { |
166 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
167 |
resser2restype = new HashMap<Integer,String>(); |
168 |
atomser2coord = new HashMap<Integer,Point3d>(); |
169 |
atomser2resser = new HashMap<Integer,Integer>(); |
170 |
pdbresser2resser = new HashMap<String, Integer>(); |
171 |
sequence = ""; |
172 |
secondaryStructure = new SecondaryStructure(); |
173 |
|
174 |
ArrayList<String> aalist=AA.aas(); // list of standard 3 letter code aminoacids |
175 |
|
176 |
String chainCodeStr=pdbChainCode; |
177 |
if (pdbChainCode.equals("NULL")) chainCodeStr="A"; |
178 |
|
179 |
ArrayList<String> altLocs = new ArrayList<String>(); |
180 |
String altLoc = "."; |
181 |
|
182 |
Interval intAtomSite = loopelements2contentIndex.get(ids2elements.get(atomSiteId)); |
183 |
// atom_sites_alt element is optional |
184 |
Interval intAtomSitesAlt = null; |
185 |
if (ids2elements.containsKey(atomSitesAltId)){ |
186 |
intAtomSitesAlt = loopelements2contentIndex.get(ids2elements.get(atomSitesAltId)); |
187 |
} |
188 |
Interval intPdbxPoly = loopelements2contentIndex.get(ids2elements.get(pdbxPolySeqId)); |
189 |
// struct_conf element is optional |
190 |
Interval intStructConf = null; |
191 |
if (ids2elements.containsKey(structConfId)){ |
192 |
intStructConf = loopelements2contentIndex.get(ids2elements.get(structConfId)); |
193 |
} |
194 |
// struct_sheet_range element is optional |
195 |
Interval intStructSheet = null; |
196 |
if (ids2elements.containsKey(structSheetId)) { |
197 |
intStructSheet = loopelements2contentIndex.get(ids2elements.get(structSheetId)); |
198 |
} |
199 |
|
200 |
boolean empty = true; |
201 |
BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile))); |
202 |
String line; |
203 |
int linecount=0; |
204 |
while((line = fcif.readLine()) != null ) { |
205 |
linecount++; |
206 |
// atom_sites_alt (optional element) |
207 |
if (intAtomSitesAlt!=null && linecount>=intAtomSitesAlt.beg && linecount<=intAtomSitesAlt.end){ |
208 |
int idIdx = fields2indices.get(atomSitesAltId+".id"); |
209 |
// id=0 |
210 |
// A ? |
211 |
String[] tokens = line.split("\\s+"); |
212 |
if (!tokens[idIdx].equals(".")) { |
213 |
altLocs.add(tokens[idIdx]); |
214 |
} |
215 |
continue; |
216 |
} |
217 |
if (!altLocs.isEmpty()){ |
218 |
altLoc = Collections.min(altLocs); |
219 |
} |
220 |
// atom_site |
221 |
if (linecount>=intAtomSite.beg && linecount<=intAtomSite.end){ |
222 |
int groupPdbIdx = fields2indices.get(atomSiteId+".group_PDB"); |
223 |
int idIdx = fields2indices.get(atomSiteId+".id"); |
224 |
int labelAtomIdIdx = fields2indices.get(atomSiteId+".label_atom_id"); |
225 |
int labelAltIdIdx = fields2indices.get(atomSiteId+".label_alt_id"); |
226 |
int labelCompIdIdx = fields2indices.get(atomSiteId+".label_comp_id"); |
227 |
int labelAsymIdIdx = fields2indices.get(atomSiteId+".label_asym_id"); |
228 |
int labelSeqIdIdx = fields2indices.get(atomSiteId+".label_seq_id"); |
229 |
int cartnXIdx = fields2indices.get(atomSiteId+".Cartn_x"); |
230 |
int cartnYIdx = fields2indices.get(atomSiteId+".Cartn_y"); |
231 |
int cartnZIdx = fields2indices.get(atomSiteId+".Cartn_z"); |
232 |
int authAsymIdIdx = fields2indices.get(atomSiteId+".auth_asym_id"); |
233 |
int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num"); |
234 |
// group_PDB=0, auth_asym_id=22, pdbx_PDB_model_num=24, label_alt_id=4, id=1, label_atom_id=3, label_comp_id=5, label_asym_id=6, label_seq_id=8, Cartn_x=10, Cartn_y=11, Cartn_z=12 |
235 |
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 151617181920 2122 23 24 |
236 |
//ATOM 2 C CA . MET A 1 1 ? 38.591 8.543 15.660 1.00 77.79 ? ? ? ? ? 1 MET A CA 1 |
237 |
String[] tokens = line.split("\\s+"); |
238 |
if (tokens[groupPdbIdx].equals("ATOM") && tokens[authAsymIdIdx].equals(chainCodeStr) && Integer.parseInt(tokens[pdbxPDBModelNumIdx])==model) { // match our given chain and model |
239 |
empty = false; |
240 |
if (tokens[labelAltIdIdx].equals(".") || tokens[labelAltIdIdx].equals(altLoc)) { // don't read lines with something else as "." or altLoc |
241 |
int atomserial=Integer.parseInt(tokens[idIdx]); // id |
242 |
String atom = tokens[labelAtomIdIdx]; // label_atom_id |
243 |
String res_type = tokens[labelCompIdIdx]; // label_comp_id |
244 |
chainCode = tokens[labelAsymIdIdx]; // label_asym_id |
245 |
int res_serial = Integer.parseInt(tokens[labelSeqIdIdx]); // label_seq_id |
246 |
double x = Double.parseDouble(tokens[cartnXIdx]); // Cartn_x |
247 |
double y = Double.parseDouble(tokens[cartnYIdx]); // Cartn_y |
248 |
double z = Double.parseDouble(tokens[cartnZIdx]); // Cartn_z |
249 |
Point3d coords = new Point3d(x,y,z); |
250 |
if (aalist.contains(res_type)) { |
251 |
atomser2coord.put(atomserial, coords); |
252 |
atomser2resser.put(atomserial, res_serial); |
253 |
resser2restype.put(res_serial, res_type); |
254 |
ArrayList<String> atomlist = aas2atoms.get(res_type); |
255 |
atomlist.add("OXT"); // the extra atom OXT is there in the last residue of the chain |
256 |
if (atomlist.contains(atom)){ |
257 |
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
258 |
} |
259 |
} |
260 |
} |
261 |
} |
262 |
continue; |
263 |
} |
264 |
// pdbx_poly_seq_scheme |
265 |
if (linecount>=intPdbxPoly.beg && linecount<=intPdbxPoly.end){ |
266 |
int seqIdIdx = fields2indices.get(pdbxPolySeqId+".seq_id"); |
267 |
int authSeqNumIdx = fields2indices.get(pdbxPolySeqId+".auth_seq_num"); |
268 |
int pdbInsCodeIdx = fields2indices.get(pdbxPolySeqId+".pdb_ins_code"); |
269 |
int monIdIdx = fields2indices.get(pdbxPolySeqId+".mon_id"); |
270 |
int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id"); |
271 |
// asym_id=0, seq_id=2, auth_seq_num=6, pdb_ins_code=10, mon_id=3 |
272 |
// 0 1 2 3 4 5 6 7 8 910 |
273 |
// A 1 1 ASP 1 1 1 ASP ASP A . |
274 |
String[] tokens = line.split("\\s+"); |
275 |
if (tokens[pdbStrandIdIdx].equals(chainCodeStr)) { // we can't rely on using chainCode, because the order of elements is not guranteed (pdbx_poly_seq_scheme doesn't always come after atom_site) |
276 |
int res_serial = Integer.parseInt(tokens[seqIdIdx]); // seq_id |
277 |
String pdb_res_serial = tokens[authSeqNumIdx]; // auth_seq_num |
278 |
String pdb_ins_code = tokens[pdbInsCodeIdx]; // pdb_ins_code |
279 |
String pdb_res_serial_with_icode = pdb_res_serial; |
280 |
if (!pdb_ins_code.equals(".")) { |
281 |
pdb_res_serial_with_icode=pdb_res_serial+pdb_ins_code; |
282 |
} |
283 |
String res_type = tokens[monIdIdx]; // mon_id |
284 |
// sequence |
285 |
if (aalist.contains(res_type)){ |
286 |
sequence+=AA.threeletter2oneletter(res_type); |
287 |
} else { |
288 |
sequence+=NONSTANDARD_AA_LETTER; |
289 |
} |
290 |
// pdbresser2resser |
291 |
if (!pdb_res_serial_with_icode.equals("?")) { // question marks are author missing serials, we don't want them in the map |
292 |
pdbresser2resser.put(pdb_res_serial_with_icode,res_serial); |
293 |
} |
294 |
} |
295 |
continue; |
296 |
} |
297 |
// struct_conf (optional element), HELIX and TURN secondary structure |
298 |
if (intStructConf!=null && linecount>=intStructConf.beg && linecount<=intStructConf.end){ |
299 |
int idIdx = fields2indices.get(structConfId+".id"); |
300 |
int begAuthAsymIdIdx = fields2indices.get(structConfId+".beg_auth_asym_id"); |
301 |
int begLabelSeqIdIdx = fields2indices.get(structConfId+".beg_label_seq_id"); |
302 |
int endLabelSeqIdIdx = fields2indices.get(structConfId+".end_label_seq_id"); |
303 |
//id=1, beg_label_seq_id=5, end_label_seq_id=9, beg_label_asym_id=4 |
304 |
// 0 1 2 3 4 5 6 7 8 9 10 111213 1415 16 1718 19 |
305 |
//HELX_P HELX_P1 1 ASN A 2 ? GLY A 12 ? ASN A 2 GLY A 12 1 ? 11 |
306 |
String[] tokens = line.split("\\s+"); |
307 |
if (tokens[begAuthAsymIdIdx].equals(chainCodeStr)) { // we don't have yet chainCode (normally struct_conf appears before atom_site in cif file), so we need to use the auth_asym_id |
308 |
String id = tokens[idIdx]; |
309 |
Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$"); |
310 |
Matcher m = p.matcher(id); |
311 |
String ssId="Unknown"; |
312 |
if (m.find()){ |
313 |
ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns) |
314 |
} |
315 |
int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]); |
316 |
int end = Integer.parseInt(tokens[endLabelSeqIdIdx]); |
317 |
char ssType = SecStrucElement.OTHER; |
318 |
if(id.startsWith("H")) { |
319 |
ssType = SecStrucElement.HELIX; |
320 |
} else if(id.startsWith("T")) { |
321 |
ssType = SecStrucElement.TURN; |
322 |
} else { |
323 |
System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping."); |
324 |
} |
325 |
if(ssType != SecStrucElement.OTHER) { |
326 |
SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId); |
327 |
secondaryStructure.add(ssElem); |
328 |
} |
329 |
} |
330 |
continue; |
331 |
} |
332 |
// struct_sheet_range (optional element), SHEETs |
333 |
if (intStructSheet!=null && linecount>=intStructSheet.beg && linecount<=intStructSheet.end){ |
334 |
int sheetIdIdx = fields2indices.get(structSheetId+".sheet_id"); |
335 |
int idIdx = fields2indices.get(structSheetId+".id"); |
336 |
int begAuthAsymIdIdx = fields2indices.get(structSheetId+".beg_auth_asym_id"); |
337 |
int begLabelSeqIdIdx = fields2indices.get(structSheetId+".beg_label_seq_id"); |
338 |
int endLabelSeqIdIdx = fields2indices.get(structSheetId+".end_label_seq_id"); |
339 |
//sheet_id=0, id=1, beg_label_seq_id=4, end_label_seq_id=8, beg_label_asym_id=3 |
340 |
//0 1 2 3 4 5 6 7 8 910 1112 13 1415 16 |
341 |
//A 1 ARG A 14 ? LYS A 19 ? ? ARG A 14 LYS A 19 |
342 |
String[] tokens = line.split("\\s+"); |
343 |
if (tokens[begAuthAsymIdIdx].equals(chainCodeStr)){ // we don't have yet chainCode (normally struct_sheet_range appears before atom_site in cif file), so we need to use the auth_asym_id |
344 |
String sheetid = tokens[sheetIdIdx]; |
345 |
int id = Integer.parseInt(tokens[idIdx]); |
346 |
int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]); |
347 |
int end = Integer.parseInt(tokens[endLabelSeqIdIdx]); |
348 |
String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,... |
349 |
SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId); |
350 |
secondaryStructure.add(ssElem); |
351 |
} |
352 |
continue; |
353 |
} |
354 |
} |
355 |
fcif.close(); |
356 |
if (empty) { // no atom data was found for given pdb chain code and model |
357 |
throw new PdbChainCodeNotFoundError("Couldn't find _atom_site data for given pdbChainCode: "+pdbChainCode+", model: "+model); |
358 |
} |
359 |
} |
360 |
} |