ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/CiffilePdb.java
Revision: 308
Committed: Thu Aug 30 14:54:38 2007 UTC (17 years, 1 month ago) by duarte
File size: 16090 byte(s)
Log Message:
Fixed bugs:
- was reading HETATM lines as well as ATOM in atom_site
- auth_seq_num with '?' not taken now when populating the pdbresser2resser map (same behaviour as in PdbasePdb)
- now using chainCodeStr and auth_asym_id to identify chains in pdbx_poly_seq_scheme, struct_conf and struct_sheet_range. atom_site is not guaranteed to appear in file before all the others so we can't rely on having read a chainCode (asym_id) when parsing the other elements
Line File contents
1 package proteinstructure;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5
6 import java.io.FileReader;
7 import java.io.IOException;
8 import java.util.ArrayList;
9 import java.util.Collections;
10 import java.util.HashMap;
11 import java.util.TreeMap;
12 import java.util.TreeSet;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15
16 import javax.vecmath.Point3d;
17
18
19 /**
20 * A single chain pdb protein structure loaded from a mmCIF file
21 *
22 * @author Jose Duarte
23 * Class: CiffilePdb
24 * Package: proteinstructure
25 */
26 public class CiffilePdb extends Pdb {
27
28
29 private String ciffile;
30
31 // fields we will read
32 private static final String entryId = "_entry";
33 private static final String atomSiteId = "_atom_site";
34 private static final String atomSitesAltId = "_atom_sites_alt";
35 private static final String pdbxPolySeqId = "_pdbx_poly_seq_scheme";
36 private static final String structConfId = "_struct_conf";
37 private static final String structSheetId = "_struct_sheet_range";
38 private static final String[] ids = {entryId,atomSitesAltId,atomSiteId,pdbxPolySeqId,structConfId,structSheetId};
39
40 private TreeMap<String,Integer> ids2elements; // map of ids to element serials
41 private TreeMap<String,String> fields2values; // map of field names (id.field) to values (for non-loop elements)
42 private TreeMap<String,Integer> fields2indices; // map of field names (id.field) to index (for loop elements)
43 private TreeSet<Integer> loopElements; // contains list of elements that are of loop type
44 private TreeMap<Integer,Interval> loopelements2contentIndex; // begin and end line index of each loop element
45
46 /**
47 * Constructs Pdb object given pdb code and pdb chain code.
48 * Model will be DEFAULT_MODEL
49 * @param ciffile
50 * @param pdbChainCode
51 * @throws PdbChainCodeNotFoundError
52 * @throws IOException
53 */
54 public CiffilePdb (String ciffile, String pdbChainCode) throws PdbChainCodeNotFoundError, IOException {
55 this(ciffile, pdbChainCode, DEFAULT_MODEL);
56 }
57
58 /**
59 * Constructs Pdb object given pdb code, pdb chain code, model serial, source db and a MySQLConnection.
60 * The db must be a pdbase database
61 * @param ciffile
62 * @param pdbChainCode
63 * @param model_serial
64 * @throws PdbChainCodeNotFoundError
65 * @throws IOException
66 */
67 public CiffilePdb (String ciffile, String pdbChainCode, int model_serial) throws PdbChainCodeNotFoundError, IOException {
68 this.ciffile = ciffile;
69 this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
70 this.model=model_serial;
71
72 parseCifFile();
73
74 this.pdbCode = readPdbCode();
75
76 secondaryStructure = new SecondaryStructure(); // create empty secondary structure first to make sure object is not null
77
78 parseLoopElements(); // populates resser_atom2atomserial, resser2restype, atomser2coord, atomser2resser, sequence, pdbresser2resser, secondaryStructure
79
80 this.fullLength = sequence.length();
81
82 this.obsLength = resser2restype.size();
83
84 if(!secondaryStructure.isEmpty()) {
85 secondaryStructure.setComment("CIFfile");
86 }
87
88 // we initialise resser2pdbresser from the pdbresser2resser HashMap
89 this.resser2pdbresser = new HashMap<Integer, String>();
90 for (String pdbresser:pdbresser2resser.keySet()){
91 resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser);
92 }
93
94 // initialising atomser2atom from resser_atom2atomserial
95 atomser2atom = new HashMap<Integer, String>();
96 for (String resser_atom:resser_atom2atomserial.keySet()){
97 int atomserial = resser_atom2atomserial.get(resser_atom);
98 String atom = resser_atom.split("_")[1];
99 atomser2atom.put(atomserial,atom);
100 }
101 }
102
103 private void parseCifFile() throws IOException{
104 // data structures to store the parsed fields
105 ids2elements = new TreeMap<String, Integer>();
106 fields2indices = new TreeMap<String,Integer>();
107 fields2values = new TreeMap<String, String>();
108 loopElements = new TreeSet<Integer>(); // contains list of elements that are of loop type
109 loopelements2contentIndex = new TreeMap<Integer,Interval>();
110 TreeMap<String,Integer> fieldsIdx = new TreeMap<String,Integer>(); // this map holds the field index counters for each element id
111
112 BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
113 int element = 0;
114 String line;
115 line = fcif.readLine(); // skip first line
116 int linecount = 1; // we have read one line already, we initialise count to 1
117 while((line = fcif.readLine()) != null ) {
118 linecount++;
119 if (line.startsWith("#")) {
120 element++;
121 continue;
122 }
123 if (line.startsWith("loop_")) {
124 loopElements.add(element);
125 continue;
126 }
127
128 for (String id:ids){
129 if (!fieldsIdx.containsKey(id)) fieldsIdx.put(id,0);
130 Pattern p = Pattern.compile("^"+id+"\\.(\\w+)(?:\\s+(.*))?$");
131 Matcher m = p.matcher(line);
132 if (m.find()){
133 ids2elements.put(id,element);
134 String field = id + "." + m.group(1);
135 if (!loopElements.contains(element)) { // if not a loop element
136 fields2values.put(field, m.group(2)); // 2nd capture group only matches for non-loops where the value of the field is in same line as field name
137 } else { // for loop elements we fill the fields2indices TreeMap
138 fields2indices.put(field,fieldsIdx.get(id));
139 }
140 fieldsIdx.put(id,fieldsIdx.get(id)+1);
141 continue;
142 }
143 }
144 if (!line.startsWith("_") && !line.startsWith("#")){ // not in field definition, we are in values of a loop element
145 if (ids2elements.containsValue(element)) { // if this is one of the fields we want to parse (members of String[] ids)
146 if (!loopelements2contentIndex.containsKey(element)) {
147 //loopelements2content.put(element,line+"\n");
148 Interval interval = new Interval(linecount,linecount);
149 loopelements2contentIndex.put(element,interval);
150 } else {
151 //loopelements2content.put(element,loopelements2content.get(element)+line+"\n");
152 loopelements2contentIndex.get(element).end=linecount;
153 }
154 }
155 }
156 } // end scanning lines
157
158 fcif.close();
159 }
160
161 private String readPdbCode(){
162 return fields2values.get("_entry.id").trim();
163 }
164
165 private void parseLoopElements() throws IOException, PdbChainCodeNotFoundError {
166 resser_atom2atomserial = new HashMap<String,Integer>();
167 resser2restype = new HashMap<Integer,String>();
168 atomser2coord = new HashMap<Integer,Point3d>();
169 atomser2resser = new HashMap<Integer,Integer>();
170 pdbresser2resser = new HashMap<String, Integer>();
171 sequence = "";
172 secondaryStructure = new SecondaryStructure();
173
174 ArrayList<String> aalist=AA.aas(); // list of standard 3 letter code aminoacids
175
176 String chainCodeStr=pdbChainCode;
177 if (pdbChainCode.equals("NULL")) chainCodeStr="A";
178
179 ArrayList<String> altLocs = new ArrayList<String>();
180 String altLoc = ".";
181
182 Interval intAtomSite = loopelements2contentIndex.get(ids2elements.get(atomSiteId));
183 // atom_sites_alt element is optional
184 Interval intAtomSitesAlt = null;
185 if (ids2elements.containsKey(atomSitesAltId)){
186 intAtomSitesAlt = loopelements2contentIndex.get(ids2elements.get(atomSitesAltId));
187 }
188 Interval intPdbxPoly = loopelements2contentIndex.get(ids2elements.get(pdbxPolySeqId));
189 // struct_conf element is optional
190 Interval intStructConf = null;
191 if (ids2elements.containsKey(structConfId)){
192 intStructConf = loopelements2contentIndex.get(ids2elements.get(structConfId));
193 }
194 // struct_sheet_range element is optional
195 Interval intStructSheet = null;
196 if (ids2elements.containsKey(structSheetId)) {
197 intStructSheet = loopelements2contentIndex.get(ids2elements.get(structSheetId));
198 }
199
200 boolean empty = true;
201 BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
202 String line;
203 int linecount=0;
204 while((line = fcif.readLine()) != null ) {
205 linecount++;
206 // atom_sites_alt (optional element)
207 if (intAtomSitesAlt!=null && linecount>=intAtomSitesAlt.beg && linecount<=intAtomSitesAlt.end){
208 int idIdx = fields2indices.get(atomSitesAltId+".id");
209 // id=0
210 // A ?
211 String[] tokens = line.split("\\s+");
212 if (!tokens[idIdx].equals(".")) {
213 altLocs.add(tokens[idIdx]);
214 }
215 continue;
216 }
217 if (!altLocs.isEmpty()){
218 altLoc = Collections.min(altLocs);
219 }
220 // atom_site
221 if (linecount>=intAtomSite.beg && linecount<=intAtomSite.end){
222 int groupPdbIdx = fields2indices.get(atomSiteId+".group_PDB");
223 int idIdx = fields2indices.get(atomSiteId+".id");
224 int labelAtomIdIdx = fields2indices.get(atomSiteId+".label_atom_id");
225 int labelAltIdIdx = fields2indices.get(atomSiteId+".label_alt_id");
226 int labelCompIdIdx = fields2indices.get(atomSiteId+".label_comp_id");
227 int labelAsymIdIdx = fields2indices.get(atomSiteId+".label_asym_id");
228 int labelSeqIdIdx = fields2indices.get(atomSiteId+".label_seq_id");
229 int cartnXIdx = fields2indices.get(atomSiteId+".Cartn_x");
230 int cartnYIdx = fields2indices.get(atomSiteId+".Cartn_y");
231 int cartnZIdx = fields2indices.get(atomSiteId+".Cartn_z");
232 int authAsymIdIdx = fields2indices.get(atomSiteId+".auth_asym_id");
233 int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num");
234 // group_PDB=0, auth_asym_id=22, pdbx_PDB_model_num=24, label_alt_id=4, id=1, label_atom_id=3, label_comp_id=5, label_asym_id=6, label_seq_id=8, Cartn_x=10, Cartn_y=11, Cartn_z=12
235 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 151617181920 2122 23 24
236 //ATOM 2 C CA . MET A 1 1 ? 38.591 8.543 15.660 1.00 77.79 ? ? ? ? ? 1 MET A CA 1
237 String[] tokens = line.split("\\s+");
238 if (tokens[groupPdbIdx].equals("ATOM") && tokens[authAsymIdIdx].equals(chainCodeStr) && Integer.parseInt(tokens[pdbxPDBModelNumIdx])==model) { // match our given chain and model
239 empty = false;
240 if (tokens[labelAltIdIdx].equals(".") || tokens[labelAltIdIdx].equals(altLoc)) { // don't read lines with something else as "." or altLoc
241 int atomserial=Integer.parseInt(tokens[idIdx]); // id
242 String atom = tokens[labelAtomIdIdx]; // label_atom_id
243 String res_type = tokens[labelCompIdIdx]; // label_comp_id
244 chainCode = tokens[labelAsymIdIdx]; // label_asym_id
245 int res_serial = Integer.parseInt(tokens[labelSeqIdIdx]); // label_seq_id
246 double x = Double.parseDouble(tokens[cartnXIdx]); // Cartn_x
247 double y = Double.parseDouble(tokens[cartnYIdx]); // Cartn_y
248 double z = Double.parseDouble(tokens[cartnZIdx]); // Cartn_z
249 Point3d coords = new Point3d(x,y,z);
250 if (aalist.contains(res_type)) {
251 atomser2coord.put(atomserial, coords);
252 atomser2resser.put(atomserial, res_serial);
253 resser2restype.put(res_serial, res_type);
254 ArrayList<String> atomlist = aas2atoms.get(res_type);
255 atomlist.add("OXT"); // the extra atom OXT is there in the last residue of the chain
256 if (atomlist.contains(atom)){
257 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
258 }
259 }
260 }
261 }
262 continue;
263 }
264 // pdbx_poly_seq_scheme
265 if (linecount>=intPdbxPoly.beg && linecount<=intPdbxPoly.end){
266 int seqIdIdx = fields2indices.get(pdbxPolySeqId+".seq_id");
267 int authSeqNumIdx = fields2indices.get(pdbxPolySeqId+".auth_seq_num");
268 int pdbInsCodeIdx = fields2indices.get(pdbxPolySeqId+".pdb_ins_code");
269 int monIdIdx = fields2indices.get(pdbxPolySeqId+".mon_id");
270 int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id");
271 // asym_id=0, seq_id=2, auth_seq_num=6, pdb_ins_code=10, mon_id=3
272 // 0 1 2 3 4 5 6 7 8 910
273 // A 1 1 ASP 1 1 1 ASP ASP A .
274 String[] tokens = line.split("\\s+");
275 if (tokens[pdbStrandIdIdx].equals(chainCodeStr)) { // we can't rely on using chainCode, because the order of elements is not guranteed (pdbx_poly_seq_scheme doesn't always come after atom_site)
276 int res_serial = Integer.parseInt(tokens[seqIdIdx]); // seq_id
277 String pdb_res_serial = tokens[authSeqNumIdx]; // auth_seq_num
278 String pdb_ins_code = tokens[pdbInsCodeIdx]; // pdb_ins_code
279 String pdb_res_serial_with_icode = pdb_res_serial;
280 if (!pdb_ins_code.equals(".")) {
281 pdb_res_serial_with_icode=pdb_res_serial+pdb_ins_code;
282 }
283 String res_type = tokens[monIdIdx]; // mon_id
284 // sequence
285 if (aalist.contains(res_type)){
286 sequence+=AA.threeletter2oneletter(res_type);
287 } else {
288 sequence+=NONSTANDARD_AA_LETTER;
289 }
290 // pdbresser2resser
291 if (!pdb_res_serial_with_icode.equals("?")) { // question marks are author missing serials, we don't want them in the map
292 pdbresser2resser.put(pdb_res_serial_with_icode,res_serial);
293 }
294 }
295 continue;
296 }
297 // struct_conf (optional element), HELIX and TURN secondary structure
298 if (intStructConf!=null && linecount>=intStructConf.beg && linecount<=intStructConf.end){
299 int idIdx = fields2indices.get(structConfId+".id");
300 int begAuthAsymIdIdx = fields2indices.get(structConfId+".beg_auth_asym_id");
301 int begLabelSeqIdIdx = fields2indices.get(structConfId+".beg_label_seq_id");
302 int endLabelSeqIdIdx = fields2indices.get(structConfId+".end_label_seq_id");
303 //id=1, beg_label_seq_id=5, end_label_seq_id=9, beg_label_asym_id=4
304 // 0 1 2 3 4 5 6 7 8 9 10 111213 1415 16 1718 19
305 //HELX_P HELX_P1 1 ASN A 2 ? GLY A 12 ? ASN A 2 GLY A 12 1 ? 11
306 String[] tokens = line.split("\\s+");
307 if (tokens[begAuthAsymIdIdx].equals(chainCodeStr)) { // we don't have yet chainCode (normally struct_conf appears before atom_site in cif file), so we need to use the auth_asym_id
308 String id = tokens[idIdx];
309 Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
310 Matcher m = p.matcher(id);
311 String ssId="Unknown";
312 if (m.find()){
313 ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
314 }
315 int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
316 int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
317 char ssType = SecStrucElement.OTHER;
318 if(id.startsWith("H")) {
319 ssType = SecStrucElement.HELIX;
320 } else if(id.startsWith("T")) {
321 ssType = SecStrucElement.TURN;
322 } else {
323 System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
324 }
325 if(ssType != SecStrucElement.OTHER) {
326 SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
327 secondaryStructure.add(ssElem);
328 }
329 }
330 continue;
331 }
332 // struct_sheet_range (optional element), SHEETs
333 if (intStructSheet!=null && linecount>=intStructSheet.beg && linecount<=intStructSheet.end){
334 int sheetIdIdx = fields2indices.get(structSheetId+".sheet_id");
335 int idIdx = fields2indices.get(structSheetId+".id");
336 int begAuthAsymIdIdx = fields2indices.get(structSheetId+".beg_auth_asym_id");
337 int begLabelSeqIdIdx = fields2indices.get(structSheetId+".beg_label_seq_id");
338 int endLabelSeqIdIdx = fields2indices.get(structSheetId+".end_label_seq_id");
339 //sheet_id=0, id=1, beg_label_seq_id=4, end_label_seq_id=8, beg_label_asym_id=3
340 //0 1 2 3 4 5 6 7 8 910 1112 13 1415 16
341 //A 1 ARG A 14 ? LYS A 19 ? ? ARG A 14 LYS A 19
342 String[] tokens = line.split("\\s+");
343 if (tokens[begAuthAsymIdIdx].equals(chainCodeStr)){ // we don't have yet chainCode (normally struct_sheet_range appears before atom_site in cif file), so we need to use the auth_asym_id
344 String sheetid = tokens[sheetIdIdx];
345 int id = Integer.parseInt(tokens[idIdx]);
346 int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
347 int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
348 String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
349 SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
350 secondaryStructure.add(ssElem);
351 }
352 continue;
353 }
354 }
355 fcif.close();
356 if (empty) { // no atom data was found for given pdb chain code and model
357 throw new PdbChainCodeNotFoundError("Couldn't find _atom_site data for given pdbChainCode: "+pdbChainCode+", model: "+model);
358 }
359 }
360 }