ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/CiffilePdb.java
Revision: 309
Committed: Thu Aug 30 15:55:53 2007 UTC (17 years, 1 month ago) by duarte
File size: 16999 byte(s)
Log Message:
Fixed bug: needed to read alt locs in advance in another scan of the file because the order of the elements in the cif file is not guaranteed. As read of atom_site needs of alt locs, we need to do first the parsing of atom_sites_alt
Line User Rev File contents
1 duarte 306 package proteinstructure;
2    
3     import java.io.BufferedReader;
4     import java.io.File;
5    
6     import java.io.FileReader;
7     import java.io.IOException;
8     import java.util.ArrayList;
9     import java.util.Collections;
10     import java.util.HashMap;
11     import java.util.TreeMap;
12     import java.util.TreeSet;
13     import java.util.regex.Matcher;
14     import java.util.regex.Pattern;
15    
16     import javax.vecmath.Point3d;
17    
18    
19     /**
20     * A single chain pdb protein structure loaded from a mmCIF file
21     *
22     * @author Jose Duarte
23     * Class: CiffilePdb
24     * Package: proteinstructure
25     */
26     public class CiffilePdb extends Pdb {
27    
28    
29     private String ciffile;
30 duarte 307
31     // fields we will read
32     private static final String entryId = "_entry";
33     private static final String atomSiteId = "_atom_site";
34     private static final String atomSitesAltId = "_atom_sites_alt";
35     private static final String pdbxPolySeqId = "_pdbx_poly_seq_scheme";
36     private static final String structConfId = "_struct_conf";
37     private static final String structSheetId = "_struct_sheet_range";
38     private static final String[] ids = {entryId,atomSitesAltId,atomSiteId,pdbxPolySeqId,structConfId,structSheetId};
39 duarte 306
40     private TreeMap<String,Integer> ids2elements; // map of ids to element serials
41 duarte 307 private TreeMap<String,String> fields2values; // map of field names (id.field) to values (for non-loop elements)
42     private TreeMap<String,Integer> fields2indices; // map of field names (id.field) to index (for loop elements)
43 duarte 306 private TreeSet<Integer> loopElements; // contains list of elements that are of loop type
44     private TreeMap<Integer,Interval> loopelements2contentIndex; // begin and end line index of each loop element
45 duarte 309
46     private String altLoc;
47 duarte 307
48 duarte 306 /**
49     * Constructs Pdb object given pdb code and pdb chain code.
50     * Model will be DEFAULT_MODEL
51     * @param ciffile
52     * @param pdbChainCode
53     * @throws PdbChainCodeNotFoundError
54     * @throws IOException
55     */
56     public CiffilePdb (String ciffile, String pdbChainCode) throws PdbChainCodeNotFoundError, IOException {
57     this(ciffile, pdbChainCode, DEFAULT_MODEL);
58     }
59    
60     /**
61     * Constructs Pdb object given pdb code, pdb chain code, model serial, source db and a MySQLConnection.
62     * The db must be a pdbase database
63     * @param ciffile
64     * @param pdbChainCode
65     * @param model_serial
66     * @throws PdbChainCodeNotFoundError
67     * @throws IOException
68     */
69     public CiffilePdb (String ciffile, String pdbChainCode, int model_serial) throws PdbChainCodeNotFoundError, IOException {
70     this.ciffile = ciffile;
71     this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
72     this.model=model_serial;
73    
74     parseCifFile();
75    
76     this.pdbCode = readPdbCode();
77    
78 duarte 309 readAtomAltLocs(); // sets altLoc String (needed in parseLoopElements to get the right alt atom locations in reading atom_site)
79    
80 duarte 306 secondaryStructure = new SecondaryStructure(); // create empty secondary structure first to make sure object is not null
81    
82     parseLoopElements(); // populates resser_atom2atomserial, resser2restype, atomser2coord, atomser2resser, sequence, pdbresser2resser, secondaryStructure
83    
84     this.fullLength = sequence.length();
85    
86     this.obsLength = resser2restype.size();
87    
88     if(!secondaryStructure.isEmpty()) {
89     secondaryStructure.setComment("CIFfile");
90     }
91    
92     // we initialise resser2pdbresser from the pdbresser2resser HashMap
93     this.resser2pdbresser = new HashMap<Integer, String>();
94     for (String pdbresser:pdbresser2resser.keySet()){
95     resser2pdbresser.put(pdbresser2resser.get(pdbresser), pdbresser);
96     }
97    
98     // initialising atomser2atom from resser_atom2atomserial
99     atomser2atom = new HashMap<Integer, String>();
100     for (String resser_atom:resser_atom2atomserial.keySet()){
101     int atomserial = resser_atom2atomserial.get(resser_atom);
102     String atom = resser_atom.split("_")[1];
103     atomser2atom.put(atomserial,atom);
104     }
105     }
106    
107     private void parseCifFile() throws IOException{
108     // data structures to store the parsed fields
109     ids2elements = new TreeMap<String, Integer>();
110 duarte 307 fields2indices = new TreeMap<String,Integer>();
111 duarte 306 fields2values = new TreeMap<String, String>();
112     loopElements = new TreeSet<Integer>(); // contains list of elements that are of loop type
113     loopelements2contentIndex = new TreeMap<Integer,Interval>();
114 duarte 307 TreeMap<String,Integer> fieldsIdx = new TreeMap<String,Integer>(); // this map holds the field index counters for each element id
115 duarte 306
116     BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
117     int element = 0;
118     String line;
119     line = fcif.readLine(); // skip first line
120     int linecount = 1; // we have read one line already, we initialise count to 1
121     while((line = fcif.readLine()) != null ) {
122     linecount++;
123     if (line.startsWith("#")) {
124     element++;
125     continue;
126     }
127     if (line.startsWith("loop_")) {
128     loopElements.add(element);
129     continue;
130     }
131    
132     for (String id:ids){
133 duarte 307 if (!fieldsIdx.containsKey(id)) fieldsIdx.put(id,0);
134 duarte 306 Pattern p = Pattern.compile("^"+id+"\\.(\\w+)(?:\\s+(.*))?$");
135     Matcher m = p.matcher(line);
136     if (m.find()){
137     ids2elements.put(id,element);
138 duarte 307 String field = id + "." + m.group(1);
139 duarte 306 if (!loopElements.contains(element)) { // if not a loop element
140 duarte 307 fields2values.put(field, m.group(2)); // 2nd capture group only matches for non-loops where the value of the field is in same line as field name
141     } else { // for loop elements we fill the fields2indices TreeMap
142     fields2indices.put(field,fieldsIdx.get(id));
143 duarte 306 }
144 duarte 307 fieldsIdx.put(id,fieldsIdx.get(id)+1);
145 duarte 306 continue;
146     }
147     }
148     if (!line.startsWith("_") && !line.startsWith("#")){ // not in field definition, we are in values of a loop element
149     if (ids2elements.containsValue(element)) { // if this is one of the fields we want to parse (members of String[] ids)
150     if (!loopelements2contentIndex.containsKey(element)) {
151     //loopelements2content.put(element,line+"\n");
152     Interval interval = new Interval(linecount,linecount);
153     loopelements2contentIndex.put(element,interval);
154     } else {
155     //loopelements2content.put(element,loopelements2content.get(element)+line+"\n");
156     loopelements2contentIndex.get(element).end=linecount;
157     }
158     }
159     }
160     } // end scanning lines
161    
162     fcif.close();
163     }
164    
165     private String readPdbCode(){
166     return fields2values.get("_entry.id").trim();
167     }
168    
169 duarte 309 private void readAtomAltLocs() throws IOException {
170     // The read of the atom_sites_alt element must be done in a separate scan of the file, previous to scanning the atom_site element
171     // This is because the order of the different elements in the cif files is not guaranteed, so atom_sites_alt can come before or after atom_site
172     // (and altLoc needs to be set before starting reading the atom_site element)
173    
174     ArrayList<String> altLocs = new ArrayList<String>();
175     // we initialise to ".", this is the default value in the cif files for the alt loc field. If no atom_sites_alt is present it's ok to stay with this value
176     altLoc = ".";
177    
178     // atom_sites_alt element is optional
179     Interval intAtomSitesAlt = null;
180     if (ids2elements.containsKey(atomSitesAltId)){
181     intAtomSitesAlt = loopelements2contentIndex.get(ids2elements.get(atomSitesAltId));
182     }
183    
184     BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
185     String line;
186     int linecount=0;
187     while((line = fcif.readLine()) != null ) {
188     linecount++;
189     // atom_sites_alt (optional element)
190     if (intAtomSitesAlt!=null && linecount>=intAtomSitesAlt.beg && linecount<=intAtomSitesAlt.end){
191     int idIdx = fields2indices.get(atomSitesAltId+".id");
192     // id=0
193     // A ?
194     String[] tokens = line.split("\\s+");
195     if (!tokens[idIdx].equals(".")) {
196     altLocs.add(tokens[idIdx]);
197     }
198     }
199     }
200     fcif.close();
201     if (!altLocs.isEmpty()){
202     altLoc = Collections.min(altLocs);
203     }
204     }
205    
206 duarte 306 private void parseLoopElements() throws IOException, PdbChainCodeNotFoundError {
207     resser_atom2atomserial = new HashMap<String,Integer>();
208     resser2restype = new HashMap<Integer,String>();
209     atomser2coord = new HashMap<Integer,Point3d>();
210     atomser2resser = new HashMap<Integer,Integer>();
211     pdbresser2resser = new HashMap<String, Integer>();
212     sequence = "";
213     secondaryStructure = new SecondaryStructure();
214    
215     ArrayList<String> aalist=AA.aas(); // list of standard 3 letter code aminoacids
216    
217     String chainCodeStr=pdbChainCode;
218     if (pdbChainCode.equals("NULL")) chainCodeStr="A";
219    
220 duarte 307 Interval intAtomSite = loopelements2contentIndex.get(ids2elements.get(atomSiteId));
221     Interval intPdbxPoly = loopelements2contentIndex.get(ids2elements.get(pdbxPolySeqId));
222 duarte 306 // struct_conf element is optional
223     Interval intStructConf = null;
224 duarte 307 if (ids2elements.containsKey(structConfId)){
225     intStructConf = loopelements2contentIndex.get(ids2elements.get(structConfId));
226 duarte 306 }
227     // struct_sheet_range element is optional
228     Interval intStructSheet = null;
229 duarte 307 if (ids2elements.containsKey(structSheetId)) {
230     intStructSheet = loopelements2contentIndex.get(ids2elements.get(structSheetId));
231 duarte 306 }
232    
233     boolean empty = true;
234     BufferedReader fcif = new BufferedReader(new FileReader(new File(ciffile)));
235     String line;
236     int linecount=0;
237     while((line = fcif.readLine()) != null ) {
238     linecount++;
239     // atom_site
240 duarte 307 if (linecount>=intAtomSite.beg && linecount<=intAtomSite.end){
241 duarte 308 int groupPdbIdx = fields2indices.get(atomSiteId+".group_PDB");
242 duarte 307 int idIdx = fields2indices.get(atomSiteId+".id");
243     int labelAtomIdIdx = fields2indices.get(atomSiteId+".label_atom_id");
244     int labelAltIdIdx = fields2indices.get(atomSiteId+".label_alt_id");
245     int labelCompIdIdx = fields2indices.get(atomSiteId+".label_comp_id");
246     int labelAsymIdIdx = fields2indices.get(atomSiteId+".label_asym_id");
247     int labelSeqIdIdx = fields2indices.get(atomSiteId+".label_seq_id");
248     int cartnXIdx = fields2indices.get(atomSiteId+".Cartn_x");
249     int cartnYIdx = fields2indices.get(atomSiteId+".Cartn_y");
250     int cartnZIdx = fields2indices.get(atomSiteId+".Cartn_z");
251     int authAsymIdIdx = fields2indices.get(atomSiteId+".auth_asym_id");
252     int pdbxPDBModelNumIdx = fields2indices.get(atomSiteId+".pdbx_PDB_model_num");
253 duarte 308 // group_PDB=0, auth_asym_id=22, pdbx_PDB_model_num=24, label_alt_id=4, id=1, label_atom_id=3, label_comp_id=5, label_asym_id=6, label_seq_id=8, Cartn_x=10, Cartn_y=11, Cartn_z=12
254 duarte 306 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 151617181920 2122 23 24
255     //ATOM 2 C CA . MET A 1 1 ? 38.591 8.543 15.660 1.00 77.79 ? ? ? ? ? 1 MET A CA 1
256     String[] tokens = line.split("\\s+");
257 duarte 308 if (tokens[groupPdbIdx].equals("ATOM") && tokens[authAsymIdIdx].equals(chainCodeStr) && Integer.parseInt(tokens[pdbxPDBModelNumIdx])==model) { // match our given chain and model
258 duarte 306 empty = false;
259 duarte 307 if (tokens[labelAltIdIdx].equals(".") || tokens[labelAltIdIdx].equals(altLoc)) { // don't read lines with something else as "." or altLoc
260     int atomserial=Integer.parseInt(tokens[idIdx]); // id
261     String atom = tokens[labelAtomIdIdx]; // label_atom_id
262     String res_type = tokens[labelCompIdIdx]; // label_comp_id
263     chainCode = tokens[labelAsymIdIdx]; // label_asym_id
264     int res_serial = Integer.parseInt(tokens[labelSeqIdIdx]); // label_seq_id
265     double x = Double.parseDouble(tokens[cartnXIdx]); // Cartn_x
266     double y = Double.parseDouble(tokens[cartnYIdx]); // Cartn_y
267     double z = Double.parseDouble(tokens[cartnZIdx]); // Cartn_z
268 duarte 306 Point3d coords = new Point3d(x,y,z);
269     if (aalist.contains(res_type)) {
270     atomser2coord.put(atomserial, coords);
271     atomser2resser.put(atomserial, res_serial);
272     resser2restype.put(res_serial, res_type);
273     ArrayList<String> atomlist = aas2atoms.get(res_type);
274     atomlist.add("OXT"); // the extra atom OXT is there in the last residue of the chain
275     if (atomlist.contains(atom)){
276     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
277     }
278     }
279     }
280     }
281     continue;
282     }
283     // pdbx_poly_seq_scheme
284     if (linecount>=intPdbxPoly.beg && linecount<=intPdbxPoly.end){
285 duarte 307 int seqIdIdx = fields2indices.get(pdbxPolySeqId+".seq_id");
286     int authSeqNumIdx = fields2indices.get(pdbxPolySeqId+".auth_seq_num");
287     int pdbInsCodeIdx = fields2indices.get(pdbxPolySeqId+".pdb_ins_code");
288     int monIdIdx = fields2indices.get(pdbxPolySeqId+".mon_id");
289 duarte 308 int pdbStrandIdIdx = fields2indices.get(pdbxPolySeqId+".pdb_strand_id");
290 duarte 306 // asym_id=0, seq_id=2, auth_seq_num=6, pdb_ins_code=10, mon_id=3
291     // 0 1 2 3 4 5 6 7 8 910
292     // A 1 1 ASP 1 1 1 ASP ASP A .
293     String[] tokens = line.split("\\s+");
294 duarte 308 if (tokens[pdbStrandIdIdx].equals(chainCodeStr)) { // we can't rely on using chainCode, because the order of elements is not guranteed (pdbx_poly_seq_scheme doesn't always come after atom_site)
295 duarte 307 int res_serial = Integer.parseInt(tokens[seqIdIdx]); // seq_id
296     String pdb_res_serial = tokens[authSeqNumIdx]; // auth_seq_num
297     String pdb_ins_code = tokens[pdbInsCodeIdx]; // pdb_ins_code
298 duarte 306 String pdb_res_serial_with_icode = pdb_res_serial;
299     if (!pdb_ins_code.equals(".")) {
300     pdb_res_serial_with_icode=pdb_res_serial+pdb_ins_code;
301     }
302 duarte 307 String res_type = tokens[monIdIdx]; // mon_id
303 duarte 306 // sequence
304     if (aalist.contains(res_type)){
305     sequence+=AA.threeletter2oneletter(res_type);
306     } else {
307     sequence+=NONSTANDARD_AA_LETTER;
308     }
309     // pdbresser2resser
310 duarte 308 if (!pdb_res_serial_with_icode.equals("?")) { // question marks are author missing serials, we don't want them in the map
311     pdbresser2resser.put(pdb_res_serial_with_icode,res_serial);
312     }
313 duarte 306 }
314     continue;
315     }
316     // struct_conf (optional element), HELIX and TURN secondary structure
317     if (intStructConf!=null && linecount>=intStructConf.beg && linecount<=intStructConf.end){
318 duarte 307 int idIdx = fields2indices.get(structConfId+".id");
319 duarte 308 int begAuthAsymIdIdx = fields2indices.get(structConfId+".beg_auth_asym_id");
320 duarte 307 int begLabelSeqIdIdx = fields2indices.get(structConfId+".beg_label_seq_id");
321     int endLabelSeqIdIdx = fields2indices.get(structConfId+".end_label_seq_id");
322 duarte 306 //id=1, beg_label_seq_id=5, end_label_seq_id=9, beg_label_asym_id=4
323     // 0 1 2 3 4 5 6 7 8 9 10 111213 1415 16 1718 19
324     //HELX_P HELX_P1 1 ASN A 2 ? GLY A 12 ? ASN A 2 GLY A 12 1 ? 11
325     String[] tokens = line.split("\\s+");
326 duarte 308 if (tokens[begAuthAsymIdIdx].equals(chainCodeStr)) { // we don't have yet chainCode (normally struct_conf appears before atom_site in cif file), so we need to use the auth_asym_id
327 duarte 307 String id = tokens[idIdx];
328 duarte 306 Pattern p = Pattern.compile("^(\\w).+_P(\\d)+$");
329     Matcher m = p.matcher(id);
330     String ssId="Unknown";
331     if (m.find()){
332     ssId = m.group(1)+m.group(2); // e.g.: Hnn (helices) or Tnn (turns)
333     }
334 duarte 307 int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
335     int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
336 duarte 306 char ssType = SecStrucElement.OTHER;
337     if(id.startsWith("H")) {
338     ssType = SecStrucElement.HELIX;
339     } else if(id.startsWith("T")) {
340     ssType = SecStrucElement.TURN;
341     } else {
342     System.err.println("Unknown secondary structure type " + id + " encountered when reading from ciffile. Skipping.");
343     }
344     if(ssType != SecStrucElement.OTHER) {
345     SecStrucElement ssElem = new SecStrucElement(ssType, beg, end, ssId);
346     secondaryStructure.add(ssElem);
347     }
348     }
349     continue;
350     }
351     // struct_sheet_range (optional element), SHEETs
352     if (intStructSheet!=null && linecount>=intStructSheet.beg && linecount<=intStructSheet.end){
353 duarte 307 int sheetIdIdx = fields2indices.get(structSheetId+".sheet_id");
354     int idIdx = fields2indices.get(structSheetId+".id");
355 duarte 308 int begAuthAsymIdIdx = fields2indices.get(structSheetId+".beg_auth_asym_id");
356 duarte 307 int begLabelSeqIdIdx = fields2indices.get(structSheetId+".beg_label_seq_id");
357     int endLabelSeqIdIdx = fields2indices.get(structSheetId+".end_label_seq_id");
358 duarte 306 //sheet_id=0, id=1, beg_label_seq_id=4, end_label_seq_id=8, beg_label_asym_id=3
359     //0 1 2 3 4 5 6 7 8 910 1112 13 1415 16
360     //A 1 ARG A 14 ? LYS A 19 ? ? ARG A 14 LYS A 19
361     String[] tokens = line.split("\\s+");
362 duarte 308 if (tokens[begAuthAsymIdIdx].equals(chainCodeStr)){ // we don't have yet chainCode (normally struct_sheet_range appears before atom_site in cif file), so we need to use the auth_asym_id
363 duarte 307 String sheetid = tokens[sheetIdIdx];
364     int id = Integer.parseInt(tokens[idIdx]);
365     int beg = Integer.parseInt(tokens[begLabelSeqIdIdx]);
366     int end = Integer.parseInt(tokens[endLabelSeqIdIdx]);
367 duarte 306 String ssId=SecStrucElement.STRAND+sheetid+id; // e.g.: SA1, SA2..., SB1, SB2,...
368     SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND, beg, end, ssId);
369     secondaryStructure.add(ssElem);
370     }
371     continue;
372     }
373     }
374     fcif.close();
375     if (empty) { // no atom data was found for given pdb chain code and model
376     throw new PdbChainCodeNotFoundError("Couldn't find _atom_site data for given pdbChainCode: "+pdbChainCode+", model: "+model);
377     }
378     }
379     }