ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/PdbfilePdb.java
Revision: 278
Committed: Tue Aug 14 12:14:04 2007 UTC (17 years, 2 months ago) by duarte
File size: 11089 byte(s)
Log Message:
Fixed bug with getFullLength when reading from pdb file
Now if SEQRES was missing then fullLength is maximum observed residue number instead of total number of observed residue numbers
New member variables obsLength and fullLength
Line User Rev File contents
1 duarte 207 package proteinstructure;
2    
3     import java.io.BufferedReader;
4     import java.io.File;
5     import java.io.FileNotFoundException;
6     import java.io.FileReader;
7     import java.io.IOException;
8     import java.util.ArrayList;
9     import java.util.Collections;
10     import java.util.HashMap;
11     import java.util.regex.Matcher;
12     import java.util.regex.Pattern;
13    
14 duarte 226 import javax.vecmath.Point3d;
15    
16 duarte 207 public class PdbfilePdb extends Pdb {
17    
18 duarte 208 private static final String UNKNOWN_STRING ="Unknown";
19     private static final String NULL_chainCode = "A";
20    
21 duarte 207 private String pdbfile;
22    
23     /**
24     * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
25     * @param pdbfile
26     * @param pdbChainCode
27     * @throws FileNotFoundException
28     * @throws IOException
29 duarte 208 * @throws PdbfileFormatError
30 stehr 215 * @throws PdbChainCodeNotFoundError
31 duarte 207 */
32 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
33 duarte 207 this(pdbfile,pdbChainCode,DEFAULT_MODEL);
34     }
35    
36     /**
37     * Constructs Pdb object reading from pdb file given pdb chain code and model serial
38     * @param pdbfile
39     * @param pdbChainCode
40     * @param model_serial
41     * @throws FileNotFoundException
42     * @throws IOException
43 duarte 208 * @throws PdbfileFormatError
44 stehr 215 * @throws PdbChainCodeNotFoundError
45 duarte 207 */
46 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode, int model_serial) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
47 duarte 207 this.pdbfile = pdbfile;
48     this.model=model_serial;
49 duarte 208 this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file
50 stehr 217 this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
51 duarte 208 // we set chainCode to pdbChainCode except for case NULL where we use "A"
52     this.chainCode=pdbChainCode;
53     if (pdbChainCode.equals("NULL")) this.chainCode=NULL_chainCode;
54    
55 duarte 258 this.sequence=""; // we initialise it to empty string, then is set in read_pdb_data_from_file
56 duarte 208
57 stehr 274 // we initialise the secondary structure to empty, if no sec structure info is found then they remain empty
58     this.secondaryStructure = new SecondaryStructure();
59 duarte 207 read_pdb_data_from_file();
60 duarte 278
61     this.obsLength = resser2restype.size();
62    
63 stehr 274 if(!secondaryStructure.isEmpty()) {
64     secondaryStructure.setComment("Author");
65 stehr 259 }
66 duarte 208
67     // when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know
68     //TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used
69     this.resser2pdbresser = new HashMap<Integer, String>();
70     this.pdbresser2resser = new HashMap<String, Integer>();
71     for (int resser:resser2restype.keySet()){
72     resser2pdbresser.put(resser, String.valueOf(resser));
73     pdbresser2resser.put(String.valueOf(resser), resser);
74     }
75 duarte 219
76 duarte 237 // initialising atomser2atom from resser_atom2atomserial
77     atomser2atom = new HashMap<Integer, String>();
78     for (String resser_atom:resser_atom2atomserial.keySet()){
79     int atomserial = resser_atom2atomserial.get(resser_atom);
80     String atom = resser_atom.split("_")[1];
81     atomser2atom.put(atomserial,atom);
82     }
83 duarte 207 }
84    
85    
86    
87     /**
88 duarte 219 * To read the pdb data (atom coordinates, residue serials, atom serials) from file.
89     * chainCode gets set to same as pdbChainCode, except if input chain code NULL then chainCode will be 'A'
90 duarte 207 * pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found
91 duarte 258 * sequence gets set to the sequence read from ATOM lines (i.e. observed residues only)
92 duarte 219 * No insertion codes are parsed or taken into account at the moment. Thus files with
93     * insertion codes will be incorrectly read
94 duarte 207 * @param pdbfile
95     * @throws FileNotFoundException
96     * @throws IOException
97 duarte 208 * @throws PdbfileFormatError
98 stehr 215 * @throws PdbChainCodeNotFoundError
99 duarte 207 */
100 stehr 215 private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
101 duarte 207 resser_atom2atomserial = new HashMap<String,Integer>();
102     resser2restype = new HashMap<Integer,String>();
103 duarte 226 atomser2coord = new HashMap<Integer,Point3d>();
104 duarte 207 atomser2resser = new HashMap<Integer,Integer>();
105 stehr 255 Pattern p;
106     Matcher m;
107 duarte 207 boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model
108 duarte 208 // we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
109     String chainCodeStr=pdbChainCode;
110     if (pdbChainCode.equals("NULL")) chainCodeStr=" ";
111    
112 duarte 207 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
113     BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
114 duarte 208 int linecount=0;
115 duarte 207 String line;
116 stehr 255 // read first line
117     if((line = fpdb.readLine()) != null ) {
118     linecount = 1;
119 duarte 219 // HEADER
120 stehr 255 p = Pattern.compile("^HEADER");
121     m = p.matcher(line);
122 duarte 207 if (m.find()){
123     Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
124     Matcher mh = ph.matcher(line);
125     if (mh.find()) {
126     pdbCode=mh.group(1).toLowerCase();
127     }
128 stehr 255 } else { // header not found
129     // check whether this is a Casp prediction file
130     p = Pattern.compile("^PFRMAT TS");
131     m = p.matcher(line);
132     if(m.find()) {
133     // ok, it is
134     pdbCode = "CASP";
135     } else {
136     // a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception
137     throw new PdbfileFormatError("The pdb file "+pdbfile+" doesn't seem to have the right format");
138     }
139 duarte 207 }
140 stehr 255 } else {
141     throw new PdbfileFormatError("The file "+pdbfile+" is empty.");
142     }
143     // read further lines
144     while ((line = fpdb.readLine() ) != null ) {
145     linecount++;
146 duarte 258 // SEQRES
147     //SEQRES 1 A 348 VAL ASN ILE LYS THR ASN PRO PHE LYS ALA VAL SER PHE
148     p = Pattern.compile("^SEQRES.{5}"+chainCodeStr);
149     m = p.matcher(line);
150     if (m.find()){
151     for (int i=19;i<=67;i+=4) {
152     if (!line.substring(i, i+3).equals(" ")) {
153 duarte 261 if (AA.threeletter2oneletter(line.substring(i, i+3))!=null) { // for non-standard aas
154     sequence+= AA.threeletter2oneletter(line.substring(i, i+3));
155 duarte 262 } else {
156     sequence+= NONSTANDARD_AA_LETTER;
157 duarte 261 }
158 duarte 258 }
159     }
160     }
161 duarte 219 // SECONDARY STRUCTURE
162     // helix
163     //HELIX 1 1 LYS A 17 LEU A 26 1
164     // helix ser beg res ser end res ser
165     p = Pattern.compile("^HELIX..(...).{9}"+chainCodeStr+".(....).{6}"+chainCodeStr+".(....)");
166     m = p.matcher(line);
167     if (m.find()){
168     int serial = Integer.valueOf(m.group(1).trim());
169     int beg = Integer.valueOf(m.group(2).trim());
170     int end = Integer.valueOf(m.group(3).trim());
171 stehr 274 String ssId = new Character(SecStrucElement.HELIX).toString()+serial;
172     SecStrucElement ssElem = new SecStrucElement(SecStrucElement.HELIX,beg,end,ssId);
173     secondaryStructure.add(ssElem);
174 duarte 219 }
175     // sheet
176     //SHEET 2 A 5 ILE A 96 THR A 99 -1 N LYS A 98 O THR A 107
177     // strand ser sheet id beg res ser end res ser
178     p = Pattern.compile("^SHEET..(...).(...).{7}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
179     m = p.matcher(line);
180     if (m.find()){
181     int strandSerial = Integer.valueOf(m.group(1).trim());
182     String sheetId = m.group(2).trim();
183     int beg = Integer.valueOf(m.group(3).trim());
184     int end = Integer.valueOf(m.group(4).trim());
185 stehr 274 String ssId = new Character(SecStrucElement.STRAND).toString()+sheetId+strandSerial;
186     SecStrucElement ssElem = new SecStrucElement(SecStrucElement.STRAND,beg,end,ssId);
187     secondaryStructure.add(ssElem);
188 duarte 219 }
189     // we've stored the sec structure info in the strands2begEnd and sheets2strands maps.
190     // the assignment to resser2secstruct is done when we reach the ATOM lines, see below
191     // turn
192     //TURN 1 S1A GLY A 16 GLN A 18 SURFACE
193     // turn ser beg res ser end res ser
194     p = Pattern.compile("^TURN...(...).{9}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
195     m = p.matcher(line);
196     if (m.find()){
197     int serial = Integer.valueOf(m.group(1).trim());
198     int beg = Integer.valueOf(m.group(2).trim());
199     int end = Integer.valueOf(m.group(3).trim());
200 stehr 274 String ssId = new Character(SecStrucElement.TURN).toString()+serial;
201     SecStrucElement ssElem = new SecStrucElement(SecStrucElement.TURN,beg,end,ssId);
202     secondaryStructure.add(ssElem);
203 duarte 219 }
204     // MODEL
205 duarte 207 p = Pattern.compile("^MODEL\\s+(\\d+)");
206     m = p.matcher(line);
207     if (m.find()){
208     thismodel=Integer.parseInt(m.group(1));
209     }
210     if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
211 duarte 219 // ATOM
212 duarte 207 p = Pattern.compile("^ATOM");
213     m = p.matcher(line);
214     if (m.find()){
215 duarte 208 // serial atom res_type chain res_ser x y z
216 duarte 263 Pattern pl = Pattern.compile("^.{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
217 duarte 207 Matcher ml = pl.matcher(line);
218     if (ml.find()) {
219     empty=false;
220     int atomserial=Integer.parseInt(ml.group(1).trim());
221     String atom = ml.group(2).trim();
222     String res_type = ml.group(3).trim();
223     int res_serial = Integer.parseInt(ml.group(4).trim());
224     double x = Double.parseDouble(ml.group(5).trim());
225     double y = Double.parseDouble(ml.group(6).trim());
226     double z = Double.parseDouble(ml.group(7).trim());
227 duarte 226 Point3d coords = new Point3d(x,y,z);
228 duarte 207 ArrayList<String> aalist=AA.aas();
229     if (aalist.contains(res_type)) {
230     atomser2coord.put(atomserial, coords);
231     atomser2resser.put(atomserial, res_serial);
232     resser2restype.put(res_serial, res_type);
233     ArrayList<String> atomlist = aas2atoms.get(res_type);
234 duarte 275 atomlist.add("OXT"); // the extra atom OXT is there in the last residue of the chain
235 duarte 207 if (atomlist.contains(atom)){
236     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
237     }
238     }
239     }
240     }
241     }
242     fpdb.close();
243 stehr 215 if (empty) {
244     //System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model);
245     throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model);
246     }
247 duarte 258 if (sequence.equals("")){
248     // if we couldn't read anything from SEQRES then we read it from the resser2restype HashMap
249     // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
250     ArrayList<Integer> ressers = new ArrayList<Integer>();
251     for (int resser:resser2restype.keySet()) {
252     ressers.add(resser);
253     }
254     Collections.sort(ressers);
255     for (int resser:ressers){
256     String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
257     sequence += oneletter;
258     }
259 duarte 278 // not size but maximum: if residue numbering in pdb file is correct, then this takes care of non-observed except for possible non-observed at end of chain
260     fullLength = Collections.max(resser2restype.keySet());
261     } else { // we read the sequence from SEQRES
262     fullLength = sequence.length();
263 duarte 207 }
264     }
265 duarte 219
266 duarte 207 }