ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/PdbfilePdb.java
Revision: 263
Committed: Mon Aug 13 11:48:23 2007 UTC (17 years, 2 months ago) by duarte
File size: 11549 byte(s)
Log Message:
Fixed potential bug in regex for ATOM lines
Line File contents
1 package proteinstructure;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.FileReader;
7 import java.io.IOException;
8 import java.util.ArrayList;
9 import java.util.Collections;
10 import java.util.HashMap;
11 import java.util.TreeMap;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14
15 import javax.vecmath.Point3d;
16
17 public class PdbfilePdb extends Pdb {
18
19 private static final String UNKNOWN_STRING ="Unknown";
20 private static final String NULL_chainCode = "A";
21
22 private String pdbfile;
23
24 /**
25 * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
26 * @param pdbfile
27 * @param pdbChainCode
28 * @throws FileNotFoundException
29 * @throws IOException
30 * @throws PdbfileFormatError
31 * @throws PdbChainCodeNotFoundError
32 */
33 public PdbfilePdb (String pdbfile, String pdbChainCode) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
34 this(pdbfile,pdbChainCode,DEFAULT_MODEL);
35 }
36
37 /**
38 * Constructs Pdb object reading from pdb file given pdb chain code and model serial
39 * @param pdbfile
40 * @param pdbChainCode
41 * @param model_serial
42 * @throws FileNotFoundException
43 * @throws IOException
44 * @throws PdbfileFormatError
45 * @throws PdbChainCodeNotFoundError
46 */
47 public PdbfilePdb (String pdbfile, String pdbChainCode, int model_serial) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
48 this.pdbfile = pdbfile;
49 this.model=model_serial;
50 this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file
51 this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
52 // we set chainCode to pdbChainCode except for case NULL where we use "A"
53 this.chainCode=pdbChainCode;
54 if (pdbChainCode.equals("NULL")) this.chainCode=NULL_chainCode;
55
56 this.sequence=""; // we initialise it to empty string, then is set in read_pdb_data_from_file
57
58 // we initialise the resser2secstruct and secstruct2resinterval Maps to empty, if no sec structure info is found then it remains empty
59 this.resser2secstruct = new HashMap<Integer, String>();
60 this.secstruct2resinterval = new TreeMap<String, Interval>();
61
62 read_pdb_data_from_file();
63 if(!resser2secstruct.isEmpty()) {
64 hasSecondaryStructure = true;
65 secondaryStructureSource = "Author";
66 }
67
68 // when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know
69 //TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used
70 this.resser2pdbresser = new HashMap<Integer, String>();
71 this.pdbresser2resser = new HashMap<String, Integer>();
72 for (int resser:resser2restype.keySet()){
73 resser2pdbresser.put(resser, String.valueOf(resser));
74 pdbresser2resser.put(String.valueOf(resser), resser);
75 }
76
77 // initialising atomser2atom from resser_atom2atomserial
78 atomser2atom = new HashMap<Integer, String>();
79 for (String resser_atom:resser_atom2atomserial.keySet()){
80 int atomserial = resser_atom2atomserial.get(resser_atom);
81 String atom = resser_atom.split("_")[1];
82 atomser2atom.put(atomserial,atom);
83 }
84 }
85
86
87
88 /**
89 * To read the pdb data (atom coordinates, residue serials, atom serials) from file.
90 * chainCode gets set to same as pdbChainCode, except if input chain code NULL then chainCode will be 'A'
91 * pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found
92 * sequence gets set to the sequence read from ATOM lines (i.e. observed residues only)
93 * No insertion codes are parsed or taken into account at the moment. Thus files with
94 * insertion codes will be incorrectly read
95 * @param pdbfile
96 * @throws FileNotFoundException
97 * @throws IOException
98 * @throws PdbfileFormatError
99 * @throws PdbChainCodeNotFoundError
100 */
101 private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
102 resser_atom2atomserial = new HashMap<String,Integer>();
103 resser2restype = new HashMap<Integer,String>();
104 atomser2coord = new HashMap<Integer,Point3d>();
105 atomser2resser = new HashMap<Integer,Integer>();
106 Pattern p;
107 Matcher m;
108 boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model
109 // we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
110 String chainCodeStr=pdbChainCode;
111 if (pdbChainCode.equals("NULL")) chainCodeStr=" ";
112
113 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
114 BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
115 int linecount=0;
116 String line;
117 // read first line
118 if((line = fpdb.readLine()) != null ) {
119 linecount = 1;
120 // HEADER
121 p = Pattern.compile("^HEADER");
122 m = p.matcher(line);
123 if (m.find()){
124 Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
125 Matcher mh = ph.matcher(line);
126 if (mh.find()) {
127 pdbCode=mh.group(1).toLowerCase();
128 }
129 } else { // header not found
130 // check whether this is a Casp prediction file
131 p = Pattern.compile("^PFRMAT TS");
132 m = p.matcher(line);
133 if(m.find()) {
134 // ok, it is
135 pdbCode = "CASP";
136 } else {
137 // a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception
138 throw new PdbfileFormatError("The pdb file "+pdbfile+" doesn't seem to have the right format");
139 }
140 }
141 } else {
142 throw new PdbfileFormatError("The file "+pdbfile+" is empty.");
143 }
144 // read further lines
145 while ((line = fpdb.readLine() ) != null ) {
146 linecount++;
147 // SEQRES
148 //SEQRES 1 A 348 VAL ASN ILE LYS THR ASN PRO PHE LYS ALA VAL SER PHE
149 p = Pattern.compile("^SEQRES.{5}"+chainCodeStr);
150 m = p.matcher(line);
151 if (m.find()){
152 for (int i=19;i<=67;i+=4) {
153 if (!line.substring(i, i+3).equals(" ")) {
154 if (AA.threeletter2oneletter(line.substring(i, i+3))!=null) { // for non-standard aas
155 sequence+= AA.threeletter2oneletter(line.substring(i, i+3));
156 } else {
157 sequence+= NONSTANDARD_AA_LETTER;
158 }
159 }
160 }
161 }
162 // SECONDARY STRUCTURE
163 // helix
164 //HELIX 1 1 LYS A 17 LEU A 26 1
165 // helix ser beg res ser end res ser
166 p = Pattern.compile("^HELIX..(...).{9}"+chainCodeStr+".(....).{6}"+chainCodeStr+".(....)");
167 m = p.matcher(line);
168 if (m.find()){
169 int serial = Integer.valueOf(m.group(1).trim());
170 int beg = Integer.valueOf(m.group(2).trim());
171 int end = Integer.valueOf(m.group(3).trim());
172 String ssId = "H"+serial;
173 secstruct2resinterval.put(ssId, new Interval(beg,end));
174 for (int i=beg;i<=end;i++){
175 if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
176 //System.err.println("Inconsistency in secondary structure assignment. " +
177 // "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
178 }
179 resser2secstruct.put(i,ssId);
180 }
181 }
182 // sheet
183 //SHEET 2 A 5 ILE A 96 THR A 99 -1 N LYS A 98 O THR A 107
184 // strand ser sheet id beg res ser end res ser
185 p = Pattern.compile("^SHEET..(...).(...).{7}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
186 m = p.matcher(line);
187 if (m.find()){
188 int strandSerial = Integer.valueOf(m.group(1).trim());
189 String sheetId = m.group(2).trim();
190 int beg = Integer.valueOf(m.group(3).trim());
191 int end = Integer.valueOf(m.group(4).trim());
192 String ssId = "S"+sheetId+strandSerial;
193 secstruct2resinterval.put(ssId, new Interval(beg,end));
194 for (int i=beg;i<=end;i++){
195 if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
196 //System.err.println("Inconsistency in secondary structure assignment. " +
197 // "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
198 }
199 resser2secstruct.put(i,ssId);
200 }
201 }
202 // we've stored the sec structure info in the strands2begEnd and sheets2strands maps.
203 // the assignment to resser2secstruct is done when we reach the ATOM lines, see below
204 // turn
205 //TURN 1 S1A GLY A 16 GLN A 18 SURFACE
206 // turn ser beg res ser end res ser
207 p = Pattern.compile("^TURN...(...).{9}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
208 m = p.matcher(line);
209 if (m.find()){
210 int serial = Integer.valueOf(m.group(1).trim());
211 int beg = Integer.valueOf(m.group(2).trim());
212 int end = Integer.valueOf(m.group(3).trim());
213 String ssId = "T"+serial;
214 secstruct2resinterval.put(ssId, new Interval(beg,end));
215 for (int i=beg;i<=end;i++){
216 if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
217 //System.err.println("Inconsistency in secondary structure assignment. " +
218 // "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
219 }
220 resser2secstruct.put(i,ssId);
221 }
222 }
223 // MODEL
224 p = Pattern.compile("^MODEL\\s+(\\d+)");
225 m = p.matcher(line);
226 if (m.find()){
227 thismodel=Integer.parseInt(m.group(1));
228 }
229 if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
230 // ATOM
231 p = Pattern.compile("^ATOM");
232 m = p.matcher(line);
233 if (m.find()){
234 // serial atom res_type chain res_ser x y z
235 Pattern pl = Pattern.compile("^.{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
236 Matcher ml = pl.matcher(line);
237 if (ml.find()) {
238 empty=false;
239 int atomserial=Integer.parseInt(ml.group(1).trim());
240 String atom = ml.group(2).trim();
241 String res_type = ml.group(3).trim();
242 int res_serial = Integer.parseInt(ml.group(4).trim());
243 double x = Double.parseDouble(ml.group(5).trim());
244 double y = Double.parseDouble(ml.group(6).trim());
245 double z = Double.parseDouble(ml.group(7).trim());
246 Point3d coords = new Point3d(x,y,z);
247 ArrayList<String> aalist=AA.aas();
248 if (aalist.contains(res_type)) {
249 atomser2coord.put(atomserial, coords);
250 atomser2resser.put(atomserial, res_serial);
251 resser2restype.put(res_serial, res_type);
252 ArrayList<String> atomlist = aas2atoms.get(res_type);
253 if (atomlist.contains(atom)){
254 resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
255 }
256 }
257 }
258 }
259 }
260 fpdb.close();
261 if (empty) {
262 //System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model);
263 throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model);
264 }
265 if (sequence.equals("")){
266 // if we couldn't read anything from SEQRES then we read it from the resser2restype HashMap
267 // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
268 ArrayList<Integer> ressers = new ArrayList<Integer>();
269 for (int resser:resser2restype.keySet()) {
270 ressers.add(resser);
271 }
272 Collections.sort(ressers);
273 for (int resser:ressers){
274 String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
275 sequence += oneletter;
276 }
277 }
278 }
279
280 }