ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/PdbfilePdb.java
Revision: 258
Committed: Fri Aug 10 16:06:53 2007 UTC (17 years, 1 month ago) by duarte
File size: 11278 byte(s)
Log Message:
Now reading sequence from SEQRES lines. This fixes bug in cmview window was not showing whole contact map because of size of contact map was set to number of observed residues size
Line User Rev File contents
1 duarte 207 package proteinstructure;
2    
3     import java.io.BufferedReader;
4     import java.io.File;
5     import java.io.FileNotFoundException;
6     import java.io.FileReader;
7     import java.io.IOException;
8     import java.util.ArrayList;
9     import java.util.Collections;
10     import java.util.HashMap;
11 duarte 222 import java.util.TreeMap;
12 duarte 207 import java.util.regex.Matcher;
13     import java.util.regex.Pattern;
14    
15 duarte 226 import javax.vecmath.Point3d;
16    
17 duarte 207 public class PdbfilePdb extends Pdb {
18    
19 duarte 208 private static final String UNKNOWN_STRING ="Unknown";
20     private static final String NULL_chainCode = "A";
21    
22 duarte 207 private String pdbfile;
23    
24     /**
25     * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
26     * @param pdbfile
27     * @param pdbChainCode
28     * @throws FileNotFoundException
29     * @throws IOException
30 duarte 208 * @throws PdbfileFormatError
31 stehr 215 * @throws PdbChainCodeNotFoundError
32 duarte 207 */
33 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
34 duarte 207 this(pdbfile,pdbChainCode,DEFAULT_MODEL);
35     }
36    
37     /**
38     * Constructs Pdb object reading from pdb file given pdb chain code and model serial
39     * @param pdbfile
40     * @param pdbChainCode
41     * @param model_serial
42     * @throws FileNotFoundException
43     * @throws IOException
44 duarte 208 * @throws PdbfileFormatError
45 stehr 215 * @throws PdbChainCodeNotFoundError
46 duarte 207 */
47 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode, int model_serial) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
48 duarte 207 this.pdbfile = pdbfile;
49     this.model=model_serial;
50 duarte 208 this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file
51 stehr 217 this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
52 duarte 208 // we set chainCode to pdbChainCode except for case NULL where we use "A"
53     this.chainCode=pdbChainCode;
54     if (pdbChainCode.equals("NULL")) this.chainCode=NULL_chainCode;
55    
56 duarte 258 this.sequence=""; // we initialise it to empty string, then is set in read_pdb_data_from_file
57 duarte 208
58 duarte 222 // we initialise the resser2secstruct and secstruct2resinterval Maps to empty, if no sec structure info is found then it remains empty
59 duarte 219 this.resser2secstruct = new HashMap<Integer, String>();
60 duarte 222 this.secstruct2resinterval = new TreeMap<String, Interval>();
61 duarte 219
62 duarte 207 read_pdb_data_from_file();
63 duarte 208
64     // when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know
65     //TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used
66     this.resser2pdbresser = new HashMap<Integer, String>();
67     this.pdbresser2resser = new HashMap<String, Integer>();
68     for (int resser:resser2restype.keySet()){
69     resser2pdbresser.put(resser, String.valueOf(resser));
70     pdbresser2resser.put(String.valueOf(resser), resser);
71     }
72 duarte 219
73 duarte 237 // initialising atomser2atom from resser_atom2atomserial
74     atomser2atom = new HashMap<Integer, String>();
75     for (String resser_atom:resser_atom2atomserial.keySet()){
76     int atomserial = resser_atom2atomserial.get(resser_atom);
77     String atom = resser_atom.split("_")[1];
78     atomser2atom.put(atomserial,atom);
79     }
80 duarte 207 }
81    
82    
83    
84     /**
85 duarte 219 * To read the pdb data (atom coordinates, residue serials, atom serials) from file.
86     * chainCode gets set to same as pdbChainCode, except if input chain code NULL then chainCode will be 'A'
87 duarte 207 * pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found
88 duarte 258 * sequence gets set to the sequence read from ATOM lines (i.e. observed residues only)
89 duarte 219 * No insertion codes are parsed or taken into account at the moment. Thus files with
90     * insertion codes will be incorrectly read
91 duarte 207 * @param pdbfile
92     * @throws FileNotFoundException
93     * @throws IOException
94 duarte 208 * @throws PdbfileFormatError
95 stehr 215 * @throws PdbChainCodeNotFoundError
96 duarte 207 */
97 stehr 215 private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
98 duarte 207 resser_atom2atomserial = new HashMap<String,Integer>();
99     resser2restype = new HashMap<Integer,String>();
100 duarte 226 atomser2coord = new HashMap<Integer,Point3d>();
101 duarte 207 atomser2resser = new HashMap<Integer,Integer>();
102 stehr 255 Pattern p;
103     Matcher m;
104 duarte 207 boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model
105 duarte 208 // we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
106     String chainCodeStr=pdbChainCode;
107     if (pdbChainCode.equals("NULL")) chainCodeStr=" ";
108    
109 duarte 207 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
110     BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
111 duarte 208 int linecount=0;
112 duarte 207 String line;
113 stehr 255 // read first line
114     if((line = fpdb.readLine()) != null ) {
115     linecount = 1;
116 duarte 219 // HEADER
117 stehr 255 p = Pattern.compile("^HEADER");
118     m = p.matcher(line);
119 duarte 207 if (m.find()){
120     Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
121     Matcher mh = ph.matcher(line);
122     if (mh.find()) {
123     pdbCode=mh.group(1).toLowerCase();
124     }
125 stehr 255 } else { // header not found
126     // check whether this is a Casp prediction file
127     p = Pattern.compile("^PFRMAT TS");
128     m = p.matcher(line);
129     if(m.find()) {
130     // ok, it is
131     pdbCode = "CASP";
132     } else {
133     // a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception
134     throw new PdbfileFormatError("The pdb file "+pdbfile+" doesn't seem to have the right format");
135     }
136 duarte 207 }
137 stehr 255 } else {
138     throw new PdbfileFormatError("The file "+pdbfile+" is empty.");
139     }
140     // read further lines
141     while ((line = fpdb.readLine() ) != null ) {
142     linecount++;
143 duarte 258 // SEQRES
144     //SEQRES 1 A 348 VAL ASN ILE LYS THR ASN PRO PHE LYS ALA VAL SER PHE
145     p = Pattern.compile("^SEQRES.{5}"+chainCodeStr);
146     m = p.matcher(line);
147     if (m.find()){
148     for (int i=19;i<=67;i+=4) {
149     if (!line.substring(i, i+3).equals(" ")) {
150     sequence+= AA.threeletter2oneletter(line.substring(i, i+3));
151     }
152     }
153     }
154 duarte 219 // SECONDARY STRUCTURE
155     // helix
156     //HELIX 1 1 LYS A 17 LEU A 26 1
157     // helix ser beg res ser end res ser
158     p = Pattern.compile("^HELIX..(...).{9}"+chainCodeStr+".(....).{6}"+chainCodeStr+".(....)");
159     m = p.matcher(line);
160     if (m.find()){
161     int serial = Integer.valueOf(m.group(1).trim());
162     int beg = Integer.valueOf(m.group(2).trim());
163     int end = Integer.valueOf(m.group(3).trim());
164     String ssId = "H"+serial;
165 duarte 222 secstruct2resinterval.put(ssId, new Interval(beg,end));
166 duarte 219 for (int i=beg;i<=end;i++){
167     if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
168 stehr 250 //System.err.println("Inconsistency in secondary structure assignment. " +
169     // "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
170 duarte 219 }
171     resser2secstruct.put(i,ssId);
172     }
173     }
174     // sheet
175     //SHEET 2 A 5 ILE A 96 THR A 99 -1 N LYS A 98 O THR A 107
176     // strand ser sheet id beg res ser end res ser
177     p = Pattern.compile("^SHEET..(...).(...).{7}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
178     m = p.matcher(line);
179     if (m.find()){
180     int strandSerial = Integer.valueOf(m.group(1).trim());
181     String sheetId = m.group(2).trim();
182     int beg = Integer.valueOf(m.group(3).trim());
183     int end = Integer.valueOf(m.group(4).trim());
184     String ssId = "S"+sheetId+strandSerial;
185 duarte 222 secstruct2resinterval.put(ssId, new Interval(beg,end));
186 duarte 219 for (int i=beg;i<=end;i++){
187     if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
188 stehr 250 //System.err.println("Inconsistency in secondary structure assignment. " +
189     // "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
190 duarte 219 }
191     resser2secstruct.put(i,ssId);
192     }
193     }
194     // we've stored the sec structure info in the strands2begEnd and sheets2strands maps.
195     // the assignment to resser2secstruct is done when we reach the ATOM lines, see below
196     // turn
197     //TURN 1 S1A GLY A 16 GLN A 18 SURFACE
198     // turn ser beg res ser end res ser
199     p = Pattern.compile("^TURN...(...).{9}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
200     m = p.matcher(line);
201     if (m.find()){
202     int serial = Integer.valueOf(m.group(1).trim());
203     int beg = Integer.valueOf(m.group(2).trim());
204     int end = Integer.valueOf(m.group(3).trim());
205     String ssId = "T"+serial;
206 duarte 222 secstruct2resinterval.put(ssId, new Interval(beg,end));
207 duarte 219 for (int i=beg;i<=end;i++){
208     if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
209 stehr 250 //System.err.println("Inconsistency in secondary structure assignment. " +
210     // "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
211 duarte 219 }
212     resser2secstruct.put(i,ssId);
213     }
214     }
215     // MODEL
216 duarte 207 p = Pattern.compile("^MODEL\\s+(\\d+)");
217     m = p.matcher(line);
218     if (m.find()){
219     thismodel=Integer.parseInt(m.group(1));
220     }
221     if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
222 duarte 219 // ATOM
223 duarte 207 p = Pattern.compile("^ATOM");
224     m = p.matcher(line);
225     if (m.find()){
226 duarte 208 // serial atom res_type chain res_ser x y z
227     Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
228 duarte 207 Matcher ml = pl.matcher(line);
229     if (ml.find()) {
230     empty=false;
231     int atomserial=Integer.parseInt(ml.group(1).trim());
232     String atom = ml.group(2).trim();
233     String res_type = ml.group(3).trim();
234     int res_serial = Integer.parseInt(ml.group(4).trim());
235     double x = Double.parseDouble(ml.group(5).trim());
236     double y = Double.parseDouble(ml.group(6).trim());
237     double z = Double.parseDouble(ml.group(7).trim());
238 duarte 226 Point3d coords = new Point3d(x,y,z);
239 duarte 207 ArrayList<String> aalist=AA.aas();
240     if (aalist.contains(res_type)) {
241     atomser2coord.put(atomserial, coords);
242     atomser2resser.put(atomserial, res_serial);
243     resser2restype.put(res_serial, res_type);
244     ArrayList<String> atomlist = aas2atoms.get(res_type);
245     if (atomlist.contains(atom)){
246     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
247     }
248     }
249     }
250     }
251     }
252     fpdb.close();
253 stehr 215 if (empty) {
254     //System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model);
255     throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model);
256     }
257 duarte 258 if (sequence.equals("")){
258     // if we couldn't read anything from SEQRES then we read it from the resser2restype HashMap
259     // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
260     ArrayList<Integer> ressers = new ArrayList<Integer>();
261     for (int resser:resser2restype.keySet()) {
262     ressers.add(resser);
263     }
264     Collections.sort(ressers);
265     for (int resser:ressers){
266     String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
267     sequence += oneletter;
268     }
269 duarte 207 }
270     }
271 duarte 219
272 duarte 207 }