ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/PdbfilePdb.java
Revision: 261
Committed: Mon Aug 13 08:53:11 2007 UTC (17 years, 1 month ago) by duarte
File size: 11492 byte(s)
Log Message:
Fixed reading of SEQRES, was putting nulls for non-standard aas
Line User Rev File contents
1 duarte 207 package proteinstructure;
2    
3     import java.io.BufferedReader;
4     import java.io.File;
5     import java.io.FileNotFoundException;
6     import java.io.FileReader;
7     import java.io.IOException;
8     import java.util.ArrayList;
9     import java.util.Collections;
10     import java.util.HashMap;
11 duarte 222 import java.util.TreeMap;
12 duarte 207 import java.util.regex.Matcher;
13     import java.util.regex.Pattern;
14    
15 duarte 226 import javax.vecmath.Point3d;
16    
17 duarte 207 public class PdbfilePdb extends Pdb {
18    
19 duarte 208 private static final String UNKNOWN_STRING ="Unknown";
20     private static final String NULL_chainCode = "A";
21    
22 duarte 207 private String pdbfile;
23    
24     /**
25     * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
26     * @param pdbfile
27     * @param pdbChainCode
28     * @throws FileNotFoundException
29     * @throws IOException
30 duarte 208 * @throws PdbfileFormatError
31 stehr 215 * @throws PdbChainCodeNotFoundError
32 duarte 207 */
33 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
34 duarte 207 this(pdbfile,pdbChainCode,DEFAULT_MODEL);
35     }
36    
37     /**
38     * Constructs Pdb object reading from pdb file given pdb chain code and model serial
39     * @param pdbfile
40     * @param pdbChainCode
41     * @param model_serial
42     * @throws FileNotFoundException
43     * @throws IOException
44 duarte 208 * @throws PdbfileFormatError
45 stehr 215 * @throws PdbChainCodeNotFoundError
46 duarte 207 */
47 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode, int model_serial) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
48 duarte 207 this.pdbfile = pdbfile;
49     this.model=model_serial;
50 duarte 208 this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file
51 stehr 217 this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
52 duarte 208 // we set chainCode to pdbChainCode except for case NULL where we use "A"
53     this.chainCode=pdbChainCode;
54     if (pdbChainCode.equals("NULL")) this.chainCode=NULL_chainCode;
55    
56 duarte 258 this.sequence=""; // we initialise it to empty string, then is set in read_pdb_data_from_file
57 duarte 208
58 duarte 222 // we initialise the resser2secstruct and secstruct2resinterval Maps to empty, if no sec structure info is found then it remains empty
59 duarte 219 this.resser2secstruct = new HashMap<Integer, String>();
60 duarte 222 this.secstruct2resinterval = new TreeMap<String, Interval>();
61 duarte 219
62 duarte 207 read_pdb_data_from_file();
63 stehr 259 if(!resser2secstruct.isEmpty()) {
64     hasSecondaryStructure = true;
65     secondaryStructureSource = "Author";
66     }
67 duarte 208
68     // when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know
69     //TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used
70     this.resser2pdbresser = new HashMap<Integer, String>();
71     this.pdbresser2resser = new HashMap<String, Integer>();
72     for (int resser:resser2restype.keySet()){
73     resser2pdbresser.put(resser, String.valueOf(resser));
74     pdbresser2resser.put(String.valueOf(resser), resser);
75     }
76 duarte 219
77 duarte 237 // initialising atomser2atom from resser_atom2atomserial
78     atomser2atom = new HashMap<Integer, String>();
79     for (String resser_atom:resser_atom2atomserial.keySet()){
80     int atomserial = resser_atom2atomserial.get(resser_atom);
81     String atom = resser_atom.split("_")[1];
82     atomser2atom.put(atomserial,atom);
83     }
84 duarte 207 }
85    
86    
87    
88     /**
89 duarte 219 * To read the pdb data (atom coordinates, residue serials, atom serials) from file.
90     * chainCode gets set to same as pdbChainCode, except if input chain code NULL then chainCode will be 'A'
91 duarte 207 * pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found
92 duarte 258 * sequence gets set to the sequence read from ATOM lines (i.e. observed residues only)
93 duarte 219 * No insertion codes are parsed or taken into account at the moment. Thus files with
94     * insertion codes will be incorrectly read
95 duarte 207 * @param pdbfile
96     * @throws FileNotFoundException
97     * @throws IOException
98 duarte 208 * @throws PdbfileFormatError
99 stehr 215 * @throws PdbChainCodeNotFoundError
100 duarte 207 */
101 stehr 215 private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
102 duarte 207 resser_atom2atomserial = new HashMap<String,Integer>();
103     resser2restype = new HashMap<Integer,String>();
104 duarte 226 atomser2coord = new HashMap<Integer,Point3d>();
105 duarte 207 atomser2resser = new HashMap<Integer,Integer>();
106 stehr 255 Pattern p;
107     Matcher m;
108 duarte 207 boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model
109 duarte 208 // we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
110     String chainCodeStr=pdbChainCode;
111     if (pdbChainCode.equals("NULL")) chainCodeStr=" ";
112    
113 duarte 207 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
114     BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
115 duarte 208 int linecount=0;
116 duarte 207 String line;
117 stehr 255 // read first line
118     if((line = fpdb.readLine()) != null ) {
119     linecount = 1;
120 duarte 219 // HEADER
121 stehr 255 p = Pattern.compile("^HEADER");
122     m = p.matcher(line);
123 duarte 207 if (m.find()){
124     Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
125     Matcher mh = ph.matcher(line);
126     if (mh.find()) {
127     pdbCode=mh.group(1).toLowerCase();
128     }
129 stehr 255 } else { // header not found
130     // check whether this is a Casp prediction file
131     p = Pattern.compile("^PFRMAT TS");
132     m = p.matcher(line);
133     if(m.find()) {
134     // ok, it is
135     pdbCode = "CASP";
136     } else {
137     // a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception
138     throw new PdbfileFormatError("The pdb file "+pdbfile+" doesn't seem to have the right format");
139     }
140 duarte 207 }
141 stehr 255 } else {
142     throw new PdbfileFormatError("The file "+pdbfile+" is empty.");
143     }
144     // read further lines
145     while ((line = fpdb.readLine() ) != null ) {
146     linecount++;
147 duarte 258 // SEQRES
148     //SEQRES 1 A 348 VAL ASN ILE LYS THR ASN PRO PHE LYS ALA VAL SER PHE
149     p = Pattern.compile("^SEQRES.{5}"+chainCodeStr);
150     m = p.matcher(line);
151     if (m.find()){
152     for (int i=19;i<=67;i+=4) {
153     if (!line.substring(i, i+3).equals(" ")) {
154 duarte 261 if (AA.threeletter2oneletter(line.substring(i, i+3))!=null) { // for non-standard aas
155     sequence+= AA.threeletter2oneletter(line.substring(i, i+3));
156     }
157 duarte 258 }
158     }
159     }
160 duarte 219 // SECONDARY STRUCTURE
161     // helix
162     //HELIX 1 1 LYS A 17 LEU A 26 1
163     // helix ser beg res ser end res ser
164     p = Pattern.compile("^HELIX..(...).{9}"+chainCodeStr+".(....).{6}"+chainCodeStr+".(....)");
165     m = p.matcher(line);
166     if (m.find()){
167     int serial = Integer.valueOf(m.group(1).trim());
168     int beg = Integer.valueOf(m.group(2).trim());
169     int end = Integer.valueOf(m.group(3).trim());
170     String ssId = "H"+serial;
171 duarte 222 secstruct2resinterval.put(ssId, new Interval(beg,end));
172 duarte 219 for (int i=beg;i<=end;i++){
173     if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
174 stehr 250 //System.err.println("Inconsistency in secondary structure assignment. " +
175     // "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
176 duarte 219 }
177     resser2secstruct.put(i,ssId);
178     }
179     }
180     // sheet
181     //SHEET 2 A 5 ILE A 96 THR A 99 -1 N LYS A 98 O THR A 107
182     // strand ser sheet id beg res ser end res ser
183     p = Pattern.compile("^SHEET..(...).(...).{7}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
184     m = p.matcher(line);
185     if (m.find()){
186     int strandSerial = Integer.valueOf(m.group(1).trim());
187     String sheetId = m.group(2).trim();
188     int beg = Integer.valueOf(m.group(3).trim());
189     int end = Integer.valueOf(m.group(4).trim());
190     String ssId = "S"+sheetId+strandSerial;
191 duarte 222 secstruct2resinterval.put(ssId, new Interval(beg,end));
192 duarte 219 for (int i=beg;i<=end;i++){
193     if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
194 stehr 250 //System.err.println("Inconsistency in secondary structure assignment. " +
195     // "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
196 duarte 219 }
197     resser2secstruct.put(i,ssId);
198     }
199     }
200     // we've stored the sec structure info in the strands2begEnd and sheets2strands maps.
201     // the assignment to resser2secstruct is done when we reach the ATOM lines, see below
202     // turn
203     //TURN 1 S1A GLY A 16 GLN A 18 SURFACE
204     // turn ser beg res ser end res ser
205     p = Pattern.compile("^TURN...(...).{9}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
206     m = p.matcher(line);
207     if (m.find()){
208     int serial = Integer.valueOf(m.group(1).trim());
209     int beg = Integer.valueOf(m.group(2).trim());
210     int end = Integer.valueOf(m.group(3).trim());
211     String ssId = "T"+serial;
212 duarte 222 secstruct2resinterval.put(ssId, new Interval(beg,end));
213 duarte 219 for (int i=beg;i<=end;i++){
214     if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
215 stehr 250 //System.err.println("Inconsistency in secondary structure assignment. " +
216     // "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
217 duarte 219 }
218     resser2secstruct.put(i,ssId);
219     }
220     }
221     // MODEL
222 duarte 207 p = Pattern.compile("^MODEL\\s+(\\d+)");
223     m = p.matcher(line);
224     if (m.find()){
225     thismodel=Integer.parseInt(m.group(1));
226     }
227     if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
228 duarte 219 // ATOM
229 duarte 207 p = Pattern.compile("^ATOM");
230     m = p.matcher(line);
231     if (m.find()){
232 duarte 208 // serial atom res_type chain res_ser x y z
233     Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
234 duarte 207 Matcher ml = pl.matcher(line);
235     if (ml.find()) {
236     empty=false;
237     int atomserial=Integer.parseInt(ml.group(1).trim());
238     String atom = ml.group(2).trim();
239     String res_type = ml.group(3).trim();
240     int res_serial = Integer.parseInt(ml.group(4).trim());
241     double x = Double.parseDouble(ml.group(5).trim());
242     double y = Double.parseDouble(ml.group(6).trim());
243     double z = Double.parseDouble(ml.group(7).trim());
244 duarte 226 Point3d coords = new Point3d(x,y,z);
245 duarte 207 ArrayList<String> aalist=AA.aas();
246     if (aalist.contains(res_type)) {
247     atomser2coord.put(atomserial, coords);
248     atomser2resser.put(atomserial, res_serial);
249     resser2restype.put(res_serial, res_type);
250     ArrayList<String> atomlist = aas2atoms.get(res_type);
251     if (atomlist.contains(atom)){
252     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
253     }
254     }
255     }
256     }
257     }
258     fpdb.close();
259 stehr 215 if (empty) {
260     //System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model);
261     throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model);
262     }
263 duarte 258 if (sequence.equals("")){
264     // if we couldn't read anything from SEQRES then we read it from the resser2restype HashMap
265     // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
266     ArrayList<Integer> ressers = new ArrayList<Integer>();
267     for (int resser:resser2restype.keySet()) {
268     ressers.add(resser);
269     }
270     Collections.sort(ressers);
271     for (int resser:ressers){
272     String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
273     sequence += oneletter;
274     }
275 duarte 207 }
276     }
277 duarte 219
278 duarte 207 }