ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/PdbfilePdb.java
Revision: 222
Committed: Wed Jul 4 15:39:01 2007 UTC (17 years, 2 months ago) by duarte
File size: 10132 byte(s)
Log Message:
New secstruct2resinterval TreeMap to store secondary structure elements as a map of ss string ids to intervals
New class Interval
Line User Rev File contents
1 duarte 207 package proteinstructure;
2    
3     import java.io.BufferedReader;
4     import java.io.File;
5     import java.io.FileNotFoundException;
6     import java.io.FileReader;
7     import java.io.IOException;
8     import java.util.ArrayList;
9     import java.util.Collections;
10     import java.util.HashMap;
11 duarte 222 import java.util.TreeMap;
12 duarte 207 import java.util.regex.Matcher;
13     import java.util.regex.Pattern;
14    
15     public class PdbfilePdb extends Pdb {
16    
17 duarte 208 private static final String UNKNOWN_STRING ="Unknown";
18     private static final String NULL_chainCode = "A";
19    
20 duarte 207 private String pdbfile;
21    
22     /**
23     * Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL
24     * @param pdbfile
25     * @param pdbChainCode
26     * @throws FileNotFoundException
27     * @throws IOException
28 duarte 208 * @throws PdbfileFormatError
29 stehr 215 * @throws PdbChainCodeNotFoundError
30 duarte 207 */
31 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
32 duarte 207 this(pdbfile,pdbChainCode,DEFAULT_MODEL);
33     }
34    
35     /**
36     * Constructs Pdb object reading from pdb file given pdb chain code and model serial
37     * @param pdbfile
38     * @param pdbChainCode
39     * @param model_serial
40     * @throws FileNotFoundException
41     * @throws IOException
42 duarte 208 * @throws PdbfileFormatError
43 stehr 215 * @throws PdbChainCodeNotFoundError
44 duarte 207 */
45 stehr 215 public PdbfilePdb (String pdbfile, String pdbChainCode, int model_serial) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
46 duarte 207 this.pdbfile = pdbfile;
47     this.model=model_serial;
48 duarte 208 this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file
49 stehr 217 this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case
50 duarte 208 // we set chainCode to pdbChainCode except for case NULL where we use "A"
51     this.chainCode=pdbChainCode;
52     if (pdbChainCode.equals("NULL")) this.chainCode=NULL_chainCode;
53    
54     this.sequence=""; // we initialise it to empty string, then is set inread_pdb_data_from_file
55    
56 duarte 222 // we initialise the resser2secstruct and secstruct2resinterval Maps to empty, if no sec structure info is found then it remains empty
57 duarte 219 this.resser2secstruct = new HashMap<Integer, String>();
58 duarte 222 this.secstruct2resinterval = new TreeMap<String, Interval>();
59 duarte 219
60 duarte 207 read_pdb_data_from_file();
61 duarte 208
62     // when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know
63     //TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used
64     this.resser2pdbresser = new HashMap<Integer, String>();
65     this.pdbresser2resser = new HashMap<String, Integer>();
66     for (int resser:resser2restype.keySet()){
67     resser2pdbresser.put(resser, String.valueOf(resser));
68     pdbresser2resser.put(String.valueOf(resser), resser);
69     }
70 duarte 219
71 duarte 207 }
72    
73    
74    
75     /**
76 duarte 219 * To read the pdb data (atom coordinates, residue serials, atom serials) from file.
77     * chainCode gets set to same as pdbChainCode, except if input chain code NULL then chainCode will be 'A'
78 duarte 207 * pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found
79     * sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only)
80 duarte 219 * No insertion codes are parsed or taken into account at the moment. Thus files with
81     * insertion codes will be incorrectly read
82 duarte 207 * @param pdbfile
83     * @throws FileNotFoundException
84     * @throws IOException
85 duarte 208 * @throws PdbfileFormatError
86 stehr 215 * @throws PdbChainCodeNotFoundError
87 duarte 207 */
88 stehr 215 private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError {
89 duarte 207 resser_atom2atomserial = new HashMap<String,Integer>();
90     resser2restype = new HashMap<Integer,String>();
91     atomser2coord = new HashMap<Integer,Double[]>();
92     atomser2resser = new HashMap<Integer,Integer>();
93     boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model
94 duarte 208 // we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files)
95     String chainCodeStr=pdbChainCode;
96     if (pdbChainCode.equals("NULL")) chainCodeStr=" ";
97    
98 duarte 207 int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines
99     BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile)));
100 duarte 208 int linecount=0;
101 duarte 207 String line;
102     while ((line = fpdb.readLine() ) != null ) {
103 duarte 208 linecount++;
104 duarte 219 // HEADER
105 duarte 207 Pattern p = Pattern.compile("^HEADER");
106     Matcher m = p.matcher(line);
107     if (m.find()){
108     Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})");
109     Matcher mh = ph.matcher(line);
110     if (mh.find()) {
111     pdbCode=mh.group(1).toLowerCase();
112     }
113 duarte 208 } else if (linecount==1) { // header not found and we are in line 1
114     // a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception
115     throw new PdbfileFormatError("The pdb file "+pdbfile+" doesn't seem to have the right format");
116 duarte 207 }
117 duarte 219 // SECONDARY STRUCTURE
118     // helix
119     //HELIX 1 1 LYS A 17 LEU A 26 1
120     // helix ser beg res ser end res ser
121     p = Pattern.compile("^HELIX..(...).{9}"+chainCodeStr+".(....).{6}"+chainCodeStr+".(....)");
122     m = p.matcher(line);
123     if (m.find()){
124     int serial = Integer.valueOf(m.group(1).trim());
125     int beg = Integer.valueOf(m.group(2).trim());
126     int end = Integer.valueOf(m.group(3).trim());
127     String ssId = "H"+serial;
128 duarte 222 secstruct2resinterval.put(ssId, new Interval(beg,end));
129 duarte 219 for (int i=beg;i<=end;i++){
130     if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
131     System.err.println("Inconsistency in secondary structure assignment. " +
132     "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
133     }
134     resser2secstruct.put(i,ssId);
135     }
136     }
137     // sheet
138     //SHEET 2 A 5 ILE A 96 THR A 99 -1 N LYS A 98 O THR A 107
139     // strand ser sheet id beg res ser end res ser
140     p = Pattern.compile("^SHEET..(...).(...).{7}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
141     m = p.matcher(line);
142     if (m.find()){
143     int strandSerial = Integer.valueOf(m.group(1).trim());
144     String sheetId = m.group(2).trim();
145     int beg = Integer.valueOf(m.group(3).trim());
146     int end = Integer.valueOf(m.group(4).trim());
147     String ssId = "S"+sheetId+strandSerial;
148 duarte 222 secstruct2resinterval.put(ssId, new Interval(beg,end));
149 duarte 219 for (int i=beg;i<=end;i++){
150     if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
151     System.err.println("Inconsistency in secondary structure assignment. " +
152     "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
153     }
154     resser2secstruct.put(i,ssId);
155     }
156     }
157     // we've stored the sec structure info in the strands2begEnd and sheets2strands maps.
158     // the assignment to resser2secstruct is done when we reach the ATOM lines, see below
159     // turn
160     //TURN 1 S1A GLY A 16 GLN A 18 SURFACE
161     // turn ser beg res ser end res ser
162     p = Pattern.compile("^TURN...(...).{9}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)");
163     m = p.matcher(line);
164     if (m.find()){
165     int serial = Integer.valueOf(m.group(1).trim());
166     int beg = Integer.valueOf(m.group(2).trim());
167     int end = Integer.valueOf(m.group(3).trim());
168     String ssId = "T"+serial;
169 duarte 222 secstruct2resinterval.put(ssId, new Interval(beg,end));
170 duarte 219 for (int i=beg;i<=end;i++){
171     if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it
172     System.err.println("Inconsistency in secondary structure assignment. " +
173     "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId);
174     }
175     resser2secstruct.put(i,ssId);
176     }
177     }
178     // MODEL
179 duarte 207 p = Pattern.compile("^MODEL\\s+(\\d+)");
180     m = p.matcher(line);
181     if (m.find()){
182     thismodel=Integer.parseInt(m.group(1));
183     }
184     if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model
185 duarte 219 // ATOM
186 duarte 207 p = Pattern.compile("^ATOM");
187     m = p.matcher(line);
188     if (m.find()){
189 duarte 208 // serial atom res_type chain res_ser x y z
190     Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE);
191 duarte 207 Matcher ml = pl.matcher(line);
192     if (ml.find()) {
193     empty=false;
194     int atomserial=Integer.parseInt(ml.group(1).trim());
195     String atom = ml.group(2).trim();
196     String res_type = ml.group(3).trim();
197     int res_serial = Integer.parseInt(ml.group(4).trim());
198     double x = Double.parseDouble(ml.group(5).trim());
199     double y = Double.parseDouble(ml.group(6).trim());
200     double z = Double.parseDouble(ml.group(7).trim());
201     Double[] coords = {x, y, z};
202     ArrayList<String> aalist=AA.aas();
203     if (aalist.contains(res_type)) {
204     atomser2coord.put(atomserial, coords);
205     atomser2resser.put(atomserial, res_serial);
206     resser2restype.put(res_serial, res_type);
207     ArrayList<String> atomlist = aas2atoms.get(res_type);
208     if (atomlist.contains(atom)){
209     resser_atom2atomserial.put(res_serial+"_"+atom, atomserial);
210     }
211     }
212     }
213     }
214     }
215     fpdb.close();
216 stehr 215 if (empty) {
217     //System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model);
218     throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model);
219     }
220 duarte 207 // now we read the sequence from the resser2restype HashMap
221     // NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here!
222     ArrayList<Integer> ressers = new ArrayList<Integer>();
223     for (int resser:resser2restype.keySet()) {
224     ressers.add(resser);
225     }
226     Collections.sort(ressers);
227     for (int resser:ressers){
228     String oneletter = AA.threeletter2oneletter(resser2restype.get(resser));
229     sequence += oneletter;
230     }
231     }
232 duarte 219
233 duarte 207 }