1 |
duarte |
207 |
package proteinstructure; |
2 |
|
|
|
3 |
|
|
import java.io.BufferedReader; |
4 |
|
|
import java.io.File; |
5 |
|
|
import java.io.FileNotFoundException; |
6 |
|
|
import java.io.FileReader; |
7 |
|
|
import java.io.IOException; |
8 |
|
|
import java.util.ArrayList; |
9 |
|
|
import java.util.Collections; |
10 |
|
|
import java.util.HashMap; |
11 |
duarte |
222 |
import java.util.TreeMap; |
12 |
duarte |
207 |
import java.util.regex.Matcher; |
13 |
|
|
import java.util.regex.Pattern; |
14 |
|
|
|
15 |
duarte |
226 |
import javax.vecmath.Point3d; |
16 |
|
|
|
17 |
duarte |
207 |
public class PdbfilePdb extends Pdb { |
18 |
|
|
|
19 |
duarte |
208 |
private static final String UNKNOWN_STRING ="Unknown"; |
20 |
|
|
private static final String NULL_chainCode = "A"; |
21 |
|
|
|
22 |
duarte |
207 |
private String pdbfile; |
23 |
|
|
|
24 |
|
|
/** |
25 |
|
|
* Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL |
26 |
|
|
* @param pdbfile |
27 |
|
|
* @param pdbChainCode |
28 |
|
|
* @throws FileNotFoundException |
29 |
|
|
* @throws IOException |
30 |
duarte |
208 |
* @throws PdbfileFormatError |
31 |
stehr |
215 |
* @throws PdbChainCodeNotFoundError |
32 |
duarte |
207 |
*/ |
33 |
stehr |
215 |
public PdbfilePdb (String pdbfile, String pdbChainCode) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError { |
34 |
duarte |
207 |
this(pdbfile,pdbChainCode,DEFAULT_MODEL); |
35 |
|
|
} |
36 |
|
|
|
37 |
|
|
/** |
38 |
|
|
* Constructs Pdb object reading from pdb file given pdb chain code and model serial |
39 |
|
|
* @param pdbfile |
40 |
|
|
* @param pdbChainCode |
41 |
|
|
* @param model_serial |
42 |
|
|
* @throws FileNotFoundException |
43 |
|
|
* @throws IOException |
44 |
duarte |
208 |
* @throws PdbfileFormatError |
45 |
stehr |
215 |
* @throws PdbChainCodeNotFoundError |
46 |
duarte |
207 |
*/ |
47 |
stehr |
215 |
public PdbfilePdb (String pdbfile, String pdbChainCode, int model_serial) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError { |
48 |
duarte |
207 |
this.pdbfile = pdbfile; |
49 |
|
|
this.model=model_serial; |
50 |
duarte |
208 |
this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file |
51 |
stehr |
217 |
this.pdbChainCode=pdbChainCode.toUpperCase(); // our convention: chain codes are upper case |
52 |
duarte |
208 |
// we set chainCode to pdbChainCode except for case NULL where we use "A" |
53 |
|
|
this.chainCode=pdbChainCode; |
54 |
|
|
if (pdbChainCode.equals("NULL")) this.chainCode=NULL_chainCode; |
55 |
|
|
|
56 |
|
|
this.sequence=""; // we initialise it to empty string, then is set inread_pdb_data_from_file |
57 |
|
|
|
58 |
duarte |
222 |
// we initialise the resser2secstruct and secstruct2resinterval Maps to empty, if no sec structure info is found then it remains empty |
59 |
duarte |
219 |
this.resser2secstruct = new HashMap<Integer, String>(); |
60 |
duarte |
222 |
this.secstruct2resinterval = new TreeMap<String, Interval>(); |
61 |
duarte |
219 |
|
62 |
duarte |
207 |
read_pdb_data_from_file(); |
63 |
duarte |
208 |
|
64 |
|
|
// when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know |
65 |
|
|
//TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used |
66 |
|
|
this.resser2pdbresser = new HashMap<Integer, String>(); |
67 |
|
|
this.pdbresser2resser = new HashMap<String, Integer>(); |
68 |
|
|
for (int resser:resser2restype.keySet()){ |
69 |
|
|
resser2pdbresser.put(resser, String.valueOf(resser)); |
70 |
|
|
pdbresser2resser.put(String.valueOf(resser), resser); |
71 |
|
|
} |
72 |
duarte |
219 |
|
73 |
duarte |
237 |
// initialising atomser2atom from resser_atom2atomserial |
74 |
|
|
atomser2atom = new HashMap<Integer, String>(); |
75 |
|
|
for (String resser_atom:resser_atom2atomserial.keySet()){ |
76 |
|
|
int atomserial = resser_atom2atomserial.get(resser_atom); |
77 |
|
|
String atom = resser_atom.split("_")[1]; |
78 |
|
|
atomser2atom.put(atomserial,atom); |
79 |
|
|
} |
80 |
duarte |
207 |
} |
81 |
|
|
|
82 |
|
|
|
83 |
|
|
|
84 |
|
|
/** |
85 |
duarte |
219 |
* To read the pdb data (atom coordinates, residue serials, atom serials) from file. |
86 |
|
|
* chainCode gets set to same as pdbChainCode, except if input chain code NULL then chainCode will be 'A' |
87 |
duarte |
207 |
* pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found |
88 |
|
|
* sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only) |
89 |
duarte |
219 |
* No insertion codes are parsed or taken into account at the moment. Thus files with |
90 |
|
|
* insertion codes will be incorrectly read |
91 |
duarte |
207 |
* @param pdbfile |
92 |
|
|
* @throws FileNotFoundException |
93 |
|
|
* @throws IOException |
94 |
duarte |
208 |
* @throws PdbfileFormatError |
95 |
stehr |
215 |
* @throws PdbChainCodeNotFoundError |
96 |
duarte |
207 |
*/ |
97 |
stehr |
215 |
private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError { |
98 |
duarte |
207 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
99 |
|
|
resser2restype = new HashMap<Integer,String>(); |
100 |
duarte |
226 |
atomser2coord = new HashMap<Integer,Point3d>(); |
101 |
duarte |
207 |
atomser2resser = new HashMap<Integer,Integer>(); |
102 |
stehr |
255 |
Pattern p; |
103 |
|
|
Matcher m; |
104 |
duarte |
207 |
boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model |
105 |
duarte |
208 |
// we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files) |
106 |
|
|
String chainCodeStr=pdbChainCode; |
107 |
|
|
if (pdbChainCode.equals("NULL")) chainCodeStr=" "; |
108 |
|
|
|
109 |
duarte |
207 |
int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines |
110 |
|
|
BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile))); |
111 |
duarte |
208 |
int linecount=0; |
112 |
duarte |
207 |
String line; |
113 |
stehr |
255 |
// read first line |
114 |
|
|
if((line = fpdb.readLine()) != null ) { |
115 |
|
|
linecount = 1; |
116 |
duarte |
219 |
// HEADER |
117 |
stehr |
255 |
p = Pattern.compile("^HEADER"); |
118 |
|
|
m = p.matcher(line); |
119 |
duarte |
207 |
if (m.find()){ |
120 |
|
|
Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})"); |
121 |
|
|
Matcher mh = ph.matcher(line); |
122 |
|
|
if (mh.find()) { |
123 |
|
|
pdbCode=mh.group(1).toLowerCase(); |
124 |
|
|
} |
125 |
stehr |
255 |
} else { // header not found |
126 |
|
|
// check whether this is a Casp prediction file |
127 |
|
|
p = Pattern.compile("^PFRMAT TS"); |
128 |
|
|
m = p.matcher(line); |
129 |
|
|
if(m.find()) { |
130 |
|
|
// ok, it is |
131 |
|
|
pdbCode = "CASP"; |
132 |
|
|
} else { |
133 |
|
|
// a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception |
134 |
|
|
throw new PdbfileFormatError("The pdb file "+pdbfile+" doesn't seem to have the right format"); |
135 |
|
|
} |
136 |
duarte |
207 |
} |
137 |
stehr |
255 |
} else { |
138 |
|
|
throw new PdbfileFormatError("The file "+pdbfile+" is empty."); |
139 |
|
|
} |
140 |
|
|
// read further lines |
141 |
|
|
while ((line = fpdb.readLine() ) != null ) { |
142 |
|
|
linecount++; |
143 |
duarte |
219 |
// SECONDARY STRUCTURE |
144 |
|
|
// helix |
145 |
|
|
//HELIX 1 1 LYS A 17 LEU A 26 1 |
146 |
|
|
// helix ser beg res ser end res ser |
147 |
|
|
p = Pattern.compile("^HELIX..(...).{9}"+chainCodeStr+".(....).{6}"+chainCodeStr+".(....)"); |
148 |
|
|
m = p.matcher(line); |
149 |
|
|
if (m.find()){ |
150 |
|
|
int serial = Integer.valueOf(m.group(1).trim()); |
151 |
|
|
int beg = Integer.valueOf(m.group(2).trim()); |
152 |
|
|
int end = Integer.valueOf(m.group(3).trim()); |
153 |
|
|
String ssId = "H"+serial; |
154 |
duarte |
222 |
secstruct2resinterval.put(ssId, new Interval(beg,end)); |
155 |
duarte |
219 |
for (int i=beg;i<=end;i++){ |
156 |
|
|
if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it |
157 |
stehr |
250 |
//System.err.println("Inconsistency in secondary structure assignment. " + |
158 |
|
|
// "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId); |
159 |
duarte |
219 |
} |
160 |
|
|
resser2secstruct.put(i,ssId); |
161 |
|
|
} |
162 |
|
|
} |
163 |
|
|
// sheet |
164 |
|
|
//SHEET 2 A 5 ILE A 96 THR A 99 -1 N LYS A 98 O THR A 107 |
165 |
|
|
// strand ser sheet id beg res ser end res ser |
166 |
|
|
p = Pattern.compile("^SHEET..(...).(...).{7}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)"); |
167 |
|
|
m = p.matcher(line); |
168 |
|
|
if (m.find()){ |
169 |
|
|
int strandSerial = Integer.valueOf(m.group(1).trim()); |
170 |
|
|
String sheetId = m.group(2).trim(); |
171 |
|
|
int beg = Integer.valueOf(m.group(3).trim()); |
172 |
|
|
int end = Integer.valueOf(m.group(4).trim()); |
173 |
|
|
String ssId = "S"+sheetId+strandSerial; |
174 |
duarte |
222 |
secstruct2resinterval.put(ssId, new Interval(beg,end)); |
175 |
duarte |
219 |
for (int i=beg;i<=end;i++){ |
176 |
|
|
if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it |
177 |
stehr |
250 |
//System.err.println("Inconsistency in secondary structure assignment. " + |
178 |
|
|
// "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId); |
179 |
duarte |
219 |
} |
180 |
|
|
resser2secstruct.put(i,ssId); |
181 |
|
|
} |
182 |
|
|
} |
183 |
|
|
// we've stored the sec structure info in the strands2begEnd and sheets2strands maps. |
184 |
|
|
// the assignment to resser2secstruct is done when we reach the ATOM lines, see below |
185 |
|
|
// turn |
186 |
|
|
//TURN 1 S1A GLY A 16 GLN A 18 SURFACE |
187 |
|
|
// turn ser beg res ser end res ser |
188 |
|
|
p = Pattern.compile("^TURN...(...).{9}"+chainCodeStr+"(....).{6}"+chainCodeStr+"(....)"); |
189 |
|
|
m = p.matcher(line); |
190 |
|
|
if (m.find()){ |
191 |
|
|
int serial = Integer.valueOf(m.group(1).trim()); |
192 |
|
|
int beg = Integer.valueOf(m.group(2).trim()); |
193 |
|
|
int end = Integer.valueOf(m.group(3).trim()); |
194 |
|
|
String ssId = "T"+serial; |
195 |
duarte |
222 |
secstruct2resinterval.put(ssId, new Interval(beg,end)); |
196 |
duarte |
219 |
for (int i=beg;i<=end;i++){ |
197 |
|
|
if (resser2secstruct.containsKey(i)){// if already assigned we print a warning and then assign it |
198 |
stehr |
250 |
//System.err.println("Inconsistency in secondary structure assignment. " + |
199 |
|
|
// "Residue "+i+" is getting reassigned from "+resser2secstruct.get(i)+" to "+ssId); |
200 |
duarte |
219 |
} |
201 |
|
|
resser2secstruct.put(i,ssId); |
202 |
|
|
} |
203 |
|
|
} |
204 |
|
|
// MODEL |
205 |
duarte |
207 |
p = Pattern.compile("^MODEL\\s+(\\d+)"); |
206 |
|
|
m = p.matcher(line); |
207 |
|
|
if (m.find()){ |
208 |
|
|
thismodel=Integer.parseInt(m.group(1)); |
209 |
|
|
} |
210 |
|
|
if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model |
211 |
duarte |
219 |
// ATOM |
212 |
duarte |
207 |
p = Pattern.compile("^ATOM"); |
213 |
|
|
m = p.matcher(line); |
214 |
|
|
if (m.find()){ |
215 |
duarte |
208 |
// serial atom res_type chain res_ser x y z |
216 |
|
|
Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE); |
217 |
duarte |
207 |
Matcher ml = pl.matcher(line); |
218 |
|
|
if (ml.find()) { |
219 |
|
|
empty=false; |
220 |
|
|
int atomserial=Integer.parseInt(ml.group(1).trim()); |
221 |
|
|
String atom = ml.group(2).trim(); |
222 |
|
|
String res_type = ml.group(3).trim(); |
223 |
|
|
int res_serial = Integer.parseInt(ml.group(4).trim()); |
224 |
|
|
double x = Double.parseDouble(ml.group(5).trim()); |
225 |
|
|
double y = Double.parseDouble(ml.group(6).trim()); |
226 |
|
|
double z = Double.parseDouble(ml.group(7).trim()); |
227 |
duarte |
226 |
Point3d coords = new Point3d(x,y,z); |
228 |
duarte |
207 |
ArrayList<String> aalist=AA.aas(); |
229 |
|
|
if (aalist.contains(res_type)) { |
230 |
|
|
atomser2coord.put(atomserial, coords); |
231 |
|
|
atomser2resser.put(atomserial, res_serial); |
232 |
|
|
resser2restype.put(res_serial, res_type); |
233 |
|
|
ArrayList<String> atomlist = aas2atoms.get(res_type); |
234 |
|
|
if (atomlist.contains(atom)){ |
235 |
|
|
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
236 |
|
|
} |
237 |
|
|
} |
238 |
|
|
} |
239 |
|
|
} |
240 |
|
|
} |
241 |
|
|
fpdb.close(); |
242 |
stehr |
215 |
if (empty) { |
243 |
|
|
//System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model); |
244 |
|
|
throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model); |
245 |
|
|
} |
246 |
duarte |
207 |
// now we read the sequence from the resser2restype HashMap |
247 |
|
|
// NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here! |
248 |
|
|
ArrayList<Integer> ressers = new ArrayList<Integer>(); |
249 |
|
|
for (int resser:resser2restype.keySet()) { |
250 |
|
|
ressers.add(resser); |
251 |
|
|
} |
252 |
|
|
Collections.sort(ressers); |
253 |
|
|
for (int resser:ressers){ |
254 |
|
|
String oneletter = AA.threeletter2oneletter(resser2restype.get(resser)); |
255 |
|
|
sequence += oneletter; |
256 |
|
|
} |
257 |
|
|
} |
258 |
duarte |
219 |
|
259 |
duarte |
207 |
} |