1 |
duarte |
123 |
package proteinstructure; |
2 |
|
|
|
3 |
|
|
import java.io.FileOutputStream; |
4 |
|
|
import java.io.PrintStream; |
5 |
|
|
import java.io.IOException; |
6 |
|
|
import java.util.HashMap; |
7 |
duarte |
216 |
import java.util.Locale; |
8 |
duarte |
123 |
import java.util.TreeMap; |
9 |
|
|
import java.util.ArrayList; |
10 |
|
|
import java.util.Collections; |
11 |
|
|
|
12 |
duarte |
226 |
import javax.vecmath.Point3d; |
13 |
|
|
import javax.vecmath.Point3i; |
14 |
|
|
|
15 |
duarte |
207 |
/** |
16 |
|
|
* A single chain pdb protein structure |
17 |
|
|
* |
18 |
|
|
* @author Jose Duarte |
19 |
|
|
* Class: Pdb |
20 |
|
|
* Package: proteinstructure |
21 |
|
|
*/ |
22 |
|
|
public abstract class Pdb { |
23 |
duarte |
123 |
|
24 |
duarte |
208 |
protected final static int DEFAULT_MODEL=1; // default model serial (NMR structures) |
25 |
|
|
public final static String NONSTANDARD_AA_LETTER="X"; // letter we assign to nonstandard aas to use in sequence |
26 |
duarte |
153 |
|
27 |
duarte |
208 |
protected HashMap<String,Integer> resser_atom2atomserial; // residue serial+atom name (separated by underscore) to atom serials |
28 |
|
|
protected HashMap<Integer,String> resser2restype; // residue serial to 3 letter residue type |
29 |
duarte |
226 |
protected HashMap<Integer,Point3d> atomser2coord; // atom serials to 3D coordinates |
30 |
duarte |
208 |
protected HashMap<Integer,Integer> atomser2resser; // atom serials to residue serials |
31 |
|
|
protected HashMap<String,Integer> pdbresser2resser; // pdb (author) residue serials (can include insetion codes so they are strings) to internal residue serials |
32 |
|
|
protected HashMap<Integer,String> resser2pdbresser; // internal residue serials to pdb (author) residue serials (can include insertion codes so they are strings) |
33 |
duarte |
190 |
|
34 |
duarte |
222 |
protected HashMap<Integer,String> resser2secstruct; // residue serials to secondary structure |
35 |
|
|
protected TreeMap<String,Interval> secstruct2resinterval;// secondary structure element to residue serial intervals |
36 |
duarte |
219 |
|
37 |
duarte |
208 |
protected HashMap<String,ArrayList<String>> aas2atoms = AA.getaas2atoms(); // contains atom names for each aminoacid |
38 |
duarte |
135 |
|
39 |
duarte |
208 |
protected String sequence; // full sequence as it appears in SEQRES field |
40 |
|
|
protected String pdbCode; |
41 |
|
|
// given "external" pdb chain code, i.e. the classic (author's) pdb code ("NULL" if it is blank in original pdb file) |
42 |
|
|
protected String pdbChainCode; |
43 |
|
|
// Our internal chain identifier: |
44 |
duarte |
135 |
// - in reading from pdbase or from msdsd it will be set to the internal chain id (asym_id field for pdbase, pchain_id for msdsd) |
45 |
duarte |
208 |
// - in reading from pdb file it coincides with pdbChainCode except for "NULL" where we use "A" |
46 |
duarte |
207 |
protected String chainCode; |
47 |
duarte |
208 |
protected int model; // the model serial for NMR structures |
48 |
duarte |
123 |
|
49 |
duarte |
208 |
protected String db; // the db from which we have taken the data (if applies) |
50 |
duarte |
153 |
|
51 |
duarte |
208 |
|
52 |
|
|
|
53 |
duarte |
133 |
|
54 |
duarte |
153 |
/** |
55 |
|
|
* Dumps coordinate data into a file in pdb format (ATOM lines only) |
56 |
duarte |
206 |
* The chain dumped is the value of the chainCode variable, i.e. our internal chain identifier for Pdb objects |
57 |
duarte |
153 |
* @param outfile |
58 |
|
|
* @throws IOException |
59 |
|
|
*/ |
60 |
duarte |
123 |
public void dump2pdbfile(String outfile) throws IOException { |
61 |
duarte |
130 |
TreeMap<Integer,Object[]> lines = new TreeMap<Integer,Object[]>(); |
62 |
duarte |
123 |
PrintStream Out = new PrintStream(new FileOutputStream(outfile)); |
63 |
duarte |
206 |
Out.println("HEADER Dumped from "+db+". pdb code="+pdbCode+", chain='"+chainCode+"'"); |
64 |
duarte |
123 |
for (String resser_atom:resser_atom2atomserial.keySet()){ |
65 |
|
|
int atomserial = resser_atom2atomserial.get(resser_atom); |
66 |
|
|
int res_serial = Integer.parseInt(resser_atom.split("_")[0]); |
67 |
|
|
String atom = resser_atom.split("_")[1]; |
68 |
|
|
String res_type = resser2restype.get(res_serial); |
69 |
duarte |
226 |
Point3d coords = atomser2coord.get(atomserial); |
70 |
|
|
Object[] fields = {atomserial, atom, res_type, chainCode, res_serial, coords.x, coords.y, coords.z}; |
71 |
duarte |
130 |
lines.put(atomserial, fields); |
72 |
duarte |
123 |
} |
73 |
duarte |
130 |
for (int atomserial:lines.keySet()){ |
74 |
duarte |
216 |
// Local.US is necessary, otherwise java prints the doubles locale-dependant (i.e. with ',' for some locales) |
75 |
|
|
Out.printf(Locale.US,"ATOM %5d %3s %3s %1s%4d %8.3f%8.3f%8.3f\n",lines.get(atomserial)); |
76 |
duarte |
130 |
} |
77 |
duarte |
123 |
Out.println("END"); |
78 |
|
|
Out.close(); |
79 |
|
|
} |
80 |
|
|
|
81 |
|
|
public void dumpseq(String seqfile) throws IOException { |
82 |
|
|
PrintStream Out = new PrintStream(new FileOutputStream(seqfile)); |
83 |
duarte |
206 |
Out.println(">"+pdbCode+"_"+pdbChainCode); |
84 |
duarte |
123 |
Out.println(sequence); |
85 |
|
|
Out.close(); |
86 |
|
|
} |
87 |
|
|
|
88 |
|
|
public int get_length(){ |
89 |
|
|
return resser2restype.size(); |
90 |
|
|
} |
91 |
|
|
|
92 |
duarte |
226 |
public TreeMap<Integer,Point3d> get_coords_for_ct(String ct) { |
93 |
|
|
TreeMap<Integer,Point3d> coords = new TreeMap<Integer,Point3d>(); |
94 |
duarte |
123 |
HashMap<String,String[]> restype2atoms = AA.ct2atoms(ct); |
95 |
|
|
for (int resser:resser2restype.keySet()){ |
96 |
|
|
String[] atoms = restype2atoms.get(resser2restype.get(resser)); |
97 |
|
|
for (String atom:atoms){ |
98 |
|
|
if (resser_atom2atomserial.containsKey(resser+"_"+atom)){ |
99 |
|
|
int atomser = resser_atom2atomserial.get(resser+"_"+atom); |
100 |
duarte |
226 |
Point3d coord = atomser2coord.get(atomser); |
101 |
duarte |
123 |
coords.put(atomser, coord); |
102 |
|
|
} |
103 |
|
|
else if (atom.equals("O") && resser_atom2atomserial.containsKey(resser+"_"+"OXT")){ |
104 |
|
|
int atomser = resser_atom2atomserial.get(resser+"_"+"OXT"); |
105 |
duarte |
226 |
Point3d coord = atomser2coord.get(atomser); |
106 |
duarte |
123 |
coords.put(atomser, coord); |
107 |
|
|
} |
108 |
|
|
else { |
109 |
|
|
System.err.println("Couldn't find "+atom+" atom for resser="+resser+". Continuing without that atom for this resser."); |
110 |
|
|
} |
111 |
|
|
} |
112 |
|
|
} |
113 |
|
|
return coords; |
114 |
|
|
} |
115 |
duarte |
229 |
|
116 |
|
|
/** |
117 |
|
|
* Returns the distance matrix as a HashMap with Contacts (residue serial pairs) as keys |
118 |
|
|
* It doesn't make sense to call this method for multi atom contact |
119 |
|
|
* types (for each residue serial pair there's more than 1 distance) |
120 |
|
|
* Thus before calling this we should check AA.isValidSingleAtomCT(ct) |
121 |
|
|
* @param ct the contact type |
122 |
|
|
* @return |
123 |
|
|
*/ |
124 |
duarte |
224 |
public HashMap<Contact, Double> calculate_dist_matrix(String ct){ |
125 |
duarte |
229 |
HashMap<Contact,Double> distMatrixAtoms = new HashMap<Contact,Double>(); |
126 |
duarte |
123 |
if (!ct.contains("/")){ |
127 |
duarte |
226 |
TreeMap<Integer,Point3d> coords = get_coords_for_ct(ct); |
128 |
duarte |
123 |
for (int i_atomser:coords.keySet()){ |
129 |
|
|
for (int j_atomser:coords.keySet()){ |
130 |
|
|
if (j_atomser>i_atomser) { |
131 |
|
|
Contact pair = new Contact(i_atomser,j_atomser); |
132 |
duarte |
229 |
distMatrixAtoms.put(pair, coords.get(i_atomser).distance(coords.get(j_atomser))); |
133 |
duarte |
123 |
} |
134 |
|
|
} |
135 |
|
|
} |
136 |
|
|
} else { |
137 |
|
|
String i_ct = ct.split("/")[0]; |
138 |
|
|
String j_ct = ct.split("/")[1]; |
139 |
duarte |
226 |
TreeMap<Integer,Point3d> i_coords = get_coords_for_ct(i_ct); |
140 |
|
|
TreeMap<Integer,Point3d> j_coords = get_coords_for_ct(j_ct); |
141 |
duarte |
123 |
for (int i_atomser:i_coords.keySet()){ |
142 |
|
|
for (int j_atomser:j_coords.keySet()){ |
143 |
|
|
if (j_atomser!=i_atomser){ |
144 |
|
|
Contact pair = new Contact(i_atomser,j_atomser); |
145 |
duarte |
229 |
distMatrixAtoms.put(pair, i_coords.get(i_atomser).distance(j_coords.get(j_atomser))); |
146 |
duarte |
123 |
} |
147 |
|
|
} |
148 |
|
|
} |
149 |
|
|
} |
150 |
duarte |
229 |
|
151 |
|
|
HashMap<Contact,Double> distMatrixRes = new HashMap<Contact, Double>(); |
152 |
|
|
for (Contact cont: distMatrixAtoms.keySet()){ |
153 |
|
|
int i_resser = get_resser_from_atomser(cont.i); |
154 |
|
|
int j_resser = get_resser_from_atomser(cont.j); |
155 |
|
|
distMatrixRes.put(new Contact(i_resser,j_resser), distMatrixAtoms.get(cont)); |
156 |
|
|
} |
157 |
|
|
|
158 |
|
|
return distMatrixRes; |
159 |
duarte |
123 |
} |
160 |
|
|
|
161 |
duarte |
129 |
/** |
162 |
duarte |
226 |
* Get the graph for given contact type and cutoff for this Pdb object. |
163 |
|
|
* Returns a Graph object with the contacts |
164 |
|
|
* We do geometric hashing for fast contact computation (without needing to calculate full distance matrix) |
165 |
|
|
* @param ct |
166 |
|
|
* @param cutoff |
167 |
|
|
* @return |
168 |
|
|
*/ |
169 |
|
|
@SuppressWarnings("unchecked") |
170 |
duarte |
228 |
public Graph get_graph(String ct, double cutoff){ |
171 |
|
|
TreeMap<Integer,Point3d> i_coords = null; |
172 |
|
|
TreeMap<Integer,Point3d> j_coords = null; |
173 |
|
|
boolean directed = false; |
174 |
|
|
if (!ct.contains("/")){ |
175 |
|
|
i_coords = get_coords_for_ct(ct); |
176 |
|
|
directed = false; |
177 |
|
|
} else { |
178 |
|
|
String i_ct = ct.split("/")[0]; |
179 |
|
|
String j_ct = ct.split("/")[1]; |
180 |
|
|
i_coords = get_coords_for_ct(i_ct); |
181 |
|
|
j_coords = get_coords_for_ct(j_ct); |
182 |
|
|
directed = true; |
183 |
|
|
} |
184 |
|
|
int[] i_atomserials = new int[i_coords.size()]; // map from matrix indices to atomserials |
185 |
|
|
int[] j_atomserials = null; |
186 |
duarte |
226 |
|
187 |
|
|
int SCALE=100; // i.e. we use units of hundredths of Amstrongs (thus cutoffs can be specified with a maximum precission of 0.01A) |
188 |
|
|
|
189 |
|
|
int boxSize = (int) Math.floor(cutoff*SCALE); |
190 |
|
|
|
191 |
|
|
HashMap<Point3i,Box> boxes = new HashMap<Point3i,Box>(); |
192 |
|
|
int i=0; |
193 |
duarte |
228 |
for (int i_atomser:i_coords.keySet()){ |
194 |
duarte |
226 |
//coordinates for atom serial atomser, we will use i as its identifier below |
195 |
duarte |
228 |
Point3d coord = i_coords.get(i_atomser); |
196 |
duarte |
226 |
int floorX = boxSize*((int)Math.floor(coord.x*SCALE/boxSize)); |
197 |
|
|
int floorY = boxSize*((int)Math.floor(coord.y*SCALE/boxSize)); |
198 |
|
|
int floorZ = boxSize*((int)Math.floor(coord.z*SCALE/boxSize)); |
199 |
|
|
Point3i floor = new Point3i(floorX,floorY,floorZ); |
200 |
|
|
if (boxes.containsKey(floor)){ |
201 |
|
|
// we put the coords for atom i in its corresponding box (identified by floor) |
202 |
duarte |
228 |
boxes.get(floor).put_i_Point(i, coord); |
203 |
|
|
if (!directed){ |
204 |
|
|
boxes.get(floor).put_j_Point(i, coord); |
205 |
|
|
} |
206 |
duarte |
226 |
} else { |
207 |
|
|
Box box = new Box(floor); |
208 |
duarte |
228 |
box.put_i_Point(i, coord); |
209 |
|
|
if (!directed){ |
210 |
|
|
box.put_j_Point(i, coord); |
211 |
|
|
} |
212 |
duarte |
226 |
boxes.put(floor,box); |
213 |
|
|
} |
214 |
duarte |
228 |
i_atomserials[i]=i_atomser; //as atomserials in coords were ordered (TreeMap) the new indexing will still be ordered |
215 |
duarte |
226 |
i++; |
216 |
|
|
} |
217 |
duarte |
228 |
int j=0; |
218 |
|
|
if (directed) { |
219 |
|
|
j_atomserials = new int[j_coords.size()]; |
220 |
|
|
for (int j_atomser:j_coords.keySet()){ |
221 |
|
|
//coordinates for atom serial atomser, we will use j as its identifier below |
222 |
|
|
Point3d coord = j_coords.get(j_atomser); |
223 |
|
|
int floorX = boxSize*((int)Math.floor(coord.x*SCALE/boxSize)); |
224 |
|
|
int floorY = boxSize*((int)Math.floor(coord.y*SCALE/boxSize)); |
225 |
|
|
int floorZ = boxSize*((int)Math.floor(coord.z*SCALE/boxSize)); |
226 |
|
|
Point3i floor = new Point3i(floorX,floorY,floorZ); |
227 |
|
|
if (boxes.containsKey(floor)){ |
228 |
|
|
// we put the coords for atom j in its corresponding box (identified by floor) |
229 |
|
|
boxes.get(floor).put_j_Point(j, coord); |
230 |
|
|
} else { |
231 |
|
|
Box box = new Box(floor); |
232 |
|
|
box.put_j_Point(j, coord); |
233 |
|
|
boxes.put(floor,box); |
234 |
|
|
} |
235 |
|
|
j_atomserials[j]=j_atomser; //as atomserials in coords were ordered (TreeMap) the new indexing will still be ordered |
236 |
|
|
j++; |
237 |
|
|
} |
238 |
|
|
} else { |
239 |
|
|
j_atomserials = i_atomserials; |
240 |
|
|
} |
241 |
|
|
|
242 |
duarte |
226 |
|
243 |
duarte |
228 |
double[][]distMatrix = new double[i_atomserials.length][j_atomserials.length]; |
244 |
duarte |
226 |
|
245 |
|
|
for (Point3i floor:boxes.keySet()){ // for each box |
246 |
|
|
// distances of points within this box |
247 |
duarte |
228 |
boxes.get(floor).getDistancesWithinBox(distMatrix,directed); |
248 |
duarte |
226 |
|
249 |
duarte |
228 |
//TODO should iterate only through half of the neighbours here |
250 |
duarte |
226 |
// distances of points from this box to all neighbouring boxes: 26 iterations (26 neighbouring boxes) |
251 |
|
|
for (int x=floor.x-boxSize;x<=floor.x+boxSize;x+=boxSize){ |
252 |
|
|
for (int y=floor.y-boxSize;y<=floor.y+boxSize;y+=boxSize){ |
253 |
|
|
for (int z=floor.z-boxSize;z<=floor.z+boxSize;z+=boxSize){ |
254 |
|
|
if (!((x==floor.x)&&(y==floor.y)&&(z==floor.z))) { // skip this box |
255 |
|
|
Point3i neighbor = new Point3i(x,y,z); |
256 |
|
|
if (boxes.containsKey(neighbor)){ |
257 |
duarte |
228 |
boxes.get(floor).getDistancesToNeighborBox(boxes.get(neighbor),distMatrix,directed); |
258 |
duarte |
226 |
} |
259 |
|
|
} |
260 |
|
|
} |
261 |
|
|
} |
262 |
|
|
} |
263 |
|
|
} |
264 |
|
|
|
265 |
duarte |
228 |
// getting the contacts (in residue serials) from the atom serials (partial) distance matrix |
266 |
duarte |
226 |
ContactList contacts = new ContactList(); |
267 |
|
|
for (i=0;i<distMatrix.length;i++){ |
268 |
duarte |
228 |
for (j=0;j<distMatrix[i].length;j++){ |
269 |
|
|
// the condition distMatrix[i][j]!=0.0 takes care of skipping several things: |
270 |
|
|
// - diagonal of the matrix in case of undirected |
271 |
|
|
// - lower half of matrix in case of undirected |
272 |
|
|
// - cells for which we didn't calculate a distance as the 2 points were not in same or neighbouring boxes (i.e. too far apart) |
273 |
|
|
if (distMatrix[i][j]!=0.0 && distMatrix[i][j]<=cutoff){ |
274 |
|
|
int i_resser = atomser2resser.get(i_atomserials[i]); |
275 |
|
|
int j_resser = atomser2resser.get(j_atomserials[j]); |
276 |
duarte |
226 |
Contact resser_pair = new Contact(i_resser,j_resser); |
277 |
duarte |
228 |
// for multi-atom models (BB, SC, ALL or BB/SC) we need to make sure that we don't have contacts from residue to itself or that we don't have duplicates |
278 |
duarte |
226 |
if (i_resser!=j_resser && (! contacts.contains(resser_pair))){ |
279 |
|
|
contacts.add(resser_pair); |
280 |
|
|
} |
281 |
|
|
} |
282 |
duarte |
228 |
|
283 |
duarte |
226 |
} |
284 |
|
|
} |
285 |
duarte |
228 |
|
286 |
|
|
// creating and returning the graph object |
287 |
duarte |
226 |
Collections.sort(contacts); |
288 |
|
|
TreeMap<Integer,String> nodes = new TreeMap<Integer,String>(); |
289 |
|
|
for (int resser:resser2restype.keySet()){ |
290 |
|
|
nodes.put(resser,resser2restype.get(resser)); |
291 |
|
|
} |
292 |
|
|
Graph graph = new Graph (contacts,nodes,sequence,cutoff,ct,pdbCode,chainCode,pdbChainCode); |
293 |
|
|
|
294 |
|
|
return graph; |
295 |
|
|
} |
296 |
|
|
|
297 |
duarte |
190 |
public int get_resser_from_pdbresser (String pdbresser){ |
298 |
|
|
return pdbresser2resser.get(pdbresser); |
299 |
|
|
} |
300 |
|
|
|
301 |
|
|
public String get_pdbresser_from_resser (int resser){ |
302 |
|
|
return resser2pdbresser.get(resser); |
303 |
|
|
} |
304 |
|
|
|
305 |
duarte |
200 |
public int get_resser_from_atomser(int atomser){ |
306 |
|
|
return atomser2resser.get(atomser); |
307 |
|
|
} |
308 |
duarte |
206 |
|
309 |
|
|
public String getChainCode(){ |
310 |
|
|
return this.chainCode; |
311 |
|
|
} |
312 |
|
|
|
313 |
|
|
public String getPdbChainCode(){ |
314 |
|
|
return this.pdbChainCode; |
315 |
|
|
} |
316 |
duarte |
219 |
|
317 |
|
|
public String getSecStructure(int resser){ |
318 |
|
|
return this.resser2secstruct.get(resser); |
319 |
|
|
} |
320 |
duarte |
222 |
|
321 |
|
|
public TreeMap<String,Interval> getAllSecStructElements(){ |
322 |
|
|
return secstruct2resinterval; |
323 |
|
|
} |
324 |
duarte |
123 |
} |