1 |
duarte |
123 |
package proteinstructure; |
2 |
|
|
|
3 |
duarte |
127 |
import java.io.BufferedReader; |
4 |
|
|
import java.io.File; |
5 |
duarte |
123 |
import java.io.FileOutputStream; |
6 |
duarte |
127 |
import java.io.FileReader; |
7 |
duarte |
123 |
import java.io.PrintStream; |
8 |
duarte |
127 |
import java.io.FileNotFoundException; |
9 |
duarte |
123 |
import java.io.IOException; |
10 |
|
|
import java.util.HashMap; |
11 |
|
|
import java.util.TreeMap; |
12 |
|
|
import java.util.ArrayList; |
13 |
|
|
import java.util.Collections; |
14 |
duarte |
127 |
import java.util.regex.Matcher; |
15 |
|
|
import java.util.regex.Pattern; |
16 |
duarte |
123 |
|
17 |
|
|
public class Pdb { |
18 |
|
|
|
19 |
|
|
HashMap<String,Integer> resser_atom2atomserial; |
20 |
|
|
HashMap<Integer,String> resser2restype; |
21 |
|
|
HashMap<Integer,Double[]> atomser2coord; |
22 |
|
|
HashMap<Integer,Integer> atomser2resser; |
23 |
|
|
HashMap<String,ArrayList<String>> aas2atoms = AA.getaas2atoms(); |
24 |
|
|
String sequence=""; |
25 |
|
|
String accode=""; |
26 |
|
|
String chaincode=""; |
27 |
|
|
String db; |
28 |
|
|
String chain; |
29 |
|
|
|
30 |
duarte |
124 |
public Pdb (String accode, String chaincode) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError { |
31 |
duarte |
123 |
this(accode,chaincode,PdbaseInfo.pdbaseDB); |
32 |
|
|
|
33 |
|
|
} |
34 |
|
|
|
35 |
duarte |
124 |
public Pdb (String accode, String chaincode, String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError { |
36 |
duarte |
123 |
this.accode=accode; |
37 |
|
|
this.chaincode=chaincode; |
38 |
|
|
this.db=db; |
39 |
|
|
this.chain=chaincode; // we initialise it to chaincode, in read_pdb_data_from_pdbase gets reset to the right internal chain id |
40 |
|
|
read_pdb_data_from_pdbase(db); |
41 |
|
|
} |
42 |
|
|
|
43 |
duarte |
127 |
public Pdb (String pdbfile) throws FileNotFoundException, IOException { |
44 |
|
|
this.accode=""; |
45 |
|
|
read_pdb_data_from_file(pdbfile); |
46 |
duarte |
123 |
} |
47 |
|
|
|
48 |
duarte |
124 |
public void read_pdb_data_from_pdbase(String db) throws PdbaseInconsistencyError, PdbaseAcCodeNotFoundError{ |
49 |
duarte |
123 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
50 |
|
|
resser2restype = new HashMap<Integer,String>(); |
51 |
|
|
atomser2coord = new HashMap<Integer,Double[]>(); |
52 |
|
|
atomser2resser = new HashMap<Integer,Integer>(); |
53 |
|
|
|
54 |
|
|
PdbaseInfo mypdbaseinfo = new PdbaseInfo(accode,chaincode,db); |
55 |
|
|
ArrayList<ArrayList> resultset = mypdbaseinfo.read_atomData(); |
56 |
|
|
sequence = mypdbaseinfo.read_seq(); |
57 |
|
|
mypdbaseinfo.close(); |
58 |
|
|
|
59 |
|
|
for (ArrayList result:resultset){ |
60 |
|
|
int atomserial = (Integer) result.get(0); |
61 |
|
|
String atom = (String) result.get(1); |
62 |
|
|
String res_type = (String) result.get(2); |
63 |
|
|
chain=(String) result.get(3); |
64 |
|
|
int res_serial = (Integer) result.get(4); |
65 |
|
|
double x = (Double) result.get(5); |
66 |
|
|
double y = (Double) result.get(6); |
67 |
|
|
double z = (Double) result.get(7); |
68 |
|
|
Double[] coords = {x, y, z}; |
69 |
|
|
ArrayList<String> aalist=AA.aas(); |
70 |
|
|
if (aalist.contains(res_type)) { |
71 |
|
|
atomser2coord.put(atomserial, coords); |
72 |
|
|
atomser2resser.put(atomserial, res_serial); |
73 |
|
|
resser2restype.put(res_serial, res_type); |
74 |
|
|
ArrayList<String> atomlist = aas2atoms.get(res_type); |
75 |
|
|
if (atomlist.contains(atom)){ |
76 |
|
|
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
77 |
|
|
} |
78 |
|
|
} |
79 |
|
|
} |
80 |
|
|
} |
81 |
|
|
|
82 |
duarte |
127 |
public void read_pdb_data_from_file(String pdbfile) throws FileNotFoundException, IOException{ |
83 |
|
|
resser_atom2atomserial = new HashMap<String,Integer>(); |
84 |
|
|
resser2restype = new HashMap<Integer,String>(); |
85 |
|
|
atomser2coord = new HashMap<Integer,Double[]>(); |
86 |
|
|
atomser2resser = new HashMap<Integer,Integer>(); |
87 |
|
|
|
88 |
|
|
BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile))); |
89 |
|
|
String line; |
90 |
|
|
while ((line = fpdb.readLine() ) != null ) { |
91 |
|
|
Pattern p = Pattern.compile("^ATOM"); |
92 |
|
|
Matcher m = p.matcher(line); |
93 |
|
|
if (m.find()){ |
94 |
|
|
Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{2}(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE); |
95 |
|
|
Matcher ml = pl.matcher(line); |
96 |
|
|
if (ml.find()) { |
97 |
|
|
int atomserial=Integer.parseInt(ml.group(1).trim()); |
98 |
|
|
String atom = ml.group(2).trim(); |
99 |
|
|
String res_type = ml.group(3).trim(); |
100 |
|
|
int res_serial = Integer.parseInt(ml.group(4).trim()); |
101 |
|
|
double x = Double.parseDouble(ml.group(5).trim()); |
102 |
|
|
double y = Double.parseDouble(ml.group(6).trim()); |
103 |
|
|
double z = Double.parseDouble(ml.group(7).trim()); |
104 |
|
|
Double[] coords = {x, y, z}; |
105 |
|
|
ArrayList<String> aalist=AA.aas(); |
106 |
|
|
if (aalist.contains(res_type)) { |
107 |
|
|
atomser2coord.put(atomserial, coords); |
108 |
|
|
atomser2resser.put(atomserial, res_serial); |
109 |
|
|
resser2restype.put(res_serial, res_type); |
110 |
|
|
ArrayList<String> atomlist = aas2atoms.get(res_type); |
111 |
|
|
if (atomlist.contains(atom)){ |
112 |
|
|
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
113 |
|
|
} |
114 |
|
|
} |
115 |
|
|
} |
116 |
|
|
} |
117 |
|
|
} |
118 |
|
|
fpdb.close(); |
119 |
|
|
// now we read the sequence from the resser2restype HashMap |
120 |
|
|
// NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here! |
121 |
|
|
ArrayList<Integer> ressers = new ArrayList<Integer>(); |
122 |
|
|
for (int resser:resser2restype.keySet()) { |
123 |
|
|
ressers.add(resser); |
124 |
|
|
} |
125 |
|
|
Collections.sort(ressers); |
126 |
|
|
sequence=""; |
127 |
|
|
for (int resser:ressers){ |
128 |
|
|
String oneletter = AA.threeletter2oneletter(resser2restype.get(resser)); |
129 |
|
|
sequence += oneletter; |
130 |
|
|
} |
131 |
|
|
// finally we set accode and chaincode to unknown |
132 |
|
|
//TODO: we should parse accode and chaincode from appropriate fields in pdb file, |
133 |
|
|
// problem: in case of a non-original pdb file there won't be accession code |
134 |
|
|
accode="?"; |
135 |
|
|
chaincode="?"; |
136 |
|
|
} |
137 |
|
|
|
138 |
duarte |
123 |
public void dump2pdbfile(String outfile) throws IOException { |
139 |
|
|
String chainstr=chain; |
140 |
|
|
if (chain.equals("NULL")){ |
141 |
|
|
chainstr="A"; |
142 |
|
|
} |
143 |
|
|
PrintStream Out = new PrintStream(new FileOutputStream(outfile)); |
144 |
|
|
Out.println("HEADER Dumped from "+db+". pdb accession code="+accode+", pdb chain code="+chaincode); |
145 |
|
|
for (String resser_atom:resser_atom2atomserial.keySet()){ |
146 |
|
|
int atomserial = resser_atom2atomserial.get(resser_atom); |
147 |
|
|
int res_serial = Integer.parseInt(resser_atom.split("_")[0]); |
148 |
|
|
String atom = resser_atom.split("_")[1]; |
149 |
|
|
String res_type = resser2restype.get(res_serial); |
150 |
|
|
Double[] coords = atomser2coord.get(atomserial); |
151 |
|
|
Object[] fields = {atomserial, atom, res_type, chainstr, res_serial, coords[0], coords[1], coords[2]}; |
152 |
duarte |
127 |
Out.printf("ATOM %5d %3s %3s %1s%4d %8.3f%8.3f%8.3f\n",fields); |
153 |
duarte |
123 |
} |
154 |
|
|
Out.println("END"); |
155 |
|
|
Out.close(); |
156 |
|
|
} |
157 |
|
|
|
158 |
|
|
public void dumpseq(String seqfile) throws IOException { |
159 |
|
|
PrintStream Out = new PrintStream(new FileOutputStream(seqfile)); |
160 |
|
|
Out.println(">"+accode+"_"+chaincode); |
161 |
|
|
Out.println(sequence); |
162 |
|
|
Out.close(); |
163 |
|
|
} |
164 |
|
|
|
165 |
|
|
public int get_length(){ |
166 |
|
|
return resser2restype.size(); |
167 |
|
|
} |
168 |
|
|
|
169 |
|
|
public HashMap<Integer,Double[]> get_coords_for_ct(String ct) { |
170 |
|
|
HashMap<Integer,Double[]> coords = new HashMap<Integer,Double[]>(); |
171 |
|
|
HashMap<String,String[]> restype2atoms = AA.ct2atoms(ct); |
172 |
|
|
for (int resser:resser2restype.keySet()){ |
173 |
|
|
String[] atoms = restype2atoms.get(resser2restype.get(resser)); |
174 |
|
|
for (String atom:atoms){ |
175 |
|
|
if (resser_atom2atomserial.containsKey(resser+"_"+atom)){ |
176 |
|
|
int atomser = resser_atom2atomserial.get(resser+"_"+atom); |
177 |
|
|
Double[] coord = atomser2coord.get(atomser); |
178 |
|
|
coords.put(atomser, coord); |
179 |
|
|
} |
180 |
|
|
else if (atom.equals("O") && resser_atom2atomserial.containsKey(resser+"_"+"OXT")){ |
181 |
|
|
int atomser = resser_atom2atomserial.get(resser+"_"+"OXT"); |
182 |
|
|
Double[] coord = atomser2coord.get(atomser); |
183 |
|
|
coords.put(atomser, coord); |
184 |
|
|
} |
185 |
|
|
else { |
186 |
|
|
System.err.println("Couldn't find "+atom+" atom for resser="+resser+". Continuing without that atom for this resser."); |
187 |
|
|
} |
188 |
|
|
} |
189 |
|
|
} |
190 |
|
|
return coords; |
191 |
|
|
} |
192 |
|
|
|
193 |
|
|
public TreeMap<Contact, Double> calculate_dist_matrix(String ct){ |
194 |
|
|
TreeMap<Contact,Double> dist_matrix = new TreeMap<Contact,Double>(); |
195 |
|
|
if (!ct.contains("/")){ |
196 |
|
|
HashMap<Integer,Double[]> coords = get_coords_for_ct(ct); |
197 |
|
|
for (int i_atomser:coords.keySet()){ |
198 |
|
|
for (int j_atomser:coords.keySet()){ |
199 |
|
|
if (j_atomser>i_atomser) { |
200 |
|
|
Contact pair = new Contact(i_atomser,j_atomser); |
201 |
|
|
dist_matrix.put(pair, distance(coords.get(i_atomser),coords.get(j_atomser))); |
202 |
|
|
} |
203 |
|
|
} |
204 |
|
|
} |
205 |
|
|
} else { |
206 |
|
|
String i_ct = ct.split("/")[0]; |
207 |
|
|
String j_ct = ct.split("/")[1]; |
208 |
|
|
HashMap<Integer,Double[]> i_coords = get_coords_for_ct(i_ct); |
209 |
|
|
HashMap<Integer,Double[]> j_coords = get_coords_for_ct(j_ct); |
210 |
|
|
for (int i_atomser:i_coords.keySet()){ |
211 |
|
|
for (int j_atomser:j_coords.keySet()){ |
212 |
|
|
if (j_atomser!=i_atomser){ |
213 |
|
|
Contact pair = new Contact(i_atomser,j_atomser); |
214 |
|
|
dist_matrix.put(pair, distance(i_coords.get(i_atomser),j_coords.get(j_atomser))); |
215 |
|
|
} |
216 |
|
|
} |
217 |
|
|
} |
218 |
|
|
} |
219 |
|
|
return dist_matrix; |
220 |
|
|
} |
221 |
|
|
|
222 |
duarte |
129 |
/** |
223 |
|
|
* Get the contacts for given contact type and cutoff for this Pdb object. |
224 |
|
|
* Returns a Graph object with the contacts |
225 |
|
|
* The graph is always directional, i.e. in crossed cases xx/yy: i corresponds to xx and j to yy |
226 |
|
|
* ct here can be single contact type (e.g. Ca, BB) or crossed (e.g. BB/SC or Ca/Cb) |
227 |
|
|
* @param ct |
228 |
|
|
* @param cutoff |
229 |
|
|
* @return |
230 |
|
|
*/ |
231 |
duarte |
123 |
public Graph get_graph(String ct, double cutoff){ |
232 |
|
|
TreeMap<Contact,Double> dist_matrix = calculate_dist_matrix(ct); |
233 |
|
|
ArrayList<Contact> contacts = new ArrayList<Contact>(); |
234 |
duarte |
129 |
// we loop here over all indices of dist_matrix, |
235 |
|
|
// we took care already that in symmetric cases (i.e. single contact type, not crossed) we have only one side of the matrix and |
236 |
|
|
// in asymmetrical cases (i.e. crossed cases) we have both sides of the matrix |
237 |
duarte |
123 |
for (Contact pair:dist_matrix.keySet()){ |
238 |
|
|
int i_atomser=pair.i; |
239 |
|
|
int j_atomser=pair.j; |
240 |
|
|
if (dist_matrix.get(pair)<=cutoff){ |
241 |
|
|
int i_resser = atomser2resser.get(i_atomser); |
242 |
|
|
int j_resser = atomser2resser.get(j_atomser); |
243 |
|
|
Contact resser_pair = new Contact(i_resser,j_resser); |
244 |
duarte |
129 |
// for multi-atom models (BB, SC, ALL or BB/SC) we need to make sure that we don't have contacts from residue to itself or that we don't have duplicates |
245 |
duarte |
123 |
if (i_resser!=j_resser && (! contacts.contains(resser_pair))){ |
246 |
|
|
contacts.add(resser_pair); |
247 |
|
|
} |
248 |
|
|
} |
249 |
|
|
} |
250 |
|
|
Collections.sort(contacts); |
251 |
duarte |
129 |
TreeMap<Integer,String> nodes = new TreeMap<Integer,String>(); |
252 |
|
|
for (int resser:resser2restype.keySet()){ |
253 |
|
|
nodes.put(resser,resser2restype.get(resser)); |
254 |
|
|
} |
255 |
|
|
// NOTE: we pass to the graph object the chain (internal pdbase id) not the pdb chain code! |
256 |
|
|
Graph graph = new Graph (contacts,nodes,sequence,cutoff,ct,accode,chain); |
257 |
duarte |
123 |
return graph; |
258 |
|
|
} |
259 |
|
|
|
260 |
|
|
public Double distance(Double[] coords1, Double[] coords2){ |
261 |
|
|
return Math.sqrt(Math.pow((coords1[0]-coords2[0]),2)+Math.pow((coords1[1]-coords2[1]),2)+Math.pow((coords1[2]-coords2[2]),2)); |
262 |
|
|
} |
263 |
|
|
} |