1 |
import graphAveraging.GraphAverager; |
2 |
|
3 |
import java.io.*; |
4 |
import java.util.*; |
5 |
import proteinstructure.*; |
6 |
import tinker.TinkerError; |
7 |
import tinker.TinkerRunner; |
8 |
|
9 |
public class testGraphAverager { |
10 |
|
11 |
/** |
12 |
* @param args |
13 |
*/ |
14 |
public static void main(String[] args) { |
15 |
// Plan: |
16 |
// use getopt to parse the following parameters |
17 |
// - original structure (optional, for benchmarking) |
18 |
// - file with list of structures for averaging (required) |
19 |
// - contact type & distance cutoff (both required) |
20 |
// - consensus edge threshold (defaults to 0.5?) |
21 |
// - run tinker or not |
22 |
// - parameters for tinker model picking (optional) |
23 |
// - number of tinker models to output (only if run tinker is set, defaults to one) |
24 |
// - output benchmark result (graph comparison or structure comparison, only if original structure is specified) |
25 |
// - output file for resulting graph (optional) |
26 |
|
27 |
// take pdb files from command line (later: use getopt) |
28 |
// create trivial alignment (check alignment) |
29 |
// create graphAverager |
30 |
// create consensus graph |
31 |
// compare with graph for native structure (first argument) |
32 |
|
33 |
// do tinker reconstruction |
34 |
// compare with native structure |
35 |
|
36 |
// benchmarking: |
37 |
// - use score of best model vs. native (either graph based or structure based) |
38 |
// - use matlab to optimize input parameters |
39 |
// - create matlab executable for optimizing command line parameters: |
40 |
// * specify command line to be run and constant parameters |
41 |
// * specify parameter ranges to be optimized (discrete numerical, continuous numerical with steps, discrete set) |
42 |
// * specify regexp to parse result from output (stdout or file) |
43 |
// * specify optimization procedure (use default matlab ones, e.g. each parameter separate, genetic algorithm, simul. annealing, ...) |
44 |
// * specify convergence threshold or timeout |
45 |
// * run on cluster? |
46 |
|
47 |
// Input options: |
48 |
// - edgeType |
49 |
// - distCutoff |
50 |
// - targetPdb |
51 |
// - targetChainCode |
52 |
// - list of predictions files |
53 |
// - edge threshold |
54 |
// - min seq separation (for benchmarking and reconstruction) |
55 |
|
56 |
// Output options: |
57 |
// -c write consensus graph |
58 |
// -w write weighted average graph |
59 |
// -t write target graph |
60 |
// -p write result pdb file |
61 |
// -a write accuracy of result vs. native |
62 |
// -o write coverage of results vs. native |
63 |
// -r write rmsd of result vs. native |
64 |
// -b write table of benchmarking results |
65 |
// -v verbose output |
66 |
|
67 |
// read command line parameters |
68 |
if(args.length < 5) { |
69 |
System.out.println("Usage: testGraphAverager <targetPdbFile> <ChainCode> <ListOfPredictionFiles> <edgeThreshold> <minSeqSep>"); |
70 |
System.exit(1); |
71 |
} |
72 |
|
73 |
// constants |
74 |
String maxClusterExecutable = "/project/StruPPi/bin/maxcluster"; |
75 |
String TINKERBINDIR = "/project/StruPPi/Software/tinker/bin"; |
76 |
String PRMFILE = "/project/StruPPi/Software/tinker/amber/amber99.prm"; |
77 |
|
78 |
// input variables |
79 |
File targetFile = new File(args[0]); |
80 |
File listFile = new File(args[2]); |
81 |
String chainCode = args[1]; |
82 |
String thresholdStr = args[3]; |
83 |
String minSeqSepStr = args[4]; |
84 |
|
85 |
// parameters (should all become command line parameters eventually) |
86 |
String contactType = "Cb"; // edge type for graph generation (and reconstruction) |
87 |
double distCutoff = 8.0; // distance cutuff for graph generation (and reconstruction) |
88 |
double graphAveragingThreshold = Double.parseDouble(thresholdStr); // take edges which occur in at least this fraction of models |
89 |
int minSeqSep = Integer.parseInt(minSeqSepStr); // consider only edges with at least this sequence separation |
90 |
String outConsensusGraphFile = "consensusGraph.cm"; // name of consensus graph file |
91 |
String outAverageGraphFile = "averageGraph.cm"; // name of average graph file |
92 |
String outTargetGraphFile = "targetGraph.cm"; // name of target graph file |
93 |
String outStructFile = "predictedStructure.pdb"; // name of structure file |
94 |
int numberOfTinkerModels = 100; // number of models to be reconstructed by Tinker |
95 |
|
96 |
boolean verbose = false; // if false, only results will be written to stdout |
97 |
boolean outputConsensusGraph = false; // whether to write the resulting consensus graph to a file |
98 |
boolean outputAverageGraph = false; // whether to write the weighted average graph to a file |
99 |
boolean outputTargetGraph = false; // whether to write the target graph to a file |
100 |
boolean outputSingleLineResult = false; // whether to output graph measures of consensus graph |
101 |
boolean outputFullResultTable = false; // whether to output a table with graph measures for all inputs |
102 |
boolean doReconstruct = true; // whether to use tinker to create a 3D prediction |
103 |
boolean outputSingleLine3dResult = true; |
104 |
boolean outputPredictedStructure = false; // whether to write reconstructed structure to a file |
105 |
|
106 |
// local variables |
107 |
String targetFileName = targetFile.getAbsolutePath(); // target file name |
108 |
Pdb target = null; // target structure |
109 |
RIGraph targetGraph = null; // target graph |
110 |
Vector<String> templateFileNames = new Vector<String>(); // vector of template file names |
111 |
Vector<RIGraph> models = new Vector<RIGraph>(); // vector of template graphs |
112 |
RIGraph averageGraph; // weighted graph with fraction of occurrance for each edge |
113 |
RIGraph consensusGraph; // consensus graph after applying threshold |
114 |
|
115 |
if(!targetFile.canRead()) { |
116 |
System.err.println("Can not read from file " + targetFileName); |
117 |
System.exit(1); |
118 |
} |
119 |
if(!listFile.canRead()) { |
120 |
System.err.println("Can not read from file " + listFile.getAbsolutePath()); |
121 |
System.exit(1); |
122 |
} |
123 |
|
124 |
// read target graph |
125 |
if(verbose) System.out.println("Reading input files..."); |
126 |
try { |
127 |
target = new PdbfilePdb(targetFile.getAbsolutePath()); |
128 |
target.load(chainCode); |
129 |
targetGraph = target.get_graph(contactType, distCutoff); |
130 |
} catch(PdbLoadError e) { |
131 |
System.err.println("Error while trying to load pdb data from file " + targetFile.getAbsolutePath()+", specific error: "+e.getMessage()); |
132 |
System.exit(1); |
133 |
} |
134 |
|
135 |
// read list of predictions |
136 |
try { |
137 |
BufferedReader in = new BufferedReader(new FileReader(listFile)); |
138 |
String line; |
139 |
File file; |
140 |
Pdb pdb; |
141 |
RIGraph graph; |
142 |
while ((line = in.readLine() ) != null) { |
143 |
file = new File(line); |
144 |
if(!file.canRead()) { |
145 |
System.err.println("File " + line + " not found."); |
146 |
System.exit(1); |
147 |
} else { |
148 |
templateFileNames.add(line); |
149 |
try { |
150 |
pdb = new PdbfilePdb(file.getAbsolutePath()); |
151 |
pdb.load(Pdb.NULL_CHAIN_CODE); |
152 |
graph = pdb.get_graph(contactType, distCutoff); |
153 |
models.add(graph); |
154 |
if(verbose) System.out.print("."); |
155 |
} catch(PdbLoadError e) { |
156 |
System.err.println("Error while trying to load pdb data from file " + file.getAbsolutePath()+", specific error: "+e.getMessage()); |
157 |
System.exit(1); |
158 |
} |
159 |
} |
160 |
} |
161 |
in.close(); |
162 |
|
163 |
} catch(FileNotFoundException e) { |
164 |
System.err.println("File " + listFile.getAbsolutePath() + " not found."); |
165 |
System.exit(1); |
166 |
} catch(IOException e) { |
167 |
System.err.println("Error reading from file " + listFile.getAbsolutePath()); |
168 |
System.exit(1); |
169 |
} |
170 |
if(verbose) System.out.println(); |
171 |
|
172 |
// create trivial alignment and template graphs |
173 |
TreeMap<String,String> sequences = new TreeMap<String, String>(); |
174 |
TreeMap<String, RIGraph> templateGraphs = new TreeMap<String, RIGraph>(); |
175 |
sequences.put(targetFile.getAbsolutePath(), targetGraph.getSequence()); |
176 |
for(int i=0; i < models.size(); i++) { |
177 |
sequences.put(templateFileNames.get(i), models.get(i).getSequence()); |
178 |
templateGraphs.put(templateFileNames.get(i), models.get(i)); |
179 |
} |
180 |
Alignment al = null; |
181 |
try { |
182 |
al = new Alignment(sequences); |
183 |
} catch (AlignmentConstructionError e) { |
184 |
System.err.println("Could not create alignment: " + e.getMessage()); |
185 |
} |
186 |
|
187 |
// create GraphAverager |
188 |
GraphAverager grav = new GraphAverager(targetGraph.getSequence(), al, templateGraphs, targetFile.getAbsolutePath()); |
189 |
if(verbose) System.out.println("Calculating average..."); |
190 |
consensusGraph = grav.getConsensusGraph(graphAveragingThreshold); |
191 |
averageGraph = grav.getAverageGraph(); |
192 |
|
193 |
// compare consensus graph with target |
194 |
PredEval eval = consensusGraph.evaluatePrediction(targetGraph,minSeqSep); |
195 |
|
196 |
// generate result table (consensus vs. individual models) |
197 |
ArrayList<PredEval> evals = new ArrayList<PredEval>(); |
198 |
String targetTitle = "Average"; |
199 |
eval.title = targetTitle; |
200 |
evals.add(eval); |
201 |
for (int i = 0; i < models.size(); i++) { |
202 |
RIGraph g = models.get(i); |
203 |
String title = templateFileNames.get(i).split("/")[templateFileNames.get(i).split("/").length-1]; |
204 |
eval = g.evaluatePrediction(targetGraph,minSeqSep); |
205 |
eval.title = title; |
206 |
evals.add(eval); |
207 |
} |
208 |
// sort results |
209 |
Collections.sort(evals, new Comparator<PredEval>() { |
210 |
public int compare(PredEval e, PredEval e2) { |
211 |
return -((new Double(e.accuracy+e.coverage)).compareTo(new Double(e2.accuracy+e2.coverage))); |
212 |
} |
213 |
}); |
214 |
// find best model accuracy and coverage (excluding average graph) |
215 |
double bestAcc; |
216 |
double bestCov; |
217 |
if(evals.get(1).title!=targetTitle) { |
218 |
bestAcc = evals.get(1).accuracy; |
219 |
bestCov = evals.get(1).coverage; |
220 |
} else { |
221 |
bestAcc = evals.get(2).accuracy; |
222 |
bestCov = evals.get(2).coverage; |
223 |
} |
224 |
|
225 |
// print full result table |
226 |
if(outputFullResultTable) { |
227 |
// TODO: add gdt to native order by gdt (and number of models in header) |
228 |
System.out.printf("Results (ContactType=%s DistCutoff=%.1f AveragingThreshold=%s MinSeqSep=%d):\n", contactType, distCutoff, thresholdStr,minSeqSep); |
229 |
System.out.println(); |
230 |
System.out.print(" # "); PredEval.printHeaders(); |
231 |
int c = 1; |
232 |
for(PredEval e:evals) { |
233 |
if(e.title == targetTitle) System.out.println(); |
234 |
System.out.printf("%2d ",c++); |
235 |
e.printRow(); |
236 |
if(e.title == targetTitle) System.out.println(); |
237 |
} |
238 |
} |
239 |
|
240 |
// print accuracy and coverage of consensus graph and best model vs. target graph |
241 |
if(outputSingleLineResult) { |
242 |
System.out.printf("%.2f\t%.2f\t%.2f\t%.2f\n",eval.accuracy, eval.coverage, bestAcc, bestCov); // picked_gdt, best_pred_gdt, best_templ_gdt |
243 |
} |
244 |
|
245 |
// write resulting consensus graph to file |
246 |
if(outputConsensusGraph) { |
247 |
try { |
248 |
consensusGraph.write_graph_to_file(outConsensusGraphFile); |
249 |
} catch (IOException e) { |
250 |
// TODO Auto-generated catch block |
251 |
e.printStackTrace(); |
252 |
} |
253 |
} |
254 |
|
255 |
// write weighted average graph to file |
256 |
if(outputAverageGraph) { |
257 |
try { |
258 |
averageGraph.write_graph_to_file(outAverageGraphFile); |
259 |
} catch (IOException e) { |
260 |
// TODO Auto-generated catch block |
261 |
e.printStackTrace(); |
262 |
} |
263 |
} |
264 |
|
265 |
// write target graph to file |
266 |
if(outputTargetGraph) { |
267 |
try{ |
268 |
targetGraph.write_graph_to_file(outTargetGraphFile); |
269 |
|
270 |
} catch (IOException e) { |
271 |
// TODO Auto-generated catch block |
272 |
e.printStackTrace(); |
273 |
} |
274 |
} |
275 |
|
276 |
// run reconstruction for consensus graph |
277 |
if(doReconstruct) { |
278 |
|
279 |
RIGraph[] graphs = {consensusGraph}; |
280 |
String sequence = targetGraph.getSequence(); |
281 |
|
282 |
TinkerRunner tr = null; |
283 |
try { |
284 |
tr = new TinkerRunner(TINKERBINDIR, PRMFILE); |
285 |
} catch (FileNotFoundException e3) { |
286 |
System.err.println("Couldn't find tinker bin dir "+TINKERBINDIR+". Exiting"); |
287 |
System.exit(1); |
288 |
} |
289 |
|
290 |
try { |
291 |
Pdb resultPdb = tr.reconstruct(sequence, graphs, numberOfTinkerModels); |
292 |
if(outputPredictedStructure) { |
293 |
resultPdb.dump2pdbfile(outStructFile); |
294 |
System.err.println("Output of reconstruction written to " + outStructFile); |
295 |
} |
296 |
} catch (IOException e) { |
297 |
System.err.println("Error while running Tinker reconstruction: " + e.getMessage()); |
298 |
} catch (TinkerError e) { |
299 |
System.err.println("Error while running Tinker reconstruction: " + e.getMessage()); |
300 |
} |
301 |
|
302 |
// benchmark 3D prediction |
303 |
|
304 |
// compare all tinker models with real |
305 |
double[] tinkerModelGdts = null; |
306 |
try { |
307 |
tinkerModelGdts = tr.getGdtsToNative(targetFileName, maxClusterExecutable); |
308 |
} catch(IOException e) { |
309 |
System.err.println("Error while calculating GDT scores: " + e.getMessage()); |
310 |
} |
311 |
// find best gdt from all tinker models |
312 |
double bestGdt = 0; |
313 |
//int bestModel = 0; |
314 |
for (int i = 0; i < tinkerModelGdts.length; i++) { |
315 |
if(tinkerModelGdts[i] > bestGdt) { |
316 |
//bestModel = i; |
317 |
bestGdt = tinkerModelGdts[i]; |
318 |
} |
319 |
} |
320 |
// get gdt of picked tinker model (picked by least bound violations) |
321 |
int pickedIdx = tr.pickByLeastBoundViols(); |
322 |
double pickedGdt = tinkerModelGdts[pickedIdx-1]; |
323 |
|
324 |
// get template gdt scores |
325 |
//double[] templateGdtScores = new double[templateFileNames.size()]; |
326 |
double bestTemplateGdt = 0; |
327 |
MaxClusterRunner maxCluster = null; |
328 |
try { |
329 |
maxCluster = new MaxClusterRunner(maxClusterExecutable); |
330 |
for(String tpl:templateFileNames) { |
331 |
double gdtScore = maxCluster.calculatePairwiseScore(tpl, targetFileName,MaxClusterRunner.ScoreType.GDT); |
332 |
if(gdtScore > bestTemplateGdt) { |
333 |
bestTemplateGdt = gdtScore; |
334 |
} |
335 |
} |
336 |
} catch(IOException e) { |
337 |
System.err.println("Error running maxcluster:" + e.getMessage()); |
338 |
} |
339 |
if(outputSingleLine3dResult) { |
340 |
// row output: best template gdt, best predicted gdt, picked predicted gdt |
341 |
System.out.printf("%6.3f\t%6.3f\t%6.3f\n", bestGdt, pickedGdt, bestTemplateGdt); |
342 |
} |
343 |
|
344 |
} |
345 |
} |
346 |
} |