1 |
package graphAveraging; |
2 |
|
3 |
import java.util.Date; |
4 |
import java.util.HashMap; |
5 |
import java.util.Iterator; |
6 |
import java.util.TreeMap; |
7 |
|
8 |
import proteinstructure.Alignment; |
9 |
import proteinstructure.AlignmentConstructionError; |
10 |
import proteinstructure.PairwiseSequenceAlignment; |
11 |
import proteinstructure.RIGEdge; |
12 |
import proteinstructure.RIGNode; |
13 |
import proteinstructure.RIGraph; |
14 |
import proteinstructure.PairwiseSequenceAlignment.PairwiseSequenceAlignmentException; |
15 |
|
16 |
import edu.uci.ics.jung.graph.util.EdgeType; |
17 |
import edu.uci.ics.jung.graph.util.Pair; |
18 |
|
19 |
|
20 |
/** |
21 |
* Provides methods for an ensemble of RIGs: Calculating a consensus graph, |
22 |
* and consensus scores for individual members or the whole ensemble. |
23 |
* @author stehr |
24 |
* @date 2007-12-19 |
25 |
*/ |
26 |
public class GraphAverager { |
27 |
|
28 |
/*--------------------------- member variables --------------------------*/ |
29 |
|
30 |
private Alignment al; |
31 |
private TreeMap<String,RIGraph> templateGraphs; // identified by tag-string |
32 |
private String targetTag; // id of target sequence in alignment |
33 |
private String sequence; // sequence of the final consensus graph |
34 |
private String contactType; // contact type of the final consensus graph |
35 |
private double distCutoff; // cutoff of the final consensus graph |
36 |
|
37 |
private HashMap<Pair<Integer>,Integer> contactVotes; |
38 |
|
39 |
/*----------------------------- constructors ----------------------------*/ |
40 |
|
41 |
/** |
42 |
* TODO: Create a graph averager for an ensemble of RIGs. |
43 |
*/ |
44 |
// public GraphAverager(RIGEnsemble rigs) { |
45 |
// - templateGraph = ensembleGraphs |
46 |
// - targetSequence = ensembleSequence |
47 |
// - targetTag = only used internally |
48 |
// - contactType/cutoff /numTemplates: from ensemble |
49 |
// } |
50 |
|
51 |
/** |
52 |
* Create a graph averager given an alignment of the target sequence to the template graphs. |
53 |
* @param sequence the target sequence |
54 |
* @param al a multiple alignment of the target sequence and the template graphs |
55 |
* @param templateGraphs a collection of template graphs to be averaged |
56 |
* @param targetTag the identifier of the target sequence in the alignment |
57 |
*/ |
58 |
public GraphAverager(String sequence, Alignment al, TreeMap<String,RIGraph> templateGraphs, String targetTag) { |
59 |
this.al = al; |
60 |
this.templateGraphs = templateGraphs; |
61 |
this.targetTag = targetTag; |
62 |
this.sequence = sequence; |
63 |
RIGraph firstGraph = templateGraphs.get(templateGraphs.firstKey()); |
64 |
this.contactType = firstGraph.getContactType(); |
65 |
this.distCutoff = firstGraph.getCutoff(); |
66 |
|
67 |
checkSequences(); |
68 |
countVotes(); // does the averaging by counting the votes and putting them into contactVotes |
69 |
} |
70 |
|
71 |
/** |
72 |
* Create a graph averager assuming that the target and template structures all have the same size. |
73 |
* A trivial alignment will be internally created. Fails if sizes do not match. |
74 |
* @param sequence the target sequence |
75 |
* @param templateGraphs a collection of template graphs to be averaged |
76 |
* @param targetTag the identifier of the target sequence in the alignment |
77 |
*/ |
78 |
public GraphAverager(String sequence, TreeMap<String,RIGraph> templateGraphs, String targetTag) { |
79 |
this.templateGraphs = templateGraphs; |
80 |
this.targetTag = targetTag; |
81 |
this.sequence = sequence; |
82 |
RIGraph firstGraph = templateGraphs.get(templateGraphs.firstKey()); |
83 |
this.contactType = firstGraph.getContactType(); |
84 |
this.distCutoff = firstGraph.getCutoff(); |
85 |
|
86 |
TreeMap<String,String> sequences = new TreeMap<String, String>(); |
87 |
sequences.put(Long.toString(new Date().getTime()), sequence); // a unique identifier |
88 |
for(String id:templateGraphs.keySet()) { |
89 |
sequences.put(id, templateGraphs.get(id).getSequence()); |
90 |
} |
91 |
|
92 |
// create trivial alignment |
93 |
try { |
94 |
this.al = new Alignment(sequences); |
95 |
} catch (AlignmentConstructionError e) { |
96 |
System.err.println("Could not create alignment: " + e.getMessage()); |
97 |
} |
98 |
|
99 |
|
100 |
checkSequences(); |
101 |
countVotes(); // does the averaging by counting the votes and putting them into contactVotes |
102 |
} |
103 |
|
104 |
/*---------------------------- private methods --------------------------*/ |
105 |
|
106 |
/** |
107 |
* Checks that tags and sequences are consistent between this.al and this.templateGraphs and between this.al and this.graph/this.targetTag |
108 |
* |
109 |
*/ |
110 |
private void checkSequences(){ |
111 |
if (!al.hasTag(targetTag)){ |
112 |
System.err.println("Alignment doesn't seem to contain the target sequence, check the FASTA tags"); |
113 |
//TODO throw exception |
114 |
} |
115 |
for (String tag:templateGraphs.keySet()){ |
116 |
if (!al.hasTag(tag)){ |
117 |
System.err.println("Alignment is missing template sequence "+tag+", check the FASTA tags"); |
118 |
// TODO throw exception |
119 |
} |
120 |
} |
121 |
if (templateGraphs.size()!=al.getNumberOfSequences()-1){ |
122 |
System.err.println("Number of sequences in alignment is different from number of templates +1 "); |
123 |
// TODO throw exception |
124 |
} |
125 |
if(!al.getSequenceNoGaps(targetTag).equals(this.sequence)) { |
126 |
System.err.println("Target sequence in alignment does not match sequence in target graph"); |
127 |
System.err.println("Trying to align sequences of alignment vs graph: "); |
128 |
try { |
129 |
PairwiseSequenceAlignment alCheck = new PairwiseSequenceAlignment(this.sequence,al.getSequenceNoGaps(targetTag),"graph","alignment"); |
130 |
alCheck.printAlignment(); |
131 |
} catch (PairwiseSequenceAlignmentException e) { |
132 |
System.err.println("Error while creating alignment check, can't display an alignment. The 2 sequences are: "); |
133 |
System.err.println("graph: "+sequence); |
134 |
System.err.println("alignment: "+al.getSequenceNoGaps(targetTag)); |
135 |
} |
136 |
// TODO throw exception |
137 |
} |
138 |
for (String tag:templateGraphs.keySet()){ |
139 |
if(!al.getSequenceNoGaps(tag).equals(templateGraphs.get(tag).getSequence())) { |
140 |
System.err.println("Sequence of template graph "+tag+" does not match sequence in alignment"); |
141 |
System.err.println("Trying to align sequences of alignment vs graph: "); |
142 |
try { |
143 |
PairwiseSequenceAlignment alCheck = new PairwiseSequenceAlignment(templateGraphs.get(tag).getSequence(),al.getSequenceNoGaps(tag),"graph","alignment"); |
144 |
alCheck.printAlignment(); |
145 |
} catch (PairwiseSequenceAlignmentException e) { |
146 |
System.err.println("Error while creating alignment check, can't display an alignment. The 2 sequences are: "); |
147 |
System.err.println("graph: "+templateGraphs.get(tag).getSequence()); |
148 |
System.err.println("alignment: "+al.getSequenceNoGaps(tag)); |
149 |
} |
150 |
// TODO throw exception |
151 |
} |
152 |
} |
153 |
} |
154 |
|
155 |
/** |
156 |
* Counts the votes for each possible alignment edge and puts all the votes in contactVotes Map |
157 |
* |
158 |
*/ |
159 |
private void countVotes() { |
160 |
|
161 |
contactVotes = new HashMap<Pair<Integer>, Integer>(); |
162 |
|
163 |
// we get the first graph in templates to see if they are directed or undirected |
164 |
boolean directed = templateGraphs.get(templateGraphs.firstKey()).isDirected(); |
165 |
|
166 |
// we go through all positions in the alignment |
167 |
for (int i=1; i<=al.getAlignmentLength(); i++){ |
168 |
for (int j=1; j<=al.getAlignmentLength(); j++) { |
169 |
|
170 |
if (directed) { |
171 |
// in directed case we only want to skip loop edges |
172 |
if (i==j) continue; |
173 |
} else { |
174 |
// in undirected case we want to skip half of the matrix |
175 |
// this way the final contactVotes Map will contain Pair<Integer> always with j>i |
176 |
if (i>=j) continue; |
177 |
} |
178 |
|
179 |
int vote = 0; |
180 |
// scanning all templates to see if they have this contact |
181 |
for (String tag:templateGraphs.keySet()){ |
182 |
RIGraph thisGraph = templateGraphs.get(tag); |
183 |
int iSeqIdx = al.al2seq(tag, i); |
184 |
int jSeqIdx = al.al2seq(tag, j); |
185 |
|
186 |
// if each of the ends map to a gap in this sequence we skip it |
187 |
if ((iSeqIdx!=-1) && (jSeqIdx!=-1) && thisGraph.containsEdgeIJ(iSeqIdx, jSeqIdx)) { |
188 |
RIGEdge thisGraphCont = thisGraph.getEdgeFromSerials(iSeqIdx, jSeqIdx); |
189 |
Pair<RIGNode> pair = thisGraph.getEndpoints(thisGraphCont); |
190 |
if (thisGraph.containsEdgeIJ(pair.getFirst().getResidueSerial(), pair.getSecond().getResidueSerial())) { |
191 |
vote++; |
192 |
} |
193 |
} |
194 |
} |
195 |
// putting vote in contactVotes Map |
196 |
if (vote>0){ |
197 |
contactVotes.put(new Pair<Integer>(i,j), vote); |
198 |
} |
199 |
} |
200 |
} |
201 |
} |
202 |
|
203 |
/*---------------------------- public methods ---------------------------*/ |
204 |
|
205 |
/** |
206 |
* Returns the number of templates. |
207 |
* @return |
208 |
*/ |
209 |
public int getNumberOfTemplates() { |
210 |
return this.templateGraphs.size(); |
211 |
} |
212 |
|
213 |
/** |
214 |
* Calculates the consensus graph from the set of template graphs. An edge is contained |
215 |
* in the consensus graph if the fractions of template graphs it is contained in is above |
216 |
* the given threshold. |
217 |
* The output is a new RIGraph object created from the given sequence in the constructor |
218 |
* to which we add the averaged edges. |
219 |
* @param threshold the threshold above which an edge is taken to be a consensus edge |
220 |
* @return the new graph |
221 |
*/ |
222 |
public RIGraph getConsensusGraph(double threshold) { |
223 |
|
224 |
RIGraph graph = new RIGraph(this.sequence); |
225 |
graph.setContactType(this.contactType); |
226 |
graph.setCutoff(this.distCutoff); |
227 |
int numTemplates = templateGraphs.size(); |
228 |
|
229 |
// if vote above threshold we take the contact for our target |
230 |
int voteThreshold = (int) Math.ceil((double)numTemplates*threshold); // i.e. round up of 50%, 40% or 30% (depends on threshold given) |
231 |
for (Pair<Integer> alignCont:contactVotes.keySet()){ |
232 |
if (contactVotes.get(alignCont)>=voteThreshold) { |
233 |
int target_i_res = al.al2seq(targetTag,alignCont.getFirst()); |
234 |
int target_j_res = al.al2seq(targetTag,alignCont.getSecond()); |
235 |
if (target_i_res!=-1 && target_j_res!=-1) { // we can't add contacts that map to gaps!! |
236 |
graph.addEdge(new RIGEdge(), graph.getNodeFromSerial(target_i_res), graph.getNodeFromSerial(target_j_res), EdgeType.UNDIRECTED); |
237 |
} |
238 |
} |
239 |
} |
240 |
return graph; |
241 |
} |
242 |
|
243 |
/** |
244 |
* Returns a RIGraph containing the union of edges of the template graphs weighted by the fraction of occurance in the templates. |
245 |
* The sequence of the graph is initalized to the sequence of the template. |
246 |
* @return |
247 |
*/ |
248 |
public RIGraph getAverageGraph() { |
249 |
RIGraph graph = new RIGraph(this.sequence); |
250 |
graph.setContactType(this.contactType); |
251 |
graph.setCutoff(this.distCutoff); |
252 |
int numTemplates = templateGraphs.size(); |
253 |
|
254 |
for (Pair<Integer> alignCont:contactVotes.keySet()){ |
255 |
double weight = 1.0 * contactVotes.get(alignCont) / numTemplates; |
256 |
int target_i_res = al.al2seq(targetTag,alignCont.getFirst()); |
257 |
int target_j_res = al.al2seq(targetTag,alignCont.getSecond()); |
258 |
if (target_i_res!=-1 && target_j_res!=-1) { // we can't add contacts that map to gaps!! |
259 |
graph.addEdge(new RIGEdge(weight), graph.getNodeFromSerial(target_i_res), graph.getNodeFromSerial(target_j_res), EdgeType.UNDIRECTED); |
260 |
} |
261 |
} |
262 |
return graph; |
263 |
} |
264 |
|
265 |
/** |
266 |
* Calculates the consensus score for the graph g with tag t with respect to the current ensemble of graphs. |
267 |
* g has to have the same size (i.e. number of nodes) as the graphs in the ensemble. |
268 |
* This is done by summing over each edge e in g, the number of graphs in the ensemble which also |
269 |
* contain edge e. This value can be normalized by the number of nodes and the size of the ensemble. |
270 |
* The normalization makes the values comparable across different targets. |
271 |
* This score is a rough estimate of model quality. |
272 |
* @param g2 |
273 |
* @return the consensus score or -1 if something went wrong |
274 |
*/ |
275 |
public double getConsensusScore(String t, boolean normalizeByNumNodes, boolean normalizeByNumTemplates) { |
276 |
double score = 0; |
277 |
RIGraph g = templateGraphs.get(t); |
278 |
if(g != null) { |
279 |
for(RIGEdge e:g.getEdges()) { |
280 |
// TODO: Use index mapping from alignment |
281 |
Pair<RIGNode> n = g.getEndpoints(e); |
282 |
Pair<Integer> i = new Pair<Integer>(n.getFirst().getResidueSerial(), n.getSecond().getResidueSerial()); |
283 |
int count = contactVotes.get(i); |
284 |
score = score + count; |
285 |
} |
286 |
} else return -1; |
287 |
if(normalizeByNumNodes) { |
288 |
score = score / al.getAlignmentLength(); |
289 |
} |
290 |
if(normalizeByNumTemplates) { |
291 |
score = score / templateGraphs.size(); |
292 |
} |
293 |
return score; |
294 |
} |
295 |
|
296 |
/** |
297 |
* Return the consensus score for the whole ensemble (summed over all members). |
298 |
* This is a rough estimate of target difficulty. |
299 |
* @return |
300 |
*/ |
301 |
public double getEnsembleConsensusScore() { |
302 |
double score = 0; |
303 |
for(String tag:templateGraphs.keySet()) { |
304 |
double s = getConsensusScore(tag, true, true); |
305 |
if(s > 0) score = score + s; |
306 |
} |
307 |
return score; |
308 |
} |
309 |
|
310 |
/** |
311 |
* Filters the graphs in this ensemble by the consensus score. All graphs |
312 |
* whith a consensus score equal to or greater than the given threshold are kept, |
313 |
* others are thrown away. |
314 |
* @param minScore the filtering threshold |
315 |
*/ |
316 |
public void filterByConsensusScore(double minScore) { |
317 |
// filter |
318 |
Iterator<String> it = templateGraphs.keySet().iterator(); |
319 |
while(it.hasNext()) { |
320 |
String tag = it.next(); |
321 |
double cs = getConsensusScore(tag, true, true); |
322 |
if(cs < minScore) { |
323 |
// delete from templateGraphs, keep in alignment (to avoid side effects) |
324 |
it.remove(); |
325 |
} |
326 |
} |
327 |
// update contact votes |
328 |
countVotes(); |
329 |
} |
330 |
|
331 |
// Run graph averaging from the command line |
332 |
public static void main(String[] args) { |
333 |
// Take a list of pdb/cm files, calculate the average & evtl. reconstruct using Tinker |
334 |
// 0. Create graphs (if necessary) |
335 |
// 1. select models (using overall consensus score) [or do this outside?] |
336 |
// 2. create consensus graph |
337 |
// 3. reconstruct (if necessary) |
338 |
|
339 |
// define getopt options |
340 |
} |
341 |
} |