ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/graphAveraging/GraphAverager.java
Revision: 537
Committed: Wed Feb 13 18:29:05 2008 UTC (16 years, 7 months ago) by duarte
File size: 13218 byte(s)
Log Message:
Fixed another bug in GraphAverager. Was still not working correctly: was putting edges twice in the final graph. 
Also now handling correctly graph averaging for directed or undirected graphs.
Line File contents
1 package graphAveraging;
2
3 import java.util.Date;
4 import java.util.HashMap;
5 import java.util.Iterator;
6 import java.util.TreeMap;
7
8 import proteinstructure.Alignment;
9 import proteinstructure.AlignmentConstructionError;
10 import proteinstructure.PairwiseSequenceAlignment;
11 import proteinstructure.RIGEdge;
12 import proteinstructure.RIGNode;
13 import proteinstructure.RIGraph;
14 import proteinstructure.PairwiseSequenceAlignment.PairwiseSequenceAlignmentException;
15
16 import edu.uci.ics.jung.graph.util.EdgeType;
17 import edu.uci.ics.jung.graph.util.Pair;
18
19
20 /**
21 * Provides methods for an ensemble of RIGs: Calculating a consensus graph,
22 * and consensus scores for individual members or the whole ensemble.
23 * @author stehr
24 * @date 2007-12-19
25 */
26 public class GraphAverager {
27
28 /*--------------------------- member variables --------------------------*/
29
30 private Alignment al;
31 private TreeMap<String,RIGraph> templateGraphs; // identified by tag-string
32 private String targetTag; // id of target sequence in alignment
33 private String sequence; // sequence of the final consensus graph
34 private String contactType; // contact type of the final consensus graph
35 private double distCutoff; // cutoff of the final consensus graph
36
37 private HashMap<Pair<Integer>,Integer> contactVotes;
38
39 /*----------------------------- constructors ----------------------------*/
40
41 /**
42 * TODO: Create a graph averager for an ensemble of RIGs.
43 */
44 // public GraphAverager(RIGEnsemble rigs) {
45 // - templateGraph = ensembleGraphs
46 // - targetSequence = ensembleSequence
47 // - targetTag = only used internally
48 // - contactType/cutoff /numTemplates: from ensemble
49 // }
50
51 /**
52 * Create a graph averager given an alignment of the target sequence to the template graphs.
53 * @param sequence the target sequence
54 * @param al a multiple alignment of the target sequence and the template graphs
55 * @param templateGraphs a collection of template graphs to be averaged
56 * @param targetTag the identifier of the target sequence in the alignment
57 */
58 public GraphAverager(String sequence, Alignment al, TreeMap<String,RIGraph> templateGraphs, String targetTag) {
59 this.al = al;
60 this.templateGraphs = templateGraphs;
61 this.targetTag = targetTag;
62 this.sequence = sequence;
63 RIGraph firstGraph = templateGraphs.get(templateGraphs.firstKey());
64 this.contactType = firstGraph.getContactType();
65 this.distCutoff = firstGraph.getCutoff();
66
67 checkSequences();
68 countVotes(); // does the averaging by counting the votes and putting them into contactVotes
69 }
70
71 /**
72 * Create a graph averager assuming that the target and template structures all have the same size.
73 * A trivial alignment will be internally created. Fails if sizes do not match.
74 * @param sequence the target sequence
75 * @param templateGraphs a collection of template graphs to be averaged
76 * @param targetTag the identifier of the target sequence in the alignment
77 */
78 public GraphAverager(String sequence, TreeMap<String,RIGraph> templateGraphs, String targetTag) {
79 this.templateGraphs = templateGraphs;
80 this.targetTag = targetTag;
81 this.sequence = sequence;
82 RIGraph firstGraph = templateGraphs.get(templateGraphs.firstKey());
83 this.contactType = firstGraph.getContactType();
84 this.distCutoff = firstGraph.getCutoff();
85
86 TreeMap<String,String> sequences = new TreeMap<String, String>();
87 sequences.put(Long.toString(new Date().getTime()), sequence); // a unique identifier
88 for(String id:templateGraphs.keySet()) {
89 sequences.put(id, templateGraphs.get(id).getSequence());
90 }
91
92 // create trivial alignment
93 try {
94 this.al = new Alignment(sequences);
95 } catch (AlignmentConstructionError e) {
96 System.err.println("Could not create alignment: " + e.getMessage());
97 }
98
99
100 checkSequences();
101 countVotes(); // does the averaging by counting the votes and putting them into contactVotes
102 }
103
104 /*---------------------------- private methods --------------------------*/
105
106 /**
107 * Checks that tags and sequences are consistent between this.al and this.templateGraphs and between this.al and this.graph/this.targetTag
108 *
109 */
110 private void checkSequences(){
111 if (!al.hasTag(targetTag)){
112 System.err.println("Alignment doesn't seem to contain the target sequence, check the FASTA tags");
113 //TODO throw exception
114 }
115 for (String tag:templateGraphs.keySet()){
116 if (!al.hasTag(tag)){
117 System.err.println("Alignment is missing template sequence "+tag+", check the FASTA tags");
118 // TODO throw exception
119 }
120 }
121 if (templateGraphs.size()!=al.getNumberOfSequences()-1){
122 System.err.println("Number of sequences in alignment is different from number of templates +1 ");
123 // TODO throw exception
124 }
125 if(!al.getSequenceNoGaps(targetTag).equals(this.sequence)) {
126 System.err.println("Target sequence in alignment does not match sequence in target graph");
127 System.err.println("Trying to align sequences of alignment vs graph: ");
128 try {
129 PairwiseSequenceAlignment alCheck = new PairwiseSequenceAlignment(this.sequence,al.getSequenceNoGaps(targetTag),"graph","alignment");
130 alCheck.printAlignment();
131 } catch (PairwiseSequenceAlignmentException e) {
132 System.err.println("Error while creating alignment check, can't display an alignment. The 2 sequences are: ");
133 System.err.println("graph: "+sequence);
134 System.err.println("alignment: "+al.getSequenceNoGaps(targetTag));
135 }
136 // TODO throw exception
137 }
138 for (String tag:templateGraphs.keySet()){
139 if(!al.getSequenceNoGaps(tag).equals(templateGraphs.get(tag).getSequence())) {
140 System.err.println("Sequence of template graph "+tag+" does not match sequence in alignment");
141 System.err.println("Trying to align sequences of alignment vs graph: ");
142 try {
143 PairwiseSequenceAlignment alCheck = new PairwiseSequenceAlignment(templateGraphs.get(tag).getSequence(),al.getSequenceNoGaps(tag),"graph","alignment");
144 alCheck.printAlignment();
145 } catch (PairwiseSequenceAlignmentException e) {
146 System.err.println("Error while creating alignment check, can't display an alignment. The 2 sequences are: ");
147 System.err.println("graph: "+templateGraphs.get(tag).getSequence());
148 System.err.println("alignment: "+al.getSequenceNoGaps(tag));
149 }
150 // TODO throw exception
151 }
152 }
153 }
154
155 /**
156 * Counts the votes for each possible alignment edge and puts all the votes in contactVotes Map
157 *
158 */
159 private void countVotes() {
160
161 contactVotes = new HashMap<Pair<Integer>, Integer>();
162
163 // we get the first graph in templates to see if they are directed or undirected
164 boolean directed = templateGraphs.get(templateGraphs.firstKey()).isDirected();
165
166 // we go through all positions in the alignment
167 for (int i=1; i<=al.getAlignmentLength(); i++){
168 for (int j=1; j<=al.getAlignmentLength(); j++) {
169
170 if (directed) {
171 // in directed case we only want to skip loop edges
172 if (i==j) continue;
173 } else {
174 // in undirected case we want to skip half of the matrix
175 // this way the final contactVotes Map will contain Pair<Integer> always with j>i
176 if (i>=j) continue;
177 }
178
179 int vote = 0;
180 // scanning all templates to see if they have this contact
181 for (String tag:templateGraphs.keySet()){
182 RIGraph thisGraph = templateGraphs.get(tag);
183 int iSeqIdx = al.al2seq(tag, i);
184 int jSeqIdx = al.al2seq(tag, j);
185
186 // if each of the ends map to a gap in this sequence we skip it
187 if ((iSeqIdx!=-1) && (jSeqIdx!=-1) && thisGraph.containsEdgeIJ(iSeqIdx, jSeqIdx)) {
188 RIGEdge thisGraphCont = thisGraph.getEdgeFromSerials(iSeqIdx, jSeqIdx);
189 Pair<RIGNode> pair = thisGraph.getEndpoints(thisGraphCont);
190 if (thisGraph.containsEdgeIJ(pair.getFirst().getResidueSerial(), pair.getSecond().getResidueSerial())) {
191 vote++;
192 }
193 }
194 }
195 // putting vote in contactVotes Map
196 if (vote>0){
197 contactVotes.put(new Pair<Integer>(i,j), vote);
198 }
199 }
200 }
201 }
202
203 /*---------------------------- public methods ---------------------------*/
204
205 /**
206 * Returns the number of templates.
207 * @return
208 */
209 public int getNumberOfTemplates() {
210 return this.templateGraphs.size();
211 }
212
213 /**
214 * Calculates the consensus graph from the set of template graphs. An edge is contained
215 * in the consensus graph if the fractions of template graphs it is contained in is above
216 * the given threshold.
217 * The output is a new RIGraph object created from the given sequence in the constructor
218 * to which we add the averaged edges.
219 * @param threshold the threshold above which an edge is taken to be a consensus edge
220 * @return the new graph
221 */
222 public RIGraph getConsensusGraph(double threshold) {
223
224 RIGraph graph = new RIGraph(this.sequence);
225 graph.setContactType(this.contactType);
226 graph.setCutoff(this.distCutoff);
227 int numTemplates = templateGraphs.size();
228
229 // if vote above threshold we take the contact for our target
230 int voteThreshold = (int) Math.ceil((double)numTemplates*threshold); // i.e. round up of 50%, 40% or 30% (depends on threshold given)
231 for (Pair<Integer> alignCont:contactVotes.keySet()){
232 if (contactVotes.get(alignCont)>=voteThreshold) {
233 int target_i_res = al.al2seq(targetTag,alignCont.getFirst());
234 int target_j_res = al.al2seq(targetTag,alignCont.getSecond());
235 if (target_i_res!=-1 && target_j_res!=-1) { // we can't add contacts that map to gaps!!
236 graph.addEdge(new RIGEdge(), graph.getNodeFromSerial(target_i_res), graph.getNodeFromSerial(target_j_res), EdgeType.UNDIRECTED);
237 }
238 }
239 }
240 return graph;
241 }
242
243 /**
244 * Returns a RIGraph containing the union of edges of the template graphs weighted by the fraction of occurance in the templates.
245 * The sequence of the graph is initalized to the sequence of the template.
246 * @return
247 */
248 public RIGraph getAverageGraph() {
249 RIGraph graph = new RIGraph(this.sequence);
250 graph.setContactType(this.contactType);
251 graph.setCutoff(this.distCutoff);
252 int numTemplates = templateGraphs.size();
253
254 for (Pair<Integer> alignCont:contactVotes.keySet()){
255 double weight = 1.0 * contactVotes.get(alignCont) / numTemplates;
256 int target_i_res = al.al2seq(targetTag,alignCont.getFirst());
257 int target_j_res = al.al2seq(targetTag,alignCont.getSecond());
258 if (target_i_res!=-1 && target_j_res!=-1) { // we can't add contacts that map to gaps!!
259 graph.addEdge(new RIGEdge(weight), graph.getNodeFromSerial(target_i_res), graph.getNodeFromSerial(target_j_res), EdgeType.UNDIRECTED);
260 }
261 }
262 return graph;
263 }
264
265 /**
266 * Calculates the consensus score for the graph g with tag t with respect to the current ensemble of graphs.
267 * g has to have the same size (i.e. number of nodes) as the graphs in the ensemble.
268 * This is done by summing over each edge e in g, the number of graphs in the ensemble which also
269 * contain edge e. This value can be normalized by the number of nodes and the size of the ensemble.
270 * The normalization makes the values comparable across different targets.
271 * This score is a rough estimate of model quality.
272 * @param g2
273 * @return the consensus score or -1 if something went wrong
274 */
275 public double getConsensusScore(String t, boolean normalizeByNumNodes, boolean normalizeByNumTemplates) {
276 double score = 0;
277 RIGraph g = templateGraphs.get(t);
278 if(g != null) {
279 for(RIGEdge e:g.getEdges()) {
280 // TODO: Use index mapping from alignment
281 Pair<RIGNode> n = g.getEndpoints(e);
282 Pair<Integer> i = new Pair<Integer>(n.getFirst().getResidueSerial(), n.getSecond().getResidueSerial());
283 int count = contactVotes.get(i);
284 score = score + count;
285 }
286 } else return -1;
287 if(normalizeByNumNodes) {
288 score = score / al.getAlignmentLength();
289 }
290 if(normalizeByNumTemplates) {
291 score = score / templateGraphs.size();
292 }
293 return score;
294 }
295
296 /**
297 * Return the consensus score for the whole ensemble (summed over all members).
298 * This is a rough estimate of target difficulty.
299 * @return
300 */
301 public double getEnsembleConsensusScore() {
302 double score = 0;
303 for(String tag:templateGraphs.keySet()) {
304 double s = getConsensusScore(tag, true, true);
305 if(s > 0) score = score + s;
306 }
307 return score;
308 }
309
310 /**
311 * Filters the graphs in this ensemble by the consensus score. All graphs
312 * whith a consensus score equal to or greater than the given threshold are kept,
313 * others are thrown away.
314 * @param minScore the filtering threshold
315 */
316 public void filterByConsensusScore(double minScore) {
317 // filter
318 Iterator<String> it = templateGraphs.keySet().iterator();
319 while(it.hasNext()) {
320 String tag = it.next();
321 double cs = getConsensusScore(tag, true, true);
322 if(cs < minScore) {
323 // delete from templateGraphs, keep in alignment (to avoid side effects)
324 it.remove();
325 }
326 }
327 // update contact votes
328 countVotes();
329 }
330
331 // Run graph averaging from the command line
332 public static void main(String[] args) {
333 // Take a list of pdb/cm files, calculate the average & evtl. reconstruct using Tinker
334 // 0. Create graphs (if necessary)
335 // 1. select models (using overall consensus score) [or do this outside?]
336 // 2. create consensus graph
337 // 3. reconstruct (if necessary)
338
339 // define getopt options
340 }
341 }