ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/embed/BoundsSmoother.java
Revision: 823
Committed: Thu Jan 15 16:09:59 2009 UTC (15 years, 8 months ago) by duarte
File size: 18736 byte(s)
Log Message:
Now printing restrains viols and bounds viols.
Line User Rev File contents
1 duarte 796 package embed;
2    
3 duarte 797 import java.util.ArrayList;
4     import java.util.Collections;
5     import java.util.HashMap;
6 duarte 796 import java.util.Random;
7    
8     import org.apache.commons.collections15.Transformer;
9    
10     import Jama.Matrix;
11    
12     import edu.uci.ics.jung.algorithms.shortestpath.DijkstraDistance;
13 duarte 797 import edu.uci.ics.jung.algorithms.shortestpath.DijkstraShortestPath;
14 duarte 796 import edu.uci.ics.jung.graph.SparseGraph;
15     import edu.uci.ics.jung.graph.util.EdgeType;
16     import proteinstructure.AAinfo;
17    
18     /**
19     * Implementation of the bounds smoothing part of the EMBED algorithm of Crippen and Havel
20     * Given a sparse set of distance ranges between a set of atoms it finds distance bounds for
21     * all pairs of atoms, using the triangle inequality.
22     *
23     * Taken from "Distance Geometry: Theory, Algorithms, and Chemical Applications" (section 3.1) by T.F. Havel,
24     * in Encyclopedia of Computational Chemistry (Wiley, New York, 1998).
25 duarte 804 * See also:
26     * "Distance Geometry and Molecular Conformation" (Chapter 5) by G.M. Crippen and T.F. Havel (Wiley)
27     * "Sampling and efficiency of metric matrix distance geometry: A novel partial
28     * metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
29 duarte 796 *
30     * @author duarte
31     *
32     */
33     public class BoundsSmoother {
34    
35 duarte 799 private static final double HARD_SPHERES_BOUND = AAinfo.DIST_MIN_CA ;
36 duarte 800
37     private static final boolean DEBUG = false;
38 duarte 802 private static final long DEBUG_SEED = 123456;
39    
40 duarte 804 private static final int NUM_ROOTS_PARTIAL_METRIZATION = 4; // we choose 4 as in Kuszewski et al.
41 duarte 796
42     /*----------------- helper classes and transformers -----------------*/
43    
44 duarte 797 Transformer<SimpleEdge, Number> WeightTransformer = new Transformer<SimpleEdge, Number>() {
45     public Number transform(SimpleEdge input) {
46 duarte 796 return input.weight;
47     }
48     };
49    
50     private class SimpleEdge {
51     public double weight;
52     public SimpleEdge(double weight) {
53     this.weight = weight;
54     }
55     }
56 duarte 797
57     private class BoundsDigraphNode {
58     public static final boolean LEFT = true;
59     public static final boolean RIGHT = false;
60     private boolean side; // true: left, false: right
61     private int resSerial;
62     public BoundsDigraphNode(int resSerial, boolean side) {
63     this.resSerial = resSerial;
64     this.side = side;
65     }
66     public boolean isRight() {
67     return !side;
68     }
69     public boolean isLeft() {
70     return side;
71     }
72     public int getSerial() {
73     return resSerial;
74     }
75     public boolean equals(Object other) {
76     if (! (other instanceof BoundsDigraphNode)) return false;
77     BoundsDigraphNode otherNode = (BoundsDigraphNode)other;
78     if (otherNode.resSerial==this.resSerial && otherNode.side == this.side) {
79     return true;
80     } else {
81     return false;
82     }
83     }
84     public String toString() {
85     return resSerial+(side?"L":"R");
86     }
87     }
88 duarte 796
89     /*---------------------- member variables ----------------------------*/
90    
91     private int conformationSize;
92 duarte 797 private HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>> nodesBoundsDigraph; // map of serial/side to nodes in the bounds digraph
93     private double lmax; // maximum of the lower bounds: offset value for the boundsDigraph (not to have negative weights so that we can use Dijkstra's algo)
94 duarte 796
95 duarte 804 private Bound[][] bounds; // the bounds (half-)matrix with lower/upper bounds for all pairs (or with nulls for pairs without assigned bounds yet)
96    
97 duarte 802 private Random rand; // the random generator for sampleBounds and metrize
98    
99 duarte 796 /*------------------------ constructors ------------------------------*/
100    
101     /**
102 duarte 822 * Constructs a new BoundsSmoother object given a matrix of Bounds.
103 duarte 796 * @param graph
104     */
105 duarte 821 public BoundsSmoother(Bound[][] bounds) {
106     this.bounds = bounds;
107     this.conformationSize = bounds.length;
108 duarte 796 }
109    
110     /*----------------------- public methods ----------------------------*/
111    
112     /**
113     * Computes bounds for all pairs, based on the set of sparse distance ranges
114 duarte 804 * The returned array is a new array not the reference to the internal bounds array.
115 duarte 822 * @return a 2-dimensional array with the lower bounds for all pairs of residues, the
116     * indices of the array are guaranteed to be in the same order as the residue serials.
117 duarte 796 */
118     public Bound[][] getBoundsAllPairs() {
119 duarte 804 computeTriangleInequality();
120     return copyBounds(bounds); // we return a copy of the internal bounds array so that we can keep modifying it without side-effects to the returned reference
121 duarte 796 }
122 duarte 798
123 duarte 796 /**
124 duarte 821 * Gets a random sample from the internal bounds array of all pairs of distance ranges
125     * @return a symmetric metric matrix (both sides filled)
126     * @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
127 duarte 796 */
128 duarte 804 public Matrix sampleBounds() {
129 duarte 821 return sampleBounds(true);
130 duarte 796 }
131 duarte 821
132 duarte 803 /**
133 duarte 804 * Performs partial metrization for the internal bounds array.
134     * The internal bounds array is updated with the new bounds after metrization.
135     * The idea is that metrization doesn't need to be done for all atoms but only
136     * for a handful of them (called roots). This results in a much faster algorithm having
137     * almost the same sampling properties as full metrization.
138     * See "Sampling and efficiency of metric matrix distance geometry: A novel partial
139     * metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
140 duarte 803 * @return
141     */
142 duarte 804 public Matrix metrize() {
143 duarte 803
144 duarte 821 initSeed();
145    
146 duarte 802 ArrayList<Integer> roots = new ArrayList<Integer>();
147     for (int count=1;count<=NUM_ROOTS_PARTIAL_METRIZATION;count++) {
148     int root = rand.nextInt(conformationSize);
149 duarte 804 if (roots.contains(root)) {
150     root = rand.nextInt(conformationSize);
151     }
152     if (roots.contains(root)) {
153     System.err.println("Warning: repeated root atom while doing metrization: "+root);
154     }
155 duarte 803 if (DEBUG) System.out.println("Picked root: "+root);
156 duarte 802 roots.add(root);
157 duarte 804 sampleBoundForRoot(root); // this alters directly the input bounds array
158     computeTriangleInequality();
159 duarte 802 }
160    
161 duarte 803 // finally pick a value at random for all the other bounds
162 duarte 821 return sampleBounds(false);
163 duarte 802 }
164    
165 duarte 823 /**
166     * Gets a reference to the current bounds matrix
167     * @return
168     */
169     protected Bound[][] getBounds() {
170     return bounds;
171     }
172    
173 duarte 821 /*----------------------- private methods ---------------------------*/
174    
175 duarte 796 /**
176 duarte 821 * Initialises (or resets the random seed).
177     * If DEBUG flag is true the random seed will be a fixed value DEBUG_SEED
178 duarte 796 */
179 duarte 821 private void initSeed() {
180     if (DEBUG) {
181     rand = new Random(DEBUG_SEED);
182     } else {
183     rand = new Random();
184     }
185 duarte 796 }
186    
187 duarte 804 /**
188 duarte 821 * Gets a random sample from the internal bounds array of all pairs distance ranges
189     * @param initSeed if true the random seed is reinitialised, if false the existing one will be used
190     * @return a symmetric metric matrix (both sides filled)
191     * @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
192     */
193     private Matrix sampleBounds(boolean initSeed) {
194     if (initSeed) {
195     initSeed();
196     }
197     double[][] matrix = new double[conformationSize][conformationSize];
198     for (int i=0;i<conformationSize;i++) {
199     for (int j=0;j<conformationSize;j++) {
200     if (j>i) {
201     matrix[i][j]= bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
202     } else if (j<i) {
203     matrix[i][j]=matrix[j][i];
204     }
205     }
206     }
207     return new Matrix(matrix);
208     }
209    
210     /**
211 duarte 804 * For given root atom samples a value from the distance ranges of the root
212     * to all of its neighbours (updating the internal bounds array with the new sampled bounds)
213     * @param root
214     */
215     private void sampleBoundForRoot(int root) {
216 duarte 803 for (int neighb=0;neighb<conformationSize;neighb++) {
217     if (neighb==root) continue; // avoid the diagonal (which contains a null Bound)
218 duarte 802 int i = root;
219 duarte 803 int j = neighb;
220     if (neighb<root) {
221     i = neighb;
222 duarte 802 j = root;
223     }
224     double sampledValue = bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
225 duarte 803 bounds[i][j].lower = sampledValue;
226     bounds[i][j].upper = sampledValue;
227     if (DEBUG) System.out.print(bounds[i][j]);
228 duarte 802 }
229 duarte 803 if (DEBUG) System.out.println("\n");
230 duarte 802 }
231    
232 duarte 796 /**
233 duarte 804 * Computes bounds for all pairs through triangle inequalities from the bounds member variable
234     * containing a set of lower/upper bounds (sparse or full)
235     * The bounds array is modified to contain the new bounds.
236     * This is guaranteed to run in o(nmlog(n)), thus if the input is sparse then it's pretty fast: o(n2log(n)) but if
237     * the input is the full set of bounds then we have o(n3log(n)).
238 duarte 798 */
239 duarte 804 private void computeTriangleInequality() {
240 duarte 803 double MARGIN = 0.0001; // for comparing doubles we need some tolerance value
241 duarte 804
242     double[][] lowerBounds = getLowerBoundsAllPairs();
243     double[][] upperBounds = getUpperBoundsAllPairs();
244 duarte 802 for (int i=0;i<conformationSize;i++) {
245     for (int j=i+1;j<conformationSize;j++) {
246 duarte 798 double upperBound = upperBounds[i][j];
247     double lowerBound = lowerBounds[i][j];
248 duarte 804
249     if (bounds[i][j]!=null) {
250     if (lowerBound>upperBound+MARGIN) {
251     //System.err.println("old: "+sparseBounds[i][j]+" new: "+new Bound(lowerBound,upperBound));
252    
253     // During metrization sometimes a new upper bound is found that is below the new lower bound
254     // (actually in these cases the new lower bound coincides with the old one i.e. nothing new was
255     // found through triangle inequality for the lower bound).
256     // For some reason it doesn't happen the other way around: a new lower bound found that is
257     // above the new (coinciding with old) upper bound. I suppose this is because the triangle inequality
258     // "is a lot more effective at reducing the upper bounds than increasing the lower bounds" (quoting Havel)
259     // To correct this we set both lower and upper to the newly found upper, i.e. we assume that
260     // the new upper bound is better because is in accordance to the triangle inequality
261     lowerBound=upperBound;
262    
263     //if (upperBound<sparseBounds[i][j].upper-MARGIN)
264     // System.err.printf("new upper bound (%4.1f) for pair %3d %3d is smaller than old upper bound (%4.1f)\n",upperBound,i,j,sparseBounds[i][j].upper);
265     //if (lowerBound>sparseBounds[i][j].lower+MARGIN)
266     // System.err.printf("new lower bound (%4.1f) for pair %3d %3d is bigger than old lower bound (%4.1f)\n",lowerBound,i,j,sparseBounds[i][j].lower);
267     }
268    
269     bounds[i][j].lower = lowerBound;
270     bounds[i][j].upper = upperBound;
271     } else {
272     bounds[i][j] = new Bound(lowerBound,upperBound);
273 duarte 803 }
274 duarte 802 // sanity check: lower bounds can't be bigger than upper bounds!
275 duarte 803 if (lowerBound>upperBound+MARGIN) {
276 duarte 802 System.err.printf("Warning: lower bound (%4.1f) for pair "+i+" "+j+" is bigger than upper bound (%4.1f)\n",lowerBound,upperBound);
277 duarte 798 }
278     }
279     }
280     }
281    
282     /**
283 duarte 804 * Computes upper bounds for all pairs from a set of upper bounds (sparse or full) based
284 duarte 796 * on the triangle inequality.
285     * The computation is actually just an all pairs shortest path using Dijkstra's
286     * shortest path algorithm (as the set of distance coming from contact maps is
287     * very sparse this algorithm should be more efficient than Floyd's: Dijkstra's is
288     * o(nm logn) and Floyd's is o(n3))
289 duarte 822 * @return a 2-dimensional array with the lower bounds for all pairs of residues, the
290     * indices of the array are guaranteed to be in the same order as the residue serials.
291 duarte 796 */
292 duarte 804 private double[][] getUpperBoundsAllPairs() {
293 duarte 796 double[][] upperBoundsMatrix = new double[conformationSize][conformationSize];
294 duarte 804 SparseGraph<Integer,SimpleEdge> upperBoundGraph = convertBoundsMatrixToUpperBoundGraph();
295 duarte 796 DijkstraDistance<Integer, SimpleEdge> dd = new DijkstraDistance<Integer, SimpleEdge>(upperBoundGraph,WeightTransformer);
296 duarte 802
297     for (int i=0;i<conformationSize;i++) {
298     for (int j=i+1;j<conformationSize;j++) {
299     upperBoundsMatrix[i][j] = dd.getDistance(i, j).doubleValue();
300 duarte 796 }
301     }
302     return upperBoundsMatrix;
303     }
304    
305     /**
306 duarte 804 * Computes lower bounds for all pairs given a set of upper/lower bounds (sparse or full)
307     * based on the triangle inequality.
308 duarte 796 *
309 duarte 797 * NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative
310     * weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Because of this the boundsDigraph result
311 duarte 804 * of calling {@link #convertBoundsMatrixToBoundsDigraph()}} have values offset with the maximum lower bound (lmax).
312 duarte 797 * Thus after computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
313     *
314 duarte 822 * @return a 2-dimensional array with the lower bounds for all pairs of residues, the
315     * indices of the array are guaranteed to be in the same order as the residue serials.
316 duarte 804 * @see {@link #convertBoundsMatrixToBoundsDigraph()} and {@link #getUpperBoundsAllPairs()}
317 duarte 796 */
318 duarte 804 private double[][] getLowerBoundsAllPairs() {
319 duarte 797 double[][] lowerBoundsMatrix = new double[conformationSize][conformationSize];
320     // this is the bounds digraph as described by Crippen and Havel
321 duarte 804 SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = convertBoundsMatrixToBoundsDigraph();
322 duarte 797 DijkstraShortestPath<BoundsDigraphNode, SimpleEdge> dd = new DijkstraShortestPath<BoundsDigraphNode, SimpleEdge>(boundsDigraph,WeightTransformer);
323 duarte 802
324     for (int i=0;i<conformationSize;i++) {
325     for (int j=i+1;j<conformationSize;j++) {
326     int hops = dd.getPath(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)).size();
327     double lower = Math.abs(
328 duarte 797 (dd.getDistance(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
329 duarte 802 nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)
330     ).doubleValue())
331 duarte 797 - (hops*lmax)); // the lower limit for the triangle inequality is: Math.abs(shortestpath-(hops*lmax))
332 duarte 802 lowerBoundsMatrix[i][j] = Math.max(lower, HARD_SPHERES_BOUND); // we only set the new lower bound to the one found if is above the HARD_SPHERES_BOUND
333 duarte 796 }
334     }
335 duarte 797 return lowerBoundsMatrix;
336 duarte 796
337     }
338    
339     /**
340 duarte 802 * Converts the bounds matrix to a graph with only the upper bounds: nodes indices of bounds
341     * matrix, edges upper bounds (in SimpleEdge objects containing the upper bound value
342 duarte 796 * in their weight field).
343 duarte 802 * @param bounds
344 duarte 796 * @return
345     */
346 duarte 804 private SparseGraph<Integer,SimpleEdge> convertBoundsMatrixToUpperBoundGraph() {
347 duarte 796 SparseGraph<Integer,SimpleEdge> upperBoundGraph = new SparseGraph<Integer, SimpleEdge>();
348 duarte 802 for (int i=0;i<conformationSize;i++) {
349     for (int j=0;j<conformationSize;j++) {
350     if (bounds[i][j]!=null) {
351     upperBoundGraph.addEdge(new SimpleEdge(bounds[i][j].upper), i, j, EdgeType.UNDIRECTED);
352     }
353     }
354 duarte 796 }
355     return upperBoundGraph;
356     }
357 duarte 797
358 duarte 796 /**
359 duarte 797 * Constructs a bounds digraph (as described by Crippen and Havel) to compute the triangle inequality limits
360     * for the lower bounds.
361     * The graph is composed by 2 subgraphs (we call them left and right) each of them containing the set of all atoms.
362     * Within the subgraphs there is an undirected edge between atoms i,j with weight the upper bounds for i,j
363     * Between the subgraphs there is a directed edge from left to right between atoms i(left) to j(right)
364     * with weight the negative of the lower bound i,j
365     * NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative
366     * weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Thus we offset the values here to the maximum lower bound.
367     * After computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
368     * @return
369     */
370 duarte 804 private SparseGraph<BoundsDigraphNode,SimpleEdge> convertBoundsMatrixToBoundsDigraph() {
371 duarte 797 // to do the offset thing (see docs above) we need to know first of all the max lower bound
372     ArrayList<Double> lowerBounds = new ArrayList<Double>();
373 duarte 802 for (int i=0;i<conformationSize;i++) {
374     for (int j=0;j<conformationSize;j++) {
375     if (bounds[i][j]!=null) {
376     lowerBounds.add(bounds[i][j].lower);
377     }
378     }
379 duarte 797 }
380     lmax = Collections.max(lowerBounds); // this is the offset value
381    
382     SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = new SparseGraph<BoundsDigraphNode, SimpleEdge>();
383     // we have to store all nodes in a HashMap, so we can retrieve them by residue serial and side after
384     nodesBoundsDigraph = new HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>>();
385     nodesBoundsDigraph.put(BoundsDigraphNode.LEFT , new HashMap<Integer, BoundsDigraphNode>());
386     nodesBoundsDigraph.put(BoundsDigraphNode.RIGHT, new HashMap<Integer, BoundsDigraphNode>());
387     // first we create the nodes and store them into the HashMap
388 duarte 802 for (int i=0;i<conformationSize;i++) {
389 duarte 797 BoundsDigraphNode leftNode = new BoundsDigraphNode(i, BoundsDigraphNode.LEFT);
390     BoundsDigraphNode rightNode = new BoundsDigraphNode(i, BoundsDigraphNode.RIGHT);
391     boundsDigraph.addVertex(leftNode);
392     boundsDigraph.addVertex(rightNode);
393     nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).put(i,leftNode);
394     nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).put(i,rightNode);
395     }
396    
397 duarte 802 for (int i=0;i<conformationSize;i++) {
398     for (int j=0;j<conformationSize;j++) {
399     if (bounds[i][j]!=null) {
400     // first we add the upper bounds as undirected edges to the 2 subgraphs (left and right)
401     boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper),
402     nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
403     nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(j),
404     EdgeType.UNDIRECTED);
405     boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper),
406     nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(i),
407     nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j),
408     EdgeType.UNDIRECTED);
409     // then we add the negative of the lower bounds as directed edges connecting nodes of subgraph left to subgraph right
410     boundsDigraph.addEdge(new SimpleEdge(lmax-bounds[i][j].lower),
411     nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
412     nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j),
413     EdgeType.DIRECTED);
414     }
415     }
416 duarte 797 }
417     return boundsDigraph;
418     }
419    
420 duarte 821 protected void printBounds() {
421 duarte 804 printBounds(this.bounds);
422     }
423 duarte 802
424 duarte 804 /*------------------------ statics ------------------------------*/
425    
426 duarte 802 /**
427     * Deep copies given array of bounds
428     * @param bounds
429     * @return
430     */
431 duarte 821 protected static Bound[][] copyBounds(Bound[][] bounds) {
432 duarte 802 Bound[][] newBounds = new Bound[bounds.length][bounds.length];
433     for (int i=0;i<bounds.length;i++) {
434     for (int j=0;j<bounds[i].length;j++) {
435     if (bounds[i][j]!=null) {
436     newBounds[i][j] = new Bound(bounds[i][j].lower,bounds[i][j].upper);
437     }
438 duarte 799 }
439     }
440 duarte 802 return newBounds;
441 duarte 796 }
442 duarte 804
443 duarte 821 protected static void printBounds(Bound[][] bounds) {
444 duarte 802 for (int i=0;i<bounds.length;i++) {
445     for (int j=0;j<bounds[i].length;j++) {
446     if (bounds[i][j]==null) {
447     System.out.printf("%11s","null");
448     } else {
449     System.out.print(bounds[i][j]);
450     }
451     }
452     System.out.println();
453     }
454     System.out.println();
455     }
456    
457 duarte 796
458     }