ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/embed/BoundsSmoother.java
Revision: 822
Committed: Wed Jan 14 16:08:19 2009 UTC (15 years, 8 months ago) by duarte
File size: 18606 byte(s)
Log Message:
Now Reconstructer can also take RIGraphs with "unobserved" residues (i.e. with missing nodes).
Model pdb files that Reconstructer creates now contain strictly only CA atoms.
Line User Rev File contents
1 duarte 796 package embed;
2    
3 duarte 797 import java.util.ArrayList;
4     import java.util.Collections;
5     import java.util.HashMap;
6 duarte 796 import java.util.Random;
7    
8     import org.apache.commons.collections15.Transformer;
9    
10     import Jama.Matrix;
11    
12     import edu.uci.ics.jung.algorithms.shortestpath.DijkstraDistance;
13 duarte 797 import edu.uci.ics.jung.algorithms.shortestpath.DijkstraShortestPath;
14 duarte 796 import edu.uci.ics.jung.graph.SparseGraph;
15     import edu.uci.ics.jung.graph.util.EdgeType;
16     import proteinstructure.AAinfo;
17    
18     /**
19     * Implementation of the bounds smoothing part of the EMBED algorithm of Crippen and Havel
20     * Given a sparse set of distance ranges between a set of atoms it finds distance bounds for
21     * all pairs of atoms, using the triangle inequality.
22     *
23     * Taken from "Distance Geometry: Theory, Algorithms, and Chemical Applications" (section 3.1) by T.F. Havel,
24     * in Encyclopedia of Computational Chemistry (Wiley, New York, 1998).
25 duarte 804 * See also:
26     * "Distance Geometry and Molecular Conformation" (Chapter 5) by G.M. Crippen and T.F. Havel (Wiley)
27     * "Sampling and efficiency of metric matrix distance geometry: A novel partial
28     * metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
29 duarte 796 *
30     * @author duarte
31     *
32     */
33     public class BoundsSmoother {
34    
35 duarte 799 private static final double HARD_SPHERES_BOUND = AAinfo.DIST_MIN_CA ;
36 duarte 800
37     private static final boolean DEBUG = false;
38 duarte 802 private static final long DEBUG_SEED = 123456;
39    
40 duarte 804 private static final int NUM_ROOTS_PARTIAL_METRIZATION = 4; // we choose 4 as in Kuszewski et al.
41 duarte 796
42     /*----------------- helper classes and transformers -----------------*/
43    
44 duarte 797 Transformer<SimpleEdge, Number> WeightTransformer = new Transformer<SimpleEdge, Number>() {
45     public Number transform(SimpleEdge input) {
46 duarte 796 return input.weight;
47     }
48     };
49    
50     private class SimpleEdge {
51     public double weight;
52     public SimpleEdge(double weight) {
53     this.weight = weight;
54     }
55     }
56 duarte 797
57     private class BoundsDigraphNode {
58     public static final boolean LEFT = true;
59     public static final boolean RIGHT = false;
60     private boolean side; // true: left, false: right
61     private int resSerial;
62     public BoundsDigraphNode(int resSerial, boolean side) {
63     this.resSerial = resSerial;
64     this.side = side;
65     }
66     public boolean isRight() {
67     return !side;
68     }
69     public boolean isLeft() {
70     return side;
71     }
72     public int getSerial() {
73     return resSerial;
74     }
75     public boolean equals(Object other) {
76     if (! (other instanceof BoundsDigraphNode)) return false;
77     BoundsDigraphNode otherNode = (BoundsDigraphNode)other;
78     if (otherNode.resSerial==this.resSerial && otherNode.side == this.side) {
79     return true;
80     } else {
81     return false;
82     }
83     }
84     public String toString() {
85     return resSerial+(side?"L":"R");
86     }
87     }
88 duarte 796
89     /*---------------------- member variables ----------------------------*/
90    
91     private int conformationSize;
92 duarte 797 private HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>> nodesBoundsDigraph; // map of serial/side to nodes in the bounds digraph
93     private double lmax; // maximum of the lower bounds: offset value for the boundsDigraph (not to have negative weights so that we can use Dijkstra's algo)
94 duarte 796
95 duarte 804 private Bound[][] bounds; // the bounds (half-)matrix with lower/upper bounds for all pairs (or with nulls for pairs without assigned bounds yet)
96    
97 duarte 802 private Random rand; // the random generator for sampleBounds and metrize
98    
99 duarte 796 /*------------------------ constructors ------------------------------*/
100    
101     /**
102 duarte 822 * Constructs a new BoundsSmoother object given a matrix of Bounds.
103 duarte 796 * @param graph
104     */
105 duarte 821 public BoundsSmoother(Bound[][] bounds) {
106     this.bounds = bounds;
107     this.conformationSize = bounds.length;
108 duarte 796 }
109    
110     /*----------------------- public methods ----------------------------*/
111    
112     /**
113     * Computes bounds for all pairs, based on the set of sparse distance ranges
114 duarte 804 * The returned array is a new array not the reference to the internal bounds array.
115 duarte 822 * @return a 2-dimensional array with the lower bounds for all pairs of residues, the
116     * indices of the array are guaranteed to be in the same order as the residue serials.
117 duarte 796 */
118     public Bound[][] getBoundsAllPairs() {
119 duarte 804 computeTriangleInequality();
120     return copyBounds(bounds); // we return a copy of the internal bounds array so that we can keep modifying it without side-effects to the returned reference
121 duarte 796 }
122 duarte 798
123 duarte 796 /**
124 duarte 821 * Gets a random sample from the internal bounds array of all pairs of distance ranges
125     * @return a symmetric metric matrix (both sides filled)
126     * @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
127 duarte 796 */
128 duarte 804 public Matrix sampleBounds() {
129 duarte 821 return sampleBounds(true);
130 duarte 796 }
131 duarte 821
132 duarte 803 /**
133 duarte 804 * Performs partial metrization for the internal bounds array.
134     * The internal bounds array is updated with the new bounds after metrization.
135     * The idea is that metrization doesn't need to be done for all atoms but only
136     * for a handful of them (called roots). This results in a much faster algorithm having
137     * almost the same sampling properties as full metrization.
138     * See "Sampling and efficiency of metric matrix distance geometry: A novel partial
139     * metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
140 duarte 803 * @return
141     */
142 duarte 804 public Matrix metrize() {
143 duarte 803
144 duarte 821 initSeed();
145    
146 duarte 802 ArrayList<Integer> roots = new ArrayList<Integer>();
147     for (int count=1;count<=NUM_ROOTS_PARTIAL_METRIZATION;count++) {
148     int root = rand.nextInt(conformationSize);
149 duarte 804 if (roots.contains(root)) {
150     root = rand.nextInt(conformationSize);
151     }
152     if (roots.contains(root)) {
153     System.err.println("Warning: repeated root atom while doing metrization: "+root);
154     }
155 duarte 803 if (DEBUG) System.out.println("Picked root: "+root);
156 duarte 802 roots.add(root);
157 duarte 804 sampleBoundForRoot(root); // this alters directly the input bounds array
158     computeTriangleInequality();
159 duarte 802 }
160    
161 duarte 803 // finally pick a value at random for all the other bounds
162 duarte 821 return sampleBounds(false);
163 duarte 802 }
164    
165 duarte 821 /*----------------------- private methods ---------------------------*/
166    
167 duarte 796 /**
168 duarte 821 * Initialises (or resets the random seed).
169     * If DEBUG flag is true the random seed will be a fixed value DEBUG_SEED
170 duarte 796 */
171 duarte 821 private void initSeed() {
172     if (DEBUG) {
173     rand = new Random(DEBUG_SEED);
174     } else {
175     rand = new Random();
176     }
177 duarte 796 }
178    
179 duarte 804 /**
180 duarte 821 * Gets a random sample from the internal bounds array of all pairs distance ranges
181     * @param initSeed if true the random seed is reinitialised, if false the existing one will be used
182     * @return a symmetric metric matrix (both sides filled)
183     * @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
184     */
185     private Matrix sampleBounds(boolean initSeed) {
186     if (initSeed) {
187     initSeed();
188     }
189     double[][] matrix = new double[conformationSize][conformationSize];
190     for (int i=0;i<conformationSize;i++) {
191     for (int j=0;j<conformationSize;j++) {
192     if (j>i) {
193     matrix[i][j]= bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
194     } else if (j<i) {
195     matrix[i][j]=matrix[j][i];
196     }
197     }
198     }
199     return new Matrix(matrix);
200     }
201    
202     /**
203 duarte 804 * For given root atom samples a value from the distance ranges of the root
204     * to all of its neighbours (updating the internal bounds array with the new sampled bounds)
205     * @param root
206     */
207     private void sampleBoundForRoot(int root) {
208 duarte 803 for (int neighb=0;neighb<conformationSize;neighb++) {
209     if (neighb==root) continue; // avoid the diagonal (which contains a null Bound)
210 duarte 802 int i = root;
211 duarte 803 int j = neighb;
212     if (neighb<root) {
213     i = neighb;
214 duarte 802 j = root;
215     }
216     double sampledValue = bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
217 duarte 803 bounds[i][j].lower = sampledValue;
218     bounds[i][j].upper = sampledValue;
219     if (DEBUG) System.out.print(bounds[i][j]);
220 duarte 802 }
221 duarte 803 if (DEBUG) System.out.println("\n");
222 duarte 802 }
223    
224 duarte 796 /**
225 duarte 804 * Computes bounds for all pairs through triangle inequalities from the bounds member variable
226     * containing a set of lower/upper bounds (sparse or full)
227     * The bounds array is modified to contain the new bounds.
228     * This is guaranteed to run in o(nmlog(n)), thus if the input is sparse then it's pretty fast: o(n2log(n)) but if
229     * the input is the full set of bounds then we have o(n3log(n)).
230 duarte 798 */
231 duarte 804 private void computeTriangleInequality() {
232 duarte 803 double MARGIN = 0.0001; // for comparing doubles we need some tolerance value
233 duarte 804
234     double[][] lowerBounds = getLowerBoundsAllPairs();
235     double[][] upperBounds = getUpperBoundsAllPairs();
236 duarte 802 for (int i=0;i<conformationSize;i++) {
237     for (int j=i+1;j<conformationSize;j++) {
238 duarte 798 double upperBound = upperBounds[i][j];
239     double lowerBound = lowerBounds[i][j];
240 duarte 804
241     if (bounds[i][j]!=null) {
242     if (lowerBound>upperBound+MARGIN) {
243     //System.err.println("old: "+sparseBounds[i][j]+" new: "+new Bound(lowerBound,upperBound));
244    
245     // During metrization sometimes a new upper bound is found that is below the new lower bound
246     // (actually in these cases the new lower bound coincides with the old one i.e. nothing new was
247     // found through triangle inequality for the lower bound).
248     // For some reason it doesn't happen the other way around: a new lower bound found that is
249     // above the new (coinciding with old) upper bound. I suppose this is because the triangle inequality
250     // "is a lot more effective at reducing the upper bounds than increasing the lower bounds" (quoting Havel)
251     // To correct this we set both lower and upper to the newly found upper, i.e. we assume that
252     // the new upper bound is better because is in accordance to the triangle inequality
253     lowerBound=upperBound;
254    
255     //if (upperBound<sparseBounds[i][j].upper-MARGIN)
256     // System.err.printf("new upper bound (%4.1f) for pair %3d %3d is smaller than old upper bound (%4.1f)\n",upperBound,i,j,sparseBounds[i][j].upper);
257     //if (lowerBound>sparseBounds[i][j].lower+MARGIN)
258     // System.err.printf("new lower bound (%4.1f) for pair %3d %3d is bigger than old lower bound (%4.1f)\n",lowerBound,i,j,sparseBounds[i][j].lower);
259     }
260    
261     bounds[i][j].lower = lowerBound;
262     bounds[i][j].upper = upperBound;
263     } else {
264     bounds[i][j] = new Bound(lowerBound,upperBound);
265 duarte 803 }
266 duarte 802 // sanity check: lower bounds can't be bigger than upper bounds!
267 duarte 803 if (lowerBound>upperBound+MARGIN) {
268 duarte 802 System.err.printf("Warning: lower bound (%4.1f) for pair "+i+" "+j+" is bigger than upper bound (%4.1f)\n",lowerBound,upperBound);
269 duarte 798 }
270     }
271     }
272     }
273    
274     /**
275 duarte 804 * Computes upper bounds for all pairs from a set of upper bounds (sparse or full) based
276 duarte 796 * on the triangle inequality.
277     * The computation is actually just an all pairs shortest path using Dijkstra's
278     * shortest path algorithm (as the set of distance coming from contact maps is
279     * very sparse this algorithm should be more efficient than Floyd's: Dijkstra's is
280     * o(nm logn) and Floyd's is o(n3))
281 duarte 822 * @return a 2-dimensional array with the lower bounds for all pairs of residues, the
282     * indices of the array are guaranteed to be in the same order as the residue serials.
283 duarte 796 */
284 duarte 804 private double[][] getUpperBoundsAllPairs() {
285 duarte 796 double[][] upperBoundsMatrix = new double[conformationSize][conformationSize];
286 duarte 804 SparseGraph<Integer,SimpleEdge> upperBoundGraph = convertBoundsMatrixToUpperBoundGraph();
287 duarte 796 DijkstraDistance<Integer, SimpleEdge> dd = new DijkstraDistance<Integer, SimpleEdge>(upperBoundGraph,WeightTransformer);
288 duarte 802
289     for (int i=0;i<conformationSize;i++) {
290     for (int j=i+1;j<conformationSize;j++) {
291     upperBoundsMatrix[i][j] = dd.getDistance(i, j).doubleValue();
292 duarte 796 }
293     }
294     return upperBoundsMatrix;
295     }
296    
297     /**
298 duarte 804 * Computes lower bounds for all pairs given a set of upper/lower bounds (sparse or full)
299     * based on the triangle inequality.
300 duarte 796 *
301 duarte 797 * NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative
302     * weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Because of this the boundsDigraph result
303 duarte 804 * of calling {@link #convertBoundsMatrixToBoundsDigraph()}} have values offset with the maximum lower bound (lmax).
304 duarte 797 * Thus after computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
305     *
306 duarte 822 * @return a 2-dimensional array with the lower bounds for all pairs of residues, the
307     * indices of the array are guaranteed to be in the same order as the residue serials.
308 duarte 804 * @see {@link #convertBoundsMatrixToBoundsDigraph()} and {@link #getUpperBoundsAllPairs()}
309 duarte 796 */
310 duarte 804 private double[][] getLowerBoundsAllPairs() {
311 duarte 797 double[][] lowerBoundsMatrix = new double[conformationSize][conformationSize];
312     // this is the bounds digraph as described by Crippen and Havel
313 duarte 804 SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = convertBoundsMatrixToBoundsDigraph();
314 duarte 797 DijkstraShortestPath<BoundsDigraphNode, SimpleEdge> dd = new DijkstraShortestPath<BoundsDigraphNode, SimpleEdge>(boundsDigraph,WeightTransformer);
315 duarte 802
316     for (int i=0;i<conformationSize;i++) {
317     for (int j=i+1;j<conformationSize;j++) {
318     int hops = dd.getPath(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)).size();
319     double lower = Math.abs(
320 duarte 797 (dd.getDistance(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
321 duarte 802 nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)
322     ).doubleValue())
323 duarte 797 - (hops*lmax)); // the lower limit for the triangle inequality is: Math.abs(shortestpath-(hops*lmax))
324 duarte 802 lowerBoundsMatrix[i][j] = Math.max(lower, HARD_SPHERES_BOUND); // we only set the new lower bound to the one found if is above the HARD_SPHERES_BOUND
325 duarte 796 }
326     }
327 duarte 797 return lowerBoundsMatrix;
328 duarte 796
329     }
330    
331     /**
332 duarte 802 * Converts the bounds matrix to a graph with only the upper bounds: nodes indices of bounds
333     * matrix, edges upper bounds (in SimpleEdge objects containing the upper bound value
334 duarte 796 * in their weight field).
335 duarte 802 * @param bounds
336 duarte 796 * @return
337     */
338 duarte 804 private SparseGraph<Integer,SimpleEdge> convertBoundsMatrixToUpperBoundGraph() {
339 duarte 796 SparseGraph<Integer,SimpleEdge> upperBoundGraph = new SparseGraph<Integer, SimpleEdge>();
340 duarte 802 for (int i=0;i<conformationSize;i++) {
341     for (int j=0;j<conformationSize;j++) {
342     if (bounds[i][j]!=null) {
343     upperBoundGraph.addEdge(new SimpleEdge(bounds[i][j].upper), i, j, EdgeType.UNDIRECTED);
344     }
345     }
346 duarte 796 }
347     return upperBoundGraph;
348     }
349 duarte 797
350 duarte 796 /**
351 duarte 797 * Constructs a bounds digraph (as described by Crippen and Havel) to compute the triangle inequality limits
352     * for the lower bounds.
353     * The graph is composed by 2 subgraphs (we call them left and right) each of them containing the set of all atoms.
354     * Within the subgraphs there is an undirected edge between atoms i,j with weight the upper bounds for i,j
355     * Between the subgraphs there is a directed edge from left to right between atoms i(left) to j(right)
356     * with weight the negative of the lower bound i,j
357     * NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative
358     * weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Thus we offset the values here to the maximum lower bound.
359     * After computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
360     * @return
361     */
362 duarte 804 private SparseGraph<BoundsDigraphNode,SimpleEdge> convertBoundsMatrixToBoundsDigraph() {
363 duarte 797 // to do the offset thing (see docs above) we need to know first of all the max lower bound
364     ArrayList<Double> lowerBounds = new ArrayList<Double>();
365 duarte 802 for (int i=0;i<conformationSize;i++) {
366     for (int j=0;j<conformationSize;j++) {
367     if (bounds[i][j]!=null) {
368     lowerBounds.add(bounds[i][j].lower);
369     }
370     }
371 duarte 797 }
372     lmax = Collections.max(lowerBounds); // this is the offset value
373    
374     SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = new SparseGraph<BoundsDigraphNode, SimpleEdge>();
375     // we have to store all nodes in a HashMap, so we can retrieve them by residue serial and side after
376     nodesBoundsDigraph = new HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>>();
377     nodesBoundsDigraph.put(BoundsDigraphNode.LEFT , new HashMap<Integer, BoundsDigraphNode>());
378     nodesBoundsDigraph.put(BoundsDigraphNode.RIGHT, new HashMap<Integer, BoundsDigraphNode>());
379     // first we create the nodes and store them into the HashMap
380 duarte 802 for (int i=0;i<conformationSize;i++) {
381 duarte 797 BoundsDigraphNode leftNode = new BoundsDigraphNode(i, BoundsDigraphNode.LEFT);
382     BoundsDigraphNode rightNode = new BoundsDigraphNode(i, BoundsDigraphNode.RIGHT);
383     boundsDigraph.addVertex(leftNode);
384     boundsDigraph.addVertex(rightNode);
385     nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).put(i,leftNode);
386     nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).put(i,rightNode);
387     }
388    
389 duarte 802 for (int i=0;i<conformationSize;i++) {
390     for (int j=0;j<conformationSize;j++) {
391     if (bounds[i][j]!=null) {
392     // first we add the upper bounds as undirected edges to the 2 subgraphs (left and right)
393     boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper),
394     nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
395     nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(j),
396     EdgeType.UNDIRECTED);
397     boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper),
398     nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(i),
399     nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j),
400     EdgeType.UNDIRECTED);
401     // then we add the negative of the lower bounds as directed edges connecting nodes of subgraph left to subgraph right
402     boundsDigraph.addEdge(new SimpleEdge(lmax-bounds[i][j].lower),
403     nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
404     nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j),
405     EdgeType.DIRECTED);
406     }
407     }
408 duarte 797 }
409     return boundsDigraph;
410     }
411    
412 duarte 821 protected void printBounds() {
413 duarte 804 printBounds(this.bounds);
414     }
415 duarte 802
416 duarte 804 /*------------------------ statics ------------------------------*/
417    
418 duarte 802 /**
419     * Deep copies given array of bounds
420     * @param bounds
421     * @return
422     */
423 duarte 821 protected static Bound[][] copyBounds(Bound[][] bounds) {
424 duarte 802 Bound[][] newBounds = new Bound[bounds.length][bounds.length];
425     for (int i=0;i<bounds.length;i++) {
426     for (int j=0;j<bounds[i].length;j++) {
427     if (bounds[i][j]!=null) {
428     newBounds[i][j] = new Bound(bounds[i][j].lower,bounds[i][j].upper);
429     }
430 duarte 799 }
431     }
432 duarte 802 return newBounds;
433 duarte 796 }
434 duarte 804
435 duarte 821 protected static void printBounds(Bound[][] bounds) {
436 duarte 802 for (int i=0;i<bounds.length;i++) {
437     for (int j=0;j<bounds[i].length;j++) {
438     if (bounds[i][j]==null) {
439     System.out.printf("%11s","null");
440     } else {
441     System.out.print(bounds[i][j]);
442     }
443     }
444     System.out.println();
445     }
446     System.out.println();
447     }
448    
449 duarte 796
450     }