trunk/embed/BoundsSmoother.java

package embed;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Random;

import org.apache.commons.collections15.Transformer;

import Jama.Matrix;

import edu.uci.ics.jung.algorithms.shortestpath.DijkstraDistance;
import edu.uci.ics.jung.algorithms.shortestpath.DijkstraShortestPath;
import edu.uci.ics.jung.graph.SparseGraph;
import edu.uci.ics.jung.graph.util.EdgeType;
import proteinstructure.AAinfo;

/**
 * Implementation of the bounds smoothing part of the EMBED algorithm of Crippen and Havel
 * Given a sparse set of distance ranges between a set of atoms it finds distance bounds for
 * all pairs of atoms, using the triangle inequality.
 *  
 * Taken from "Distance Geometry: Theory, Algorithms, and Chemical Applications" (section 3.1) by T.F. Havel, 
 * in Encyclopedia of Computational Chemistry (Wiley, New York, 1998). 
 * See also:
 *  "Distance Geometry and Molecular Conformation" (Chapter 5) by G.M. Crippen and T.F. Havel (Wiley)
 *  "Sampling and efficiency of metric matrix distance geometry: A novel partial
 *   metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
 *   
 * @author duarte
 *
 */
public class BoundsSmoother {
        
        private static final double HARD_SPHERES_BOUND = AAinfo.DIST_MIN_CA ;
        
        private static final boolean DEBUG = false;
        private static final long DEBUG_SEED = 123456;
        
        private static final int NUM_ROOTS_PARTIAL_METRIZATION = 4; // we choose 4 as in Kuszewski et al.

        /*----------------- helper classes and transformers -----------------*/
        
        Transformer<SimpleEdge, Number> WeightTransformer = new Transformer<SimpleEdge, Number>() {
                public Number transform(SimpleEdge input) {
                        return input.weight;
                }
        };
        
        private class SimpleEdge {
                public double weight;
                public SimpleEdge(double weight) {
                        this.weight = weight;
                }
        }
        
        private class BoundsDigraphNode {
                public static final boolean LEFT = true;
                public static final boolean RIGHT = false;
                private boolean side; // true: left, false: right
                private int resSerial;
                public BoundsDigraphNode(int resSerial, boolean side) {
                        this.resSerial = resSerial;
                        this.side = side;
                }
                public boolean isRight() {
                        return !side;
                }
                public boolean isLeft() {
                        return side;
                }
                public int getSerial() {
                        return resSerial;
                }
                public boolean equals(Object other) {
                        if (! (other instanceof BoundsDigraphNode)) return false;
                        BoundsDigraphNode otherNode = (BoundsDigraphNode)other;
                        if (otherNode.resSerial==this.resSerial && otherNode.side == this.side) {
                                return true;
                        } else {
                                return false;
                        }
                }
                public String toString() {
                        return resSerial+(side?"L":"R");
                }
        }

        /*---------------------- member variables ----------------------------*/
        
        private int conformationSize;
        private HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>> nodesBoundsDigraph; // map of serial/side to nodes in the bounds digraph
        private double lmax; // maximum of the lower bounds: offset value for the boundsDigraph (not to have negative weights so that we can use Dijkstra's algo)
        
        private Bound[][] bounds; // the bounds (half-)matrix with lower/upper bounds for all pairs (or with nulls for pairs without assigned bounds yet) 
        
        private Random rand; // the random generator for sampleBounds and metrize
        
        /*------------------------ constructors ------------------------------*/
        
        /**
         * Constructs a new BoundsSmoother object given a matrix of Bounds.
         * @param graph
         */
        public BoundsSmoother(Bound[][] bounds) {
                this.bounds = bounds;
                this.conformationSize = bounds.length;
        }
        
        /*----------------------- public methods  ----------------------------*/
        
        /**
         * Computes bounds for all pairs, based on the set of sparse distance ranges
         * The returned array is a new array not the reference to the internal bounds array.
         * @return a 2-dimensional array with the lower bounds for all pairs of residues, the 
         * indices of the array are guaranteed to be in the same order as the residue serials.
         */
        public Bound[][] getBoundsAllPairs() {
                computeTriangleInequality();
                return copyBounds(bounds); // we return a copy of the internal bounds array so that we can keep modifying it without side-effects to the returned reference
        }
                
        /**
         * Gets a random sample from the internal bounds array of all pairs of distance ranges
         * @return a symmetric metric matrix (both sides filled)
         * @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs 
         */
        public Matrix sampleBounds() {
                return sampleBounds(true);
        }
        
        /**
         * Performs partial metrization for the internal bounds array.
         * The internal bounds array is updated with the new bounds after metrization.
         * The idea is that metrization doesn't need to be done for all atoms but only 
         * for a handful of them (called roots). This results in a much faster algorithm having 
         * almost the same sampling properties as full metrization.
         * See "Sampling and efficiency of metric matrix distance geometry: A novel partial
         * metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
         * @return
         */
        public Matrix metrize() {

                initSeed();
                
                ArrayList<Integer> roots = new ArrayList<Integer>();
                for (int count=1;count<=NUM_ROOTS_PARTIAL_METRIZATION;count++) {
                        int root = rand.nextInt(conformationSize);
                        if (roots.contains(root)) {
                                root = rand.nextInt(conformationSize);
                        }       
                        if (roots.contains(root)) { 
                                System.err.println("Warning: repeated root atom while doing metrization: "+root);
                        }
                        if (DEBUG) System.out.println("Picked root: "+root);                    
                        roots.add(root);
                        sampleBoundForRoot(root); // this alters directly the input bounds array
                        computeTriangleInequality();
                }
                
                // finally pick a value at random for all the other bounds
                return sampleBounds(false);             
        }
        
        /**
         * Gets a reference to the current bounds matrix 
         * @return
         */
        protected Bound[][] getBounds() {
                return bounds;
        }
        
        /*----------------------- private methods  ---------------------------*/
        
        /**
         * Initialises (or resets the random seed).
         * If DEBUG flag is true the random seed will be a fixed value DEBUG_SEED
         */
        private void initSeed() {
                if (DEBUG) {
                        rand = new Random(DEBUG_SEED);
                } else {
                        rand = new Random();
                } 
        }
        
        /**
         * Gets a random sample from the internal bounds array of all pairs distance ranges
         * @param initSeed if true the random seed is reinitialised, if false the existing one will be used
         * @return a symmetric metric matrix (both sides filled)
         * @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
         */
        private Matrix sampleBounds(boolean initSeed) {
                if (initSeed) {
                        initSeed();
                }
                double[][] matrix = new double[conformationSize][conformationSize];
                for (int i=0;i<conformationSize;i++) {
                        for (int j=0;j<conformationSize;j++) {
                                if (j>i) {
                                        matrix[i][j]= bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
                                } else if (j<i) {
                                        matrix[i][j]=matrix[j][i];
                                }
                        }
                }
                return new Matrix(matrix);
        }

        /**
         * For given root atom samples a value from the distance ranges of the root 
         * to all of its neighbours (updating the internal bounds array with the new sampled bounds)
         * @param root
         */
        private void sampleBoundForRoot(int root) {
                for (int neighb=0;neighb<conformationSize;neighb++) {
                        if (neighb==root) continue; // avoid the diagonal (which contains a null Bound)
                        int i = root;
                        int j = neighb;
                        if (neighb<root) {
                                i = neighb;
                                j = root;
                        }
                        double sampledValue = bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
                        bounds[i][j].lower = sampledValue;
                        bounds[i][j].upper = sampledValue;
                        if (DEBUG) System.out.print(bounds[i][j]);
                }
                if (DEBUG) System.out.println("\n");
        }
        
        /**
         * Computes bounds for all pairs through triangle inequalities from the bounds member variable
         * containing a set of lower/upper bounds (sparse or full)
         * The bounds array is modified to contain the new bounds. 
         * This is guaranteed to run in o(nmlog(n)), thus if the input is sparse then it's pretty fast: o(n2log(n)) but if 
         * the input is the full set of bounds then we have o(n3log(n)).
         */
        private void computeTriangleInequality() { 
                double MARGIN = 0.0001; // for comparing doubles we need some tolerance value

                double[][] lowerBounds = getLowerBoundsAllPairs();
                double[][] upperBounds = getUpperBoundsAllPairs();
                for (int i=0;i<conformationSize;i++) {
                        for (int j=i+1;j<conformationSize;j++) {
                                double upperBound = upperBounds[i][j];
                                double lowerBound = lowerBounds[i][j];
                                
                                if (bounds[i][j]!=null) {
                                        if (lowerBound>upperBound+MARGIN) {
                                                //System.err.println("old: "+sparseBounds[i][j]+" new: "+new Bound(lowerBound,upperBound));

                                                // During metrization sometimes a new upper bound is found that is below the new lower bound 
                                                // (actually in these cases the new lower bound coincides with the old one i.e. nothing new was 
                                                // found through triangle inequality for the lower bound).
                                                // For some reason it doesn't happen the other way around: a new lower bound found that is 
                                                // above the new (coinciding with old) upper bound. I suppose this is because the triangle inequality 
                                                // "is a lot more effective at reducing the upper bounds than increasing the lower bounds" (quoting Havel) 
                                                // To correct this we set both lower and upper to the newly found upper, i.e. we assume that 
                                                // the new upper bound is better because is in accordance to the triangle inequality 
                                                lowerBound=upperBound;

                                                //if (upperBound<sparseBounds[i][j].upper-MARGIN) 
                                                //      System.err.printf("new upper bound (%4.1f) for pair %3d %3d is smaller than old upper bound (%4.1f)\n",upperBound,i,j,sparseBounds[i][j].upper);
                                                //if (lowerBound>sparseBounds[i][j].lower+MARGIN) 
                                                //      System.err.printf("new lower bound (%4.1f) for pair %3d %3d is bigger than old lower bound (%4.1f)\n",lowerBound,i,j,sparseBounds[i][j].lower);
                                        }

                                        bounds[i][j].lower = lowerBound;
                                        bounds[i][j].upper = upperBound;
                                } else {
                                        bounds[i][j] = new Bound(lowerBound,upperBound);
                                }
                                // sanity check: lower bounds can't be bigger than upper bounds!
                                if (lowerBound>upperBound+MARGIN) {
                                        System.err.printf("Warning: lower bound (%4.1f) for pair "+i+" "+j+" is bigger than upper bound (%4.1f)\n",lowerBound,upperBound);
                                }
                        }
                }
        }

        /**
         * Computes upper bounds for all pairs from a set of upper bounds (sparse or full) based
         * on the triangle inequality.
         * The computation is actually just an all pairs shortest path using Dijkstra's 
         * shortest path algorithm (as the set of distance coming from contact maps is
         * very sparse this algorithm should be more efficient than Floyd's: Dijkstra's is
         * o(nm logn) and Floyd's is o(n3))
         * @return a 2-dimensional array with the lower bounds for all pairs of residues, the 
         * indices of the array are guaranteed to be in the same order as the residue serials.
         */
        private double[][] getUpperBoundsAllPairs() {
                double[][] upperBoundsMatrix = new double[conformationSize][conformationSize];
                SparseGraph<Integer,SimpleEdge> upperBoundGraph = convertBoundsMatrixToUpperBoundGraph();
                DijkstraDistance<Integer, SimpleEdge> dd = new DijkstraDistance<Integer, SimpleEdge>(upperBoundGraph,WeightTransformer);

                for (int i=0;i<conformationSize;i++) {
                        for (int j=i+1;j<conformationSize;j++) {
                                upperBoundsMatrix[i][j] = dd.getDistance(i, j).doubleValue();
                        }
                }
                return upperBoundsMatrix;
        }
        
        /**
         * Computes lower bounds for all pairs given a set of upper/lower bounds (sparse or full) 
         * based on the triangle inequality.
         * 
         * NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative 
         * weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Because of this the boundsDigraph result 
         * of calling {@link #convertBoundsMatrixToBoundsDigraph()}} have values offset with the maximum lower bound (lmax).
         * Thus after computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
         *  
         * @return a 2-dimensional array with the lower bounds for all pairs of residues, the 
         * indices of the array are guaranteed to be in the same order as the residue serials. 
         * @see {@link #convertBoundsMatrixToBoundsDigraph()} and {@link #getUpperBoundsAllPairs()}
         */
        private double[][] getLowerBoundsAllPairs() {
                double[][] lowerBoundsMatrix = new double[conformationSize][conformationSize];          
                // this is the bounds digraph as described by Crippen and Havel
                SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = convertBoundsMatrixToBoundsDigraph();
                DijkstraShortestPath<BoundsDigraphNode, SimpleEdge> dd = new DijkstraShortestPath<BoundsDigraphNode, SimpleEdge>(boundsDigraph,WeightTransformer);

                for (int i=0;i<conformationSize;i++) {
                        for (int j=i+1;j<conformationSize;j++) {
                                int hops = dd.getPath(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)).size();
                                double lower = Math.abs(
                                                (dd.getDistance(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), 
                                                                nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)
                                                ).doubleValue()) 
                                                - (hops*lmax)); // the lower limit for the triangle inequality is: Math.abs(shortestpath-(hops*lmax))
                                lowerBoundsMatrix[i][j] = Math.max(lower, HARD_SPHERES_BOUND); // we only set the new lower bound to the one found if is above the HARD_SPHERES_BOUND
                        }
                }
                return lowerBoundsMatrix;
                
        }

        /**
         * Converts the bounds matrix to a graph with only the upper bounds: nodes indices of bounds 
         * matrix, edges upper bounds (in SimpleEdge objects containing the upper bound value 
         * in their weight field).
         * @param bounds
         * @return
         */
        private SparseGraph<Integer,SimpleEdge> convertBoundsMatrixToUpperBoundGraph() {
                SparseGraph<Integer,SimpleEdge> upperBoundGraph = new SparseGraph<Integer, SimpleEdge>();
                for (int i=0;i<conformationSize;i++) {
                        for (int j=0;j<conformationSize;j++) {
                                if (bounds[i][j]!=null) {
                                        upperBoundGraph.addEdge(new SimpleEdge(bounds[i][j].upper), i, j, EdgeType.UNDIRECTED);
                                }
                        }
                }
                return upperBoundGraph;
        }

        /**
         * Constructs a bounds digraph (as described by Crippen and Havel) to compute the triangle inequality limits
         * for the lower bounds.
         * The graph is composed by 2 subgraphs (we call them left and right) each of them containing the set of all atoms. 
         * Within the subgraphs there is an undirected edge between atoms i,j with weight the upper bounds for i,j
         * Between the subgraphs there is a directed edge from left to right between atoms i(left) to j(right) 
         * with weight the negative of the lower bound i,j
         * NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative 
         * weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Thus we offset the values here to the maximum lower bound.
         * After computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
         * @return
         */
        private SparseGraph<BoundsDigraphNode,SimpleEdge> convertBoundsMatrixToBoundsDigraph() {
                // to do the offset thing (see docs above) we need to know first of all the max lower bound
                ArrayList<Double> lowerBounds = new ArrayList<Double>();
                for (int i=0;i<conformationSize;i++) {
                        for (int j=0;j<conformationSize;j++) {
                                if (bounds[i][j]!=null) {
                                        lowerBounds.add(bounds[i][j].lower);
                                }
                        }
                }
                lmax = Collections.max(lowerBounds); // this is the offset value
                
                SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = new SparseGraph<BoundsDigraphNode, SimpleEdge>();
                // we have to store all nodes in a HashMap, so we can retrieve them by residue serial and side after 
                nodesBoundsDigraph = new HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>>();
                nodesBoundsDigraph.put(BoundsDigraphNode.LEFT , new HashMap<Integer, BoundsDigraphNode>());
                nodesBoundsDigraph.put(BoundsDigraphNode.RIGHT, new HashMap<Integer, BoundsDigraphNode>());
                // first we create the nodes and store them into the HashMap
                for (int i=0;i<conformationSize;i++) {
                        BoundsDigraphNode leftNode = new BoundsDigraphNode(i, BoundsDigraphNode.LEFT);
                        BoundsDigraphNode rightNode = new BoundsDigraphNode(i, BoundsDigraphNode.RIGHT);
                        boundsDigraph.addVertex(leftNode);
                        boundsDigraph.addVertex(rightNode);
                        nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).put(i,leftNode);
                        nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).put(i,rightNode);
                }
                
                for (int i=0;i<conformationSize;i++) {
                        for (int j=0;j<conformationSize;j++) {
                                if (bounds[i][j]!=null) {
                                        // first we add the upper bounds as undirected edges to the 2 subgraphs (left and right)
                                        boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(j), 
                                                        EdgeType.UNDIRECTED);
                                        boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(i), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j), 
                                                        EdgeType.UNDIRECTED);
                                        // then we add the negative of the lower bounds as directed edges connecting nodes of subgraph left to subgraph right
                                        boundsDigraph.addEdge(new SimpleEdge(lmax-bounds[i][j].lower), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j), 
                                                        EdgeType.DIRECTED);             
                                }
                        }
                }
                return boundsDigraph;
        }

        protected void printBounds() {
                printBounds(this.bounds);
        }
        
        /*------------------------ statics  ------------------------------*/

        /**
         * Deep copies given array of bounds
         * @param bounds
         * @return
         */
        protected static Bound[][] copyBounds(Bound[][] bounds) {
                Bound[][] newBounds = new Bound[bounds.length][bounds.length];
                for (int i=0;i<bounds.length;i++) {
                        for (int j=0;j<bounds[i].length;j++) {
                                if (bounds[i][j]!=null) {
                                        newBounds[i][j] = new Bound(bounds[i][j].lower,bounds[i][j].upper);
                                }
                        }
                }
                return newBounds;
        }

        protected static void printBounds(Bound[][] bounds) {
                for (int i=0;i<bounds.length;i++) {
                        for (int j=0;j<bounds[i].length;j++) {
                                if (bounds[i][j]==null) {
                                        System.out.printf("%11s","null");
                                } else {
                                        System.out.print(bounds[i][j]);
                                }
                        }
                        System.out.println();
                }
                System.out.println();
        }

        
}
Revision:	823
Committed:	Thu Jan 15 16:09:59 2009 UTC (15 years, 8 months ago) by duarte
File size:	18736 byte(s)
Log Message:	Now printing restrains viols and bounds viols.
Line	User	Rev	File contents
1	duarte	796	package embed;
2
3	duarte	797	import java.util.ArrayList;
4			import java.util.Collections;
5			import java.util.HashMap;
6	duarte	796	import java.util.Random;
7
8			import org.apache.commons.collections15.Transformer;
9
10			import Jama.Matrix;
11
12			import edu.uci.ics.jung.algorithms.shortestpath.DijkstraDistance;
13	duarte	797	import edu.uci.ics.jung.algorithms.shortestpath.DijkstraShortestPath;
14	duarte	796	import edu.uci.ics.jung.graph.SparseGraph;
15			import edu.uci.ics.jung.graph.util.EdgeType;
16			import proteinstructure.AAinfo;
17
18			/**
19			* Implementation of the bounds smoothing part of the EMBED algorithm of Crippen and Havel
20			* Given a sparse set of distance ranges between a set of atoms it finds distance bounds for
21			* all pairs of atoms, using the triangle inequality.
22			*
23			* Taken from "Distance Geometry: Theory, Algorithms, and Chemical Applications" (section 3.1) by T.F. Havel,
24			* in Encyclopedia of Computational Chemistry (Wiley, New York, 1998).
25	duarte	804	* See also:
26			* "Distance Geometry and Molecular Conformation" (Chapter 5) by G.M. Crippen and T.F. Havel (Wiley)
27			* "Sampling and efficiency of metric matrix distance geometry: A novel partial
28			* metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
29	duarte	796	*
30			* @author duarte
31			*
32			*/
33			public class BoundsSmoother {
34
35	duarte	799	private static final double HARD_SPHERES_BOUND = AAinfo.DIST_MIN_CA ;
36	duarte	800
37			private static final boolean DEBUG = false;
38	duarte	802	private static final long DEBUG_SEED = 123456;
39
40	duarte	804	private static final int NUM_ROOTS_PARTIAL_METRIZATION = 4; // we choose 4 as in Kuszewski et al.
41	duarte	796
42			/----------------- helper classes and transformers -----------------/
43
44	duarte	797	Transformer<SimpleEdge, Number> WeightTransformer = new Transformer<SimpleEdge, Number>() {
45			public Number transform(SimpleEdge input) {
46	duarte	796	return input.weight;
47			}
48			};
49
50			private class SimpleEdge {
51			public double weight;
52			public SimpleEdge(double weight) {
53			this.weight = weight;
54			}
55			}
56	duarte	797
57			private class BoundsDigraphNode {
58			public static final boolean LEFT = true;
59			public static final boolean RIGHT = false;
60			private boolean side; // true: left, false: right
61			private int resSerial;
62			public BoundsDigraphNode(int resSerial, boolean side) {
63			this.resSerial = resSerial;
64			this.side = side;
65			}
66			public boolean isRight() {
67			return !side;
68			}
69			public boolean isLeft() {
70			return side;
71			}
72			public int getSerial() {
73			return resSerial;
74			}
75			public boolean equals(Object other) {
76			if (! (other instanceof BoundsDigraphNode)) return false;
77			BoundsDigraphNode otherNode = (BoundsDigraphNode)other;
78			if (otherNode.resSerial==this.resSerial && otherNode.side == this.side) {
79			return true;
80			} else {
81			return false;
82			}
83			}
84			public String toString() {
85			return resSerial+(side?"L":"R");
86			}
87			}
88	duarte	796
89			/---------------------- member variables ----------------------------/
90
91			private int conformationSize;
92	duarte	797	private HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>> nodesBoundsDigraph; // map of serial/side to nodes in the bounds digraph
93			private double lmax; // maximum of the lower bounds: offset value for the boundsDigraph (not to have negative weights so that we can use Dijkstra's algo)
94	duarte	796
95	duarte	804	private Bound[][] bounds; // the bounds (half-)matrix with lower/upper bounds for all pairs (or with nulls for pairs without assigned bounds yet)
96
97	duarte	802	private Random rand; // the random generator for sampleBounds and metrize
98
99	duarte	796	/------------------------ constructors ------------------------------/
100
101			/**
102	duarte	822	* Constructs a new BoundsSmoother object given a matrix of Bounds.
103	duarte	796	* @param graph
104			*/
105	duarte	821	public BoundsSmoother(Bound[][] bounds) {
106			this.bounds = bounds;
107			this.conformationSize = bounds.length;
108	duarte	796	}
109
110			/----------------------- public methods ----------------------------/
111
112			/**
113			* Computes bounds for all pairs, based on the set of sparse distance ranges
114	duarte	804	* The returned array is a new array not the reference to the internal bounds array.
115	duarte	822	* @return a 2-dimensional array with the lower bounds for all pairs of residues, the
116			* indices of the array are guaranteed to be in the same order as the residue serials.
117	duarte	796	*/
118			public Bound[][] getBoundsAllPairs() {
119	duarte	804	computeTriangleInequality();
120			return copyBounds(bounds); // we return a copy of the internal bounds array so that we can keep modifying it without side-effects to the returned reference
121	duarte	796	}
122	duarte	798
123	duarte	796	/**
124	duarte	821	* Gets a random sample from the internal bounds array of all pairs of distance ranges
125			* @return a symmetric metric matrix (both sides filled)
126			* @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
127	duarte	796	*/
128	duarte	804	public Matrix sampleBounds() {
129	duarte	821	return sampleBounds(true);
130	duarte	796	}
131	duarte	821
132	duarte	803	/**
133	duarte	804	* Performs partial metrization for the internal bounds array.
134			* The internal bounds array is updated with the new bounds after metrization.
135			* The idea is that metrization doesn't need to be done for all atoms but only
136			* for a handful of them (called roots). This results in a much faster algorithm having
137			* almost the same sampling properties as full metrization.
138			* See "Sampling and efficiency of metric matrix distance geometry: A novel partial
139			* metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
140	duarte	803	* @return
141			*/
142	duarte	804	public Matrix metrize() {
143	duarte	803
144	duarte	821	initSeed();
145
146	duarte	802	ArrayList<Integer> roots = new ArrayList<Integer>();
147			for (int count=1;count<=NUM_ROOTS_PARTIAL_METRIZATION;count++) {
148			int root = rand.nextInt(conformationSize);
149	duarte	804	if (roots.contains(root)) {
150			root = rand.nextInt(conformationSize);
151			}
152			if (roots.contains(root)) {
153			System.err.println("Warning: repeated root atom while doing metrization: "+root);
154			}
155	duarte	803	if (DEBUG) System.out.println("Picked root: "+root);
156	duarte	802	roots.add(root);
157	duarte	804	sampleBoundForRoot(root); // this alters directly the input bounds array
158			computeTriangleInequality();
159	duarte	802	}
160
161	duarte	803	// finally pick a value at random for all the other bounds
162	duarte	821	return sampleBounds(false);
163	duarte	802	}
164
165	duarte	823	/**
166			* Gets a reference to the current bounds matrix
167			* @return
168			*/
169			protected Bound[][] getBounds() {
170			return bounds;
171			}
172
173	duarte	821	/----------------------- private methods ---------------------------/
174
175	duarte	796	/**
176	duarte	821	* Initialises (or resets the random seed).
177			* If DEBUG flag is true the random seed will be a fixed value DEBUG_SEED
178	duarte	796	*/
179	duarte	821	private void initSeed() {
180			if (DEBUG) {
181			rand = new Random(DEBUG_SEED);
182			} else {
183			rand = new Random();
184			}
185	duarte	796	}
186
187	duarte	804	/**
188	duarte	821	* Gets a random sample from the internal bounds array of all pairs distance ranges
189			* @param initSeed if true the random seed is reinitialised, if false the existing one will be used
190			* @return a symmetric metric matrix (both sides filled)
191			* @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
192			*/
193			private Matrix sampleBounds(boolean initSeed) {
194			if (initSeed) {
195			initSeed();
196			}
197			double[][] matrix = new double[conformationSize][conformationSize];
198			for (int i=0;i<conformationSize;i++) {
199			for (int j=0;j<conformationSize;j++) {
200			if (j>i) {
201			matrix[i][j]= bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
202			} else if (j<i) {
203			matrix[i][j]=matrix[j][i];
204			}
205			}
206			}
207			return new Matrix(matrix);
208			}
209
210			/**
211	duarte	804	* For given root atom samples a value from the distance ranges of the root
212			* to all of its neighbours (updating the internal bounds array with the new sampled bounds)
213			* @param root
214			*/
215			private void sampleBoundForRoot(int root) {
216	duarte	803	for (int neighb=0;neighb<conformationSize;neighb++) {
217			if (neighb==root) continue; // avoid the diagonal (which contains a null Bound)
218	duarte	802	int i = root;
219	duarte	803	int j = neighb;
220			if (neighb<root) {
221			i = neighb;
222	duarte	802	j = root;
223			}
224			double sampledValue = bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
225	duarte	803	bounds[i][j].lower = sampledValue;
226			bounds[i][j].upper = sampledValue;
227			if (DEBUG) System.out.print(bounds[i][j]);
228	duarte	802	}
229	duarte	803	if (DEBUG) System.out.println("\n");
230	duarte	802	}
231
232	duarte	796	/**
233	duarte	804	* Computes bounds for all pairs through triangle inequalities from the bounds member variable
234			* containing a set of lower/upper bounds (sparse or full)
235			* The bounds array is modified to contain the new bounds.
236			* This is guaranteed to run in o(nmlog(n)), thus if the input is sparse then it's pretty fast: o(n2log(n)) but if
237			* the input is the full set of bounds then we have o(n3log(n)).
238	duarte	798	*/
239	duarte	804	private void computeTriangleInequality() {
240	duarte	803	double MARGIN = 0.0001; // for comparing doubles we need some tolerance value
241	duarte	804
242			double[][] lowerBounds = getLowerBoundsAllPairs();
243			double[][] upperBounds = getUpperBoundsAllPairs();
244	duarte	802	for (int i=0;i<conformationSize;i++) {
245			for (int j=i+1;j<conformationSize;j++) {
246	duarte	798	double upperBound = upperBounds[i][j];
247			double lowerBound = lowerBounds[i][j];
248	duarte	804
249			if (bounds[i][j]!=null) {
250			if (lowerBound>upperBound+MARGIN) {
251			//System.err.println("old: "+sparseBounds[i][j]+" new: "+new Bound(lowerBound,upperBound));
252
253			// During metrization sometimes a new upper bound is found that is below the new lower bound
254			// (actually in these cases the new lower bound coincides with the old one i.e. nothing new was
255			// found through triangle inequality for the lower bound).
256			// For some reason it doesn't happen the other way around: a new lower bound found that is
257			// above the new (coinciding with old) upper bound. I suppose this is because the triangle inequality
258			// "is a lot more effective at reducing the upper bounds than increasing the lower bounds" (quoting Havel)
259			// To correct this we set both lower and upper to the newly found upper, i.e. we assume that
260			// the new upper bound is better because is in accordance to the triangle inequality
261			lowerBound=upperBound;
262
263			//if (upperBound<sparseBounds[i][j].upper-MARGIN)
264			// System.err.printf("new upper bound (%4.1f) for pair %3d %3d is smaller than old upper bound (%4.1f)\n",upperBound,i,j,sparseBounds[i][j].upper);
265			//if (lowerBound>sparseBounds[i][j].lower+MARGIN)
266			// System.err.printf("new lower bound (%4.1f) for pair %3d %3d is bigger than old lower bound (%4.1f)\n",lowerBound,i,j,sparseBounds[i][j].lower);
267			}
268
269			bounds[i][j].lower = lowerBound;
270			bounds[i][j].upper = upperBound;
271			} else {
272			bounds[i][j] = new Bound(lowerBound,upperBound);
273	duarte	803	}
274	duarte	802	// sanity check: lower bounds can't be bigger than upper bounds!
275	duarte	803	if (lowerBound>upperBound+MARGIN) {
276	duarte	802	System.err.printf("Warning: lower bound (%4.1f) for pair "+i+" "+j+" is bigger than upper bound (%4.1f)\n",lowerBound,upperBound);
277	duarte	798	}
278			}
279			}
280			}
281
282			/**
283	duarte	804	* Computes upper bounds for all pairs from a set of upper bounds (sparse or full) based
284	duarte	796	* on the triangle inequality.
285			* The computation is actually just an all pairs shortest path using Dijkstra's
286			* shortest path algorithm (as the set of distance coming from contact maps is
287			* very sparse this algorithm should be more efficient than Floyd's: Dijkstra's is
288			* o(nm logn) and Floyd's is o(n3))
289	duarte	822	* @return a 2-dimensional array with the lower bounds for all pairs of residues, the
290			* indices of the array are guaranteed to be in the same order as the residue serials.
291	duarte	796	*/
292	duarte	804	private double[][] getUpperBoundsAllPairs() {
293	duarte	796	double[][] upperBoundsMatrix = new double[conformationSize][conformationSize];
294	duarte	804	SparseGraph<Integer,SimpleEdge> upperBoundGraph = convertBoundsMatrixToUpperBoundGraph();
295	duarte	796	DijkstraDistance<Integer, SimpleEdge> dd = new DijkstraDistance<Integer, SimpleEdge>(upperBoundGraph,WeightTransformer);
296	duarte	802
297			for (int i=0;i<conformationSize;i++) {
298			for (int j=i+1;j<conformationSize;j++) {
299			upperBoundsMatrix[i][j] = dd.getDistance(i, j).doubleValue();
300	duarte	796	}
301			}
302			return upperBoundsMatrix;
303			}
304
305			/**
306	duarte	804	* Computes lower bounds for all pairs given a set of upper/lower bounds (sparse or full)
307			* based on the triangle inequality.
308	duarte	796	*
309	duarte	797	* NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative
310			* weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Because of this the boundsDigraph result
311	duarte	804	* of calling {@link #convertBoundsMatrixToBoundsDigraph()}} have values offset with the maximum lower bound (lmax).
312	duarte	797	* Thus after computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
313			*
314	duarte	822	* @return a 2-dimensional array with the lower bounds for all pairs of residues, the
315			* indices of the array are guaranteed to be in the same order as the residue serials.
316	duarte	804	* @see {@link #convertBoundsMatrixToBoundsDigraph()} and {@link #getUpperBoundsAllPairs()}
317	duarte	796	*/
318	duarte	804	private double[][] getLowerBoundsAllPairs() {
319	duarte	797	double[][] lowerBoundsMatrix = new double[conformationSize][conformationSize];
320			// this is the bounds digraph as described by Crippen and Havel
321	duarte	804	SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = convertBoundsMatrixToBoundsDigraph();
322	duarte	797	DijkstraShortestPath<BoundsDigraphNode, SimpleEdge> dd = new DijkstraShortestPath<BoundsDigraphNode, SimpleEdge>(boundsDigraph,WeightTransformer);
323	duarte	802
324			for (int i=0;i<conformationSize;i++) {
325			for (int j=i+1;j<conformationSize;j++) {
326			int hops = dd.getPath(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)).size();
327			double lower = Math.abs(
328	duarte	797	(dd.getDistance(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
329	duarte	802	nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)
330			).doubleValue())
331	duarte	797	- (hopslmax)); // the lower limit for the triangle inequality is: Math.abs(shortestpath-(hopslmax))
332	duarte	802	lowerBoundsMatrix[i][j] = Math.max(lower, HARD_SPHERES_BOUND); // we only set the new lower bound to the one found if is above the HARD_SPHERES_BOUND
333	duarte	796	}
334			}
335	duarte	797	return lowerBoundsMatrix;
336	duarte	796
337			}
338
339			/**
340	duarte	802	* Converts the bounds matrix to a graph with only the upper bounds: nodes indices of bounds
341			* matrix, edges upper bounds (in SimpleEdge objects containing the upper bound value
342	duarte	796	* in their weight field).
343	duarte	802	* @param bounds
344	duarte	796	* @return
345			*/
346	duarte	804	private SparseGraph<Integer,SimpleEdge> convertBoundsMatrixToUpperBoundGraph() {
347	duarte	796	SparseGraph<Integer,SimpleEdge> upperBoundGraph = new SparseGraph<Integer, SimpleEdge>();
348	duarte	802	for (int i=0;i<conformationSize;i++) {
349			for (int j=0;j<conformationSize;j++) {
350			if (bounds[i][j]!=null) {
351			upperBoundGraph.addEdge(new SimpleEdge(bounds[i][j].upper), i, j, EdgeType.UNDIRECTED);
352			}
353			}
354	duarte	796	}
355			return upperBoundGraph;
356			}
357	duarte	797
358	duarte	796	/**
359	duarte	797	* Constructs a bounds digraph (as described by Crippen and Havel) to compute the triangle inequality limits
360			* for the lower bounds.
361			* The graph is composed by 2 subgraphs (we call them left and right) each of them containing the set of all atoms.
362			* Within the subgraphs there is an undirected edge between atoms i,j with weight the upper bounds for i,j
363			* Between the subgraphs there is a directed edge from left to right between atoms i(left) to j(right)
364			* with weight the negative of the lower bound i,j
365			* NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative
366			* weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Thus we offset the values here to the maximum lower bound.
367			* After computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
368			* @return
369			*/
370	duarte	804	private SparseGraph<BoundsDigraphNode,SimpleEdge> convertBoundsMatrixToBoundsDigraph() {
371	duarte	797	// to do the offset thing (see docs above) we need to know first of all the max lower bound
372			ArrayList<Double> lowerBounds = new ArrayList<Double>();
373	duarte	802	for (int i=0;i<conformationSize;i++) {
374			for (int j=0;j<conformationSize;j++) {
375			if (bounds[i][j]!=null) {
376			lowerBounds.add(bounds[i][j].lower);
377			}
378			}
379	duarte	797	}
380			lmax = Collections.max(lowerBounds); // this is the offset value
381
382			SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = new SparseGraph<BoundsDigraphNode, SimpleEdge>();
383			// we have to store all nodes in a HashMap, so we can retrieve them by residue serial and side after
384			nodesBoundsDigraph = new HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>>();
385			nodesBoundsDigraph.put(BoundsDigraphNode.LEFT , new HashMap<Integer, BoundsDigraphNode>());
386			nodesBoundsDigraph.put(BoundsDigraphNode.RIGHT, new HashMap<Integer, BoundsDigraphNode>());
387			// first we create the nodes and store them into the HashMap
388	duarte	802	for (int i=0;i<conformationSize;i++) {
389	duarte	797	BoundsDigraphNode leftNode = new BoundsDigraphNode(i, BoundsDigraphNode.LEFT);
390			BoundsDigraphNode rightNode = new BoundsDigraphNode(i, BoundsDigraphNode.RIGHT);
391			boundsDigraph.addVertex(leftNode);
392			boundsDigraph.addVertex(rightNode);
393			nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).put(i,leftNode);
394			nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).put(i,rightNode);
395			}
396
397	duarte	802	for (int i=0;i<conformationSize;i++) {
398			for (int j=0;j<conformationSize;j++) {
399			if (bounds[i][j]!=null) {
400			// first we add the upper bounds as undirected edges to the 2 subgraphs (left and right)
401			boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper),
402			nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
403			nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(j),
404			EdgeType.UNDIRECTED);
405			boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper),
406			nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(i),
407			nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j),
408			EdgeType.UNDIRECTED);
409			// then we add the negative of the lower bounds as directed edges connecting nodes of subgraph left to subgraph right
410			boundsDigraph.addEdge(new SimpleEdge(lmax-bounds[i][j].lower),
411			nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
412			nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j),
413			EdgeType.DIRECTED);
414			}
415			}
416	duarte	797	}
417			return boundsDigraph;
418			}
419
420	duarte	821	protected void printBounds() {
421	duarte	804	printBounds(this.bounds);
422			}
423	duarte	802
424	duarte	804	/------------------------ statics ------------------------------/
425
426	duarte	802	/**
427			* Deep copies given array of bounds
428			* @param bounds
429			* @return
430			*/
431	duarte	821	protected static Bound[][] copyBounds(Bound[][] bounds) {
432	duarte	802	Bound[][] newBounds = new Bound[bounds.length][bounds.length];
433			for (int i=0;i<bounds.length;i++) {
434			for (int j=0;j<bounds[i].length;j++) {
435			if (bounds[i][j]!=null) {
436			newBounds[i][j] = new Bound(bounds[i][j].lower,bounds[i][j].upper);
437			}
438	duarte	799	}
439			}
440	duarte	802	return newBounds;
441	duarte	796	}
442	duarte	804
443	duarte	821	protected static void printBounds(Bound[][] bounds) {
444	duarte	802	for (int i=0;i<bounds.length;i++) {
445			for (int j=0;j<bounds[i].length;j++) {
446			if (bounds[i][j]==null) {
447			System.out.printf("%11s","null");
448			} else {
449			System.out.print(bounds[i][j]);
450			}
451			}
452			System.out.println();
453			}
454			System.out.println();
455			}
456
457	duarte	796
458			}