trunk/embed/BoundsSmoother.java

package embed;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Random;

import org.apache.commons.collections15.Transformer;

import Jama.Matrix;

import edu.uci.ics.jung.algorithms.shortestpath.DijkstraDistance;
import edu.uci.ics.jung.algorithms.shortestpath.DijkstraShortestPath;
import edu.uci.ics.jung.graph.SparseGraph;
import edu.uci.ics.jung.graph.util.EdgeType;
import proteinstructure.AAinfo;

/**
 * Implementation of the bounds smoothing part of the EMBED algorithm of Crippen and Havel
 * Given a sparse set of distance ranges between a set of atoms it finds distance bounds for
 * all pairs of atoms, using the triangle inequality.
 *  
 * Taken from "Distance Geometry: Theory, Algorithms, and Chemical Applications" (section 3.1) by T.F. Havel, 
 * in Encyclopedia of Computational Chemistry (Wiley, New York, 1998). 
 * See also:
 *  "Distance Geometry and Molecular Conformation" (Chapter 5) by G.M. Crippen and T.F. Havel (Wiley)
 *  "Sampling and efficiency of metric matrix distance geometry: A novel partial
 *   metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
 *   
 * @author duarte
 *
 */
public class BoundsSmoother {
        
        private static final double HARD_SPHERES_BOUND = AAinfo.DIST_MIN_CA ;
        
        private static final boolean DEBUG = false;
        private static final long DEBUG_SEED = 123456;
        
        private static final int NUM_ROOTS_PARTIAL_METRIZATION = 4; // we choose 4 as in Kuszewski et al.

        /*----------------- helper classes and transformers -----------------*/
        
        Transformer<SimpleEdge, Number> WeightTransformer = new Transformer<SimpleEdge, Number>() {
                public Number transform(SimpleEdge input) {
                        return input.weight;
                }
        };
        
        private class SimpleEdge {
                public double weight;
                public SimpleEdge(double weight) {
                        this.weight = weight;
                }
        }
        
        private class BoundsDigraphNode {
                public static final boolean LEFT = true;
                public static final boolean RIGHT = false;
                private boolean side; // true: left, false: right
                private int resSerial;
                public BoundsDigraphNode(int resSerial, boolean side) {
                        this.resSerial = resSerial;
                        this.side = side;
                }
                public boolean isRight() {
                        return !side;
                }
                public boolean isLeft() {
                        return side;
                }
                public int getSerial() {
                        return resSerial;
                }
                public boolean equals(Object other) {
                        if (! (other instanceof BoundsDigraphNode)) return false;
                        BoundsDigraphNode otherNode = (BoundsDigraphNode)other;
                        if (otherNode.resSerial==this.resSerial && otherNode.side == this.side) {
                                return true;
                        } else {
                                return false;
                        }
                }
                public String toString() {
                        return resSerial+(side?"L":"R");
                }
        }

        /*---------------------- member variables ----------------------------*/
        
        private int conformationSize;
        private HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>> nodesBoundsDigraph; // map of serial/side to nodes in the bounds digraph
        private double lmax; // maximum of the lower bounds: offset value for the boundsDigraph (not to have negative weights so that we can use Dijkstra's algo)
        
        private Bound[][] bounds; // the bounds (half-)matrix with lower/upper bounds for all pairs (or with nulls for pairs without assigned bounds yet) 
        
        private Random rand; // the random generator for sampleBounds and metrize
        
        /*------------------------ constructors ------------------------------*/
        
        /**
         * Constructs a new BoundsSmoother object given a matrix of Bounds.
         * @param graph
         */
        public BoundsSmoother(Bound[][] bounds) {
                this.bounds = bounds;
                this.conformationSize = bounds.length;
        }
        
        /*----------------------- public methods  ----------------------------*/
        
        /**
         * Computes bounds for all pairs, based on the set of sparse distance ranges
         * The returned array is a new array not the reference to the internal bounds array.
         * @return a 2-dimensional array with the lower bounds for all pairs of residues, the 
         * indices of the array are guaranteed to be in the same order as the residue serials.
         */
        public Bound[][] getBoundsAllPairs() {
                computeTriangleInequality();
                return copyBounds(bounds); // we return a copy of the internal bounds array so that we can keep modifying it without side-effects to the returned reference
        }
                
        /**
         * Gets a random sample from the internal bounds array of all pairs of distance ranges
         * @return a symmetric metric matrix (both sides filled)
         * @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs 
         */
        public Matrix sampleBounds() {
                return sampleBounds(true);
        }
        
        /**
         * Performs partial metrization for the internal bounds array.
         * The internal bounds array is updated with the new bounds after metrization.
         * The idea is that metrization doesn't need to be done for all atoms but only 
         * for a handful of them (called roots). This results in a much faster algorithm having 
         * almost the same sampling properties as full metrization.
         * See "Sampling and efficiency of metric matrix distance geometry: A novel partial
         * metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
         * @return
         */
        public Matrix metrize() {

                initSeed();
                
                ArrayList<Integer> roots = new ArrayList<Integer>();
                for (int count=1;count<=NUM_ROOTS_PARTIAL_METRIZATION;count++) {
                        int root = rand.nextInt(conformationSize);
                        if (roots.contains(root)) {
                                root = rand.nextInt(conformationSize);
                        }       
                        if (roots.contains(root)) { 
                                System.err.println("Warning: repeated root atom while doing metrization: "+root);
                        }
                        if (DEBUG) System.out.println("Picked root: "+root);                    
                        roots.add(root);
                        sampleBoundForRoot(root); // this alters directly the input bounds array
                        computeTriangleInequality();
                }
                
                // finally pick a value at random for all the other bounds
                return sampleBounds(false);             
        }
        
        /*----------------------- private methods  ---------------------------*/
        
        /**
         * Initialises (or resets the random seed).
         * If DEBUG flag is true the random seed will be a fixed value DEBUG_SEED
         */
        private void initSeed() {
                if (DEBUG) {
                        rand = new Random(DEBUG_SEED);
                } else {
                        rand = new Random();
                } 
        }
        
        /**
         * Gets a random sample from the internal bounds array of all pairs distance ranges
         * @param initSeed if true the random seed is reinitialised, if false the existing one will be used
         * @return a symmetric metric matrix (both sides filled)
         * @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
         */
        private Matrix sampleBounds(boolean initSeed) {
                if (initSeed) {
                        initSeed();
                }
                double[][] matrix = new double[conformationSize][conformationSize];
                for (int i=0;i<conformationSize;i++) {
                        for (int j=0;j<conformationSize;j++) {
                                if (j>i) {
                                        matrix[i][j]= bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
                                } else if (j<i) {
                                        matrix[i][j]=matrix[j][i];
                                }
                        }
                }
                return new Matrix(matrix);
        }

        /**
         * For given root atom samples a value from the distance ranges of the root 
         * to all of its neighbours (updating the internal bounds array with the new sampled bounds)
         * @param root
         */
        private void sampleBoundForRoot(int root) {
                for (int neighb=0;neighb<conformationSize;neighb++) {
                        if (neighb==root) continue; // avoid the diagonal (which contains a null Bound)
                        int i = root;
                        int j = neighb;
                        if (neighb<root) {
                                i = neighb;
                                j = root;
                        }
                        double sampledValue = bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
                        bounds[i][j].lower = sampledValue;
                        bounds[i][j].upper = sampledValue;
                        if (DEBUG) System.out.print(bounds[i][j]);
                }
                if (DEBUG) System.out.println("\n");
        }
        
        /**
         * Computes bounds for all pairs through triangle inequalities from the bounds member variable
         * containing a set of lower/upper bounds (sparse or full)
         * The bounds array is modified to contain the new bounds. 
         * This is guaranteed to run in o(nmlog(n)), thus if the input is sparse then it's pretty fast: o(n2log(n)) but if 
         * the input is the full set of bounds then we have o(n3log(n)).
         */
        private void computeTriangleInequality() { 
                double MARGIN = 0.0001; // for comparing doubles we need some tolerance value

                double[][] lowerBounds = getLowerBoundsAllPairs();
                double[][] upperBounds = getUpperBoundsAllPairs();
                for (int i=0;i<conformationSize;i++) {
                        for (int j=i+1;j<conformationSize;j++) {
                                double upperBound = upperBounds[i][j];
                                double lowerBound = lowerBounds[i][j];
                                
                                if (bounds[i][j]!=null) {
                                        if (lowerBound>upperBound+MARGIN) {
                                                //System.err.println("old: "+sparseBounds[i][j]+" new: "+new Bound(lowerBound,upperBound));

                                                // During metrization sometimes a new upper bound is found that is below the new lower bound 
                                                // (actually in these cases the new lower bound coincides with the old one i.e. nothing new was 
                                                // found through triangle inequality for the lower bound).
                                                // For some reason it doesn't happen the other way around: a new lower bound found that is 
                                                // above the new (coinciding with old) upper bound. I suppose this is because the triangle inequality 
                                                // "is a lot more effective at reducing the upper bounds than increasing the lower bounds" (quoting Havel) 
                                                // To correct this we set both lower and upper to the newly found upper, i.e. we assume that 
                                                // the new upper bound is better because is in accordance to the triangle inequality 
                                                lowerBound=upperBound;

                                                //if (upperBound<sparseBounds[i][j].upper-MARGIN) 
                                                //      System.err.printf("new upper bound (%4.1f) for pair %3d %3d is smaller than old upper bound (%4.1f)\n",upperBound,i,j,sparseBounds[i][j].upper);
                                                //if (lowerBound>sparseBounds[i][j].lower+MARGIN) 
                                                //      System.err.printf("new lower bound (%4.1f) for pair %3d %3d is bigger than old lower bound (%4.1f)\n",lowerBound,i,j,sparseBounds[i][j].lower);
                                        }

                                        bounds[i][j].lower = lowerBound;
                                        bounds[i][j].upper = upperBound;
                                } else {
                                        bounds[i][j] = new Bound(lowerBound,upperBound);
                                }
                                // sanity check: lower bounds can't be bigger than upper bounds!
                                if (lowerBound>upperBound+MARGIN) {
                                        System.err.printf("Warning: lower bound (%4.1f) for pair "+i+" "+j+" is bigger than upper bound (%4.1f)\n",lowerBound,upperBound);
                                }
                        }
                }
        }

        /**
         * Computes upper bounds for all pairs from a set of upper bounds (sparse or full) based
         * on the triangle inequality.
         * The computation is actually just an all pairs shortest path using Dijkstra's 
         * shortest path algorithm (as the set of distance coming from contact maps is
         * very sparse this algorithm should be more efficient than Floyd's: Dijkstra's is
         * o(nm logn) and Floyd's is o(n3))
         * @return a 2-dimensional array with the lower bounds for all pairs of residues, the 
         * indices of the array are guaranteed to be in the same order as the residue serials.
         */
        private double[][] getUpperBoundsAllPairs() {
                double[][] upperBoundsMatrix = new double[conformationSize][conformationSize];
                SparseGraph<Integer,SimpleEdge> upperBoundGraph = convertBoundsMatrixToUpperBoundGraph();
                DijkstraDistance<Integer, SimpleEdge> dd = new DijkstraDistance<Integer, SimpleEdge>(upperBoundGraph,WeightTransformer);

                for (int i=0;i<conformationSize;i++) {
                        for (int j=i+1;j<conformationSize;j++) {
                                upperBoundsMatrix[i][j] = dd.getDistance(i, j).doubleValue();
                        }
                }
                return upperBoundsMatrix;
        }
        
        /**
         * Computes lower bounds for all pairs given a set of upper/lower bounds (sparse or full) 
         * based on the triangle inequality.
         * 
         * NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative 
         * weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Because of this the boundsDigraph result 
         * of calling {@link #convertBoundsMatrixToBoundsDigraph()}} have values offset with the maximum lower bound (lmax).
         * Thus after computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
         *  
         * @return a 2-dimensional array with the lower bounds for all pairs of residues, the 
         * indices of the array are guaranteed to be in the same order as the residue serials. 
         * @see {@link #convertBoundsMatrixToBoundsDigraph()} and {@link #getUpperBoundsAllPairs()}
         */
        private double[][] getLowerBoundsAllPairs() {
                double[][] lowerBoundsMatrix = new double[conformationSize][conformationSize];          
                // this is the bounds digraph as described by Crippen and Havel
                SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = convertBoundsMatrixToBoundsDigraph();
                DijkstraShortestPath<BoundsDigraphNode, SimpleEdge> dd = new DijkstraShortestPath<BoundsDigraphNode, SimpleEdge>(boundsDigraph,WeightTransformer);

                for (int i=0;i<conformationSize;i++) {
                        for (int j=i+1;j<conformationSize;j++) {
                                int hops = dd.getPath(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)).size();
                                double lower = Math.abs(
                                                (dd.getDistance(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), 
                                                                nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)
                                                ).doubleValue()) 
                                                - (hops*lmax)); // the lower limit for the triangle inequality is: Math.abs(shortestpath-(hops*lmax))
                                lowerBoundsMatrix[i][j] = Math.max(lower, HARD_SPHERES_BOUND); // we only set the new lower bound to the one found if is above the HARD_SPHERES_BOUND
                        }
                }
                return lowerBoundsMatrix;
                
        }

        /**
         * Converts the bounds matrix to a graph with only the upper bounds: nodes indices of bounds 
         * matrix, edges upper bounds (in SimpleEdge objects containing the upper bound value 
         * in their weight field).
         * @param bounds
         * @return
         */
        private SparseGraph<Integer,SimpleEdge> convertBoundsMatrixToUpperBoundGraph() {
                SparseGraph<Integer,SimpleEdge> upperBoundGraph = new SparseGraph<Integer, SimpleEdge>();
                for (int i=0;i<conformationSize;i++) {
                        for (int j=0;j<conformationSize;j++) {
                                if (bounds[i][j]!=null) {
                                        upperBoundGraph.addEdge(new SimpleEdge(bounds[i][j].upper), i, j, EdgeType.UNDIRECTED);
                                }
                        }
                }
                return upperBoundGraph;
        }

        /**
         * Constructs a bounds digraph (as described by Crippen and Havel) to compute the triangle inequality limits
         * for the lower bounds.
         * The graph is composed by 2 subgraphs (we call them left and right) each of them containing the set of all atoms. 
         * Within the subgraphs there is an undirected edge between atoms i,j with weight the upper bounds for i,j
         * Between the subgraphs there is a directed edge from left to right between atoms i(left) to j(right) 
         * with weight the negative of the lower bound i,j
         * NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative 
         * weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Thus we offset the values here to the maximum lower bound.
         * After computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
         * @return
         */
        private SparseGraph<BoundsDigraphNode,SimpleEdge> convertBoundsMatrixToBoundsDigraph() {
                // to do the offset thing (see docs above) we need to know first of all the max lower bound
                ArrayList<Double> lowerBounds = new ArrayList<Double>();
                for (int i=0;i<conformationSize;i++) {
                        for (int j=0;j<conformationSize;j++) {
                                if (bounds[i][j]!=null) {
                                        lowerBounds.add(bounds[i][j].lower);
                                }
                        }
                }
                lmax = Collections.max(lowerBounds); // this is the offset value
                
                SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = new SparseGraph<BoundsDigraphNode, SimpleEdge>();
                // we have to store all nodes in a HashMap, so we can retrieve them by residue serial and side after 
                nodesBoundsDigraph = new HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>>();
                nodesBoundsDigraph.put(BoundsDigraphNode.LEFT , new HashMap<Integer, BoundsDigraphNode>());
                nodesBoundsDigraph.put(BoundsDigraphNode.RIGHT, new HashMap<Integer, BoundsDigraphNode>());
                // first we create the nodes and store them into the HashMap
                for (int i=0;i<conformationSize;i++) {
                        BoundsDigraphNode leftNode = new BoundsDigraphNode(i, BoundsDigraphNode.LEFT);
                        BoundsDigraphNode rightNode = new BoundsDigraphNode(i, BoundsDigraphNode.RIGHT);
                        boundsDigraph.addVertex(leftNode);
                        boundsDigraph.addVertex(rightNode);
                        nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).put(i,leftNode);
                        nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).put(i,rightNode);
                }
                
                for (int i=0;i<conformationSize;i++) {
                        for (int j=0;j<conformationSize;j++) {
                                if (bounds[i][j]!=null) {
                                        // first we add the upper bounds as undirected edges to the 2 subgraphs (left and right)
                                        boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(j), 
                                                        EdgeType.UNDIRECTED);
                                        boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(i), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j), 
                                                        EdgeType.UNDIRECTED);
                                        // then we add the negative of the lower bounds as directed edges connecting nodes of subgraph left to subgraph right
                                        boundsDigraph.addEdge(new SimpleEdge(lmax-bounds[i][j].lower), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), 
                                                        nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j), 
                                                        EdgeType.DIRECTED);             
                                }
                        }
                }
                return boundsDigraph;
        }

        protected void printBounds() {
                printBounds(this.bounds);
        }
        
        /*------------------------ statics  ------------------------------*/

        /**
         * Deep copies given array of bounds
         * @param bounds
         * @return
         */
        protected static Bound[][] copyBounds(Bound[][] bounds) {
                Bound[][] newBounds = new Bound[bounds.length][bounds.length];
                for (int i=0;i<bounds.length;i++) {
                        for (int j=0;j<bounds[i].length;j++) {
                                if (bounds[i][j]!=null) {
                                        newBounds[i][j] = new Bound(bounds[i][j].lower,bounds[i][j].upper);
                                }
                        }
                }
                return newBounds;
        }

        protected static void printBounds(Bound[][] bounds) {
                for (int i=0;i<bounds.length;i++) {
                        for (int j=0;j<bounds[i].length;j++) {
                                if (bounds[i][j]==null) {
                                        System.out.printf("%11s","null");
                                } else {
                                        System.out.print(bounds[i][j]);
                                }
                        }
                        System.out.println();
                }
                System.out.println();
        }

        
}
Revision:	822
Committed:	Wed Jan 14 16:08:19 2009 UTC (15 years, 8 months ago) by duarte
File size:	18606 byte(s)
Log Message:	Now Reconstructer can also take RIGraphs with "unobserved" residues (i.e. with missing nodes). Model pdb files that Reconstructer creates now contain strictly only CA atoms.
Line	User	Rev	File contents
1	duarte	796	package embed;
2
3	duarte	797	import java.util.ArrayList;
4			import java.util.Collections;
5			import java.util.HashMap;
6	duarte	796	import java.util.Random;
7
8			import org.apache.commons.collections15.Transformer;
9
10			import Jama.Matrix;
11
12			import edu.uci.ics.jung.algorithms.shortestpath.DijkstraDistance;
13	duarte	797	import edu.uci.ics.jung.algorithms.shortestpath.DijkstraShortestPath;
14	duarte	796	import edu.uci.ics.jung.graph.SparseGraph;
15			import edu.uci.ics.jung.graph.util.EdgeType;
16			import proteinstructure.AAinfo;
17
18			/**
19			* Implementation of the bounds smoothing part of the EMBED algorithm of Crippen and Havel
20			* Given a sparse set of distance ranges between a set of atoms it finds distance bounds for
21			* all pairs of atoms, using the triangle inequality.
22			*
23			* Taken from "Distance Geometry: Theory, Algorithms, and Chemical Applications" (section 3.1) by T.F. Havel,
24			* in Encyclopedia of Computational Chemistry (Wiley, New York, 1998).
25	duarte	804	* See also:
26			* "Distance Geometry and Molecular Conformation" (Chapter 5) by G.M. Crippen and T.F. Havel (Wiley)
27			* "Sampling and efficiency of metric matrix distance geometry: A novel partial
28			* metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
29	duarte	796	*
30			* @author duarte
31			*
32			*/
33			public class BoundsSmoother {
34
35	duarte	799	private static final double HARD_SPHERES_BOUND = AAinfo.DIST_MIN_CA ;
36	duarte	800
37			private static final boolean DEBUG = false;
38	duarte	802	private static final long DEBUG_SEED = 123456;
39
40	duarte	804	private static final int NUM_ROOTS_PARTIAL_METRIZATION = 4; // we choose 4 as in Kuszewski et al.
41	duarte	796
42			/----------------- helper classes and transformers -----------------/
43
44	duarte	797	Transformer<SimpleEdge, Number> WeightTransformer = new Transformer<SimpleEdge, Number>() {
45			public Number transform(SimpleEdge input) {
46	duarte	796	return input.weight;
47			}
48			};
49
50			private class SimpleEdge {
51			public double weight;
52			public SimpleEdge(double weight) {
53			this.weight = weight;
54			}
55			}
56	duarte	797
57			private class BoundsDigraphNode {
58			public static final boolean LEFT = true;
59			public static final boolean RIGHT = false;
60			private boolean side; // true: left, false: right
61			private int resSerial;
62			public BoundsDigraphNode(int resSerial, boolean side) {
63			this.resSerial = resSerial;
64			this.side = side;
65			}
66			public boolean isRight() {
67			return !side;
68			}
69			public boolean isLeft() {
70			return side;
71			}
72			public int getSerial() {
73			return resSerial;
74			}
75			public boolean equals(Object other) {
76			if (! (other instanceof BoundsDigraphNode)) return false;
77			BoundsDigraphNode otherNode = (BoundsDigraphNode)other;
78			if (otherNode.resSerial==this.resSerial && otherNode.side == this.side) {
79			return true;
80			} else {
81			return false;
82			}
83			}
84			public String toString() {
85			return resSerial+(side?"L":"R");
86			}
87			}
88	duarte	796
89			/---------------------- member variables ----------------------------/
90
91			private int conformationSize;
92	duarte	797	private HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>> nodesBoundsDigraph; // map of serial/side to nodes in the bounds digraph
93			private double lmax; // maximum of the lower bounds: offset value for the boundsDigraph (not to have negative weights so that we can use Dijkstra's algo)
94	duarte	796
95	duarte	804	private Bound[][] bounds; // the bounds (half-)matrix with lower/upper bounds for all pairs (or with nulls for pairs without assigned bounds yet)
96
97	duarte	802	private Random rand; // the random generator for sampleBounds and metrize
98
99	duarte	796	/------------------------ constructors ------------------------------/
100
101			/**
102	duarte	822	* Constructs a new BoundsSmoother object given a matrix of Bounds.
103	duarte	796	* @param graph
104			*/
105	duarte	821	public BoundsSmoother(Bound[][] bounds) {
106			this.bounds = bounds;
107			this.conformationSize = bounds.length;
108	duarte	796	}
109
110			/----------------------- public methods ----------------------------/
111
112			/**
113			* Computes bounds for all pairs, based on the set of sparse distance ranges
114	duarte	804	* The returned array is a new array not the reference to the internal bounds array.
115	duarte	822	* @return a 2-dimensional array with the lower bounds for all pairs of residues, the
116			* indices of the array are guaranteed to be in the same order as the residue serials.
117	duarte	796	*/
118			public Bound[][] getBoundsAllPairs() {
119	duarte	804	computeTriangleInequality();
120			return copyBounds(bounds); // we return a copy of the internal bounds array so that we can keep modifying it without side-effects to the returned reference
121	duarte	796	}
122	duarte	798
123	duarte	796	/**
124	duarte	821	* Gets a random sample from the internal bounds array of all pairs of distance ranges
125			* @return a symmetric metric matrix (both sides filled)
126			* @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
127	duarte	796	*/
128	duarte	804	public Matrix sampleBounds() {
129	duarte	821	return sampleBounds(true);
130	duarte	796	}
131	duarte	821
132	duarte	803	/**
133	duarte	804	* Performs partial metrization for the internal bounds array.
134			* The internal bounds array is updated with the new bounds after metrization.
135			* The idea is that metrization doesn't need to be done for all atoms but only
136			* for a handful of them (called roots). This results in a much faster algorithm having
137			* almost the same sampling properties as full metrization.
138			* See "Sampling and efficiency of metric matrix distance geometry: A novel partial
139			* metrization algorithm", Kuszewski J, Nilges M, Bruenger AT, 1992, Journal of Biomolecular NMR
140	duarte	803	* @return
141			*/
142	duarte	804	public Matrix metrize() {
143	duarte	803
144	duarte	821	initSeed();
145
146	duarte	802	ArrayList<Integer> roots = new ArrayList<Integer>();
147			for (int count=1;count<=NUM_ROOTS_PARTIAL_METRIZATION;count++) {
148			int root = rand.nextInt(conformationSize);
149	duarte	804	if (roots.contains(root)) {
150			root = rand.nextInt(conformationSize);
151			}
152			if (roots.contains(root)) {
153			System.err.println("Warning: repeated root atom while doing metrization: "+root);
154			}
155	duarte	803	if (DEBUG) System.out.println("Picked root: "+root);
156	duarte	802	roots.add(root);
157	duarte	804	sampleBoundForRoot(root); // this alters directly the input bounds array
158			computeTriangleInequality();
159	duarte	802	}
160
161	duarte	803	// finally pick a value at random for all the other bounds
162	duarte	821	return sampleBounds(false);
163	duarte	802	}
164
165	duarte	821	/----------------------- private methods ---------------------------/
166
167	duarte	796	/**
168	duarte	821	* Initialises (or resets the random seed).
169			* If DEBUG flag is true the random seed will be a fixed value DEBUG_SEED
170	duarte	796	*/
171	duarte	821	private void initSeed() {
172			if (DEBUG) {
173			rand = new Random(DEBUG_SEED);
174			} else {
175			rand = new Random();
176			}
177	duarte	796	}
178
179	duarte	804	/**
180	duarte	821	* Gets a random sample from the internal bounds array of all pairs distance ranges
181			* @param initSeed if true the random seed is reinitialised, if false the existing one will be used
182			* @return a symmetric metric matrix (both sides filled)
183			* @throws NullPointerException if the internal bounds array doesn't contain bounds for all pairs
184			*/
185			private Matrix sampleBounds(boolean initSeed) {
186			if (initSeed) {
187			initSeed();
188			}
189			double[][] matrix = new double[conformationSize][conformationSize];
190			for (int i=0;i<conformationSize;i++) {
191			for (int j=0;j<conformationSize;j++) {
192			if (j>i) {
193			matrix[i][j]= bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
194			} else if (j<i) {
195			matrix[i][j]=matrix[j][i];
196			}
197			}
198			}
199			return new Matrix(matrix);
200			}
201
202			/**
203	duarte	804	* For given root atom samples a value from the distance ranges of the root
204			* to all of its neighbours (updating the internal bounds array with the new sampled bounds)
205			* @param root
206			*/
207			private void sampleBoundForRoot(int root) {
208	duarte	803	for (int neighb=0;neighb<conformationSize;neighb++) {
209			if (neighb==root) continue; // avoid the diagonal (which contains a null Bound)
210	duarte	802	int i = root;
211	duarte	803	int j = neighb;
212			if (neighb<root) {
213			i = neighb;
214	duarte	802	j = root;
215			}
216			double sampledValue = bounds[i][j].lower+rand.nextDouble()*(bounds[i][j].upper-bounds[i][j].lower);
217	duarte	803	bounds[i][j].lower = sampledValue;
218			bounds[i][j].upper = sampledValue;
219			if (DEBUG) System.out.print(bounds[i][j]);
220	duarte	802	}
221	duarte	803	if (DEBUG) System.out.println("\n");
222	duarte	802	}
223
224	duarte	796	/**
225	duarte	804	* Computes bounds for all pairs through triangle inequalities from the bounds member variable
226			* containing a set of lower/upper bounds (sparse or full)
227			* The bounds array is modified to contain the new bounds.
228			* This is guaranteed to run in o(nmlog(n)), thus if the input is sparse then it's pretty fast: o(n2log(n)) but if
229			* the input is the full set of bounds then we have o(n3log(n)).
230	duarte	798	*/
231	duarte	804	private void computeTriangleInequality() {
232	duarte	803	double MARGIN = 0.0001; // for comparing doubles we need some tolerance value
233	duarte	804
234			double[][] lowerBounds = getLowerBoundsAllPairs();
235			double[][] upperBounds = getUpperBoundsAllPairs();
236	duarte	802	for (int i=0;i<conformationSize;i++) {
237			for (int j=i+1;j<conformationSize;j++) {
238	duarte	798	double upperBound = upperBounds[i][j];
239			double lowerBound = lowerBounds[i][j];
240	duarte	804
241			if (bounds[i][j]!=null) {
242			if (lowerBound>upperBound+MARGIN) {
243			//System.err.println("old: "+sparseBounds[i][j]+" new: "+new Bound(lowerBound,upperBound));
244
245			// During metrization sometimes a new upper bound is found that is below the new lower bound
246			// (actually in these cases the new lower bound coincides with the old one i.e. nothing new was
247			// found through triangle inequality for the lower bound).
248			// For some reason it doesn't happen the other way around: a new lower bound found that is
249			// above the new (coinciding with old) upper bound. I suppose this is because the triangle inequality
250			// "is a lot more effective at reducing the upper bounds than increasing the lower bounds" (quoting Havel)
251			// To correct this we set both lower and upper to the newly found upper, i.e. we assume that
252			// the new upper bound is better because is in accordance to the triangle inequality
253			lowerBound=upperBound;
254
255			//if (upperBound<sparseBounds[i][j].upper-MARGIN)
256			// System.err.printf("new upper bound (%4.1f) for pair %3d %3d is smaller than old upper bound (%4.1f)\n",upperBound,i,j,sparseBounds[i][j].upper);
257			//if (lowerBound>sparseBounds[i][j].lower+MARGIN)
258			// System.err.printf("new lower bound (%4.1f) for pair %3d %3d is bigger than old lower bound (%4.1f)\n",lowerBound,i,j,sparseBounds[i][j].lower);
259			}
260
261			bounds[i][j].lower = lowerBound;
262			bounds[i][j].upper = upperBound;
263			} else {
264			bounds[i][j] = new Bound(lowerBound,upperBound);
265	duarte	803	}
266	duarte	802	// sanity check: lower bounds can't be bigger than upper bounds!
267	duarte	803	if (lowerBound>upperBound+MARGIN) {
268	duarte	802	System.err.printf("Warning: lower bound (%4.1f) for pair "+i+" "+j+" is bigger than upper bound (%4.1f)\n",lowerBound,upperBound);
269	duarte	798	}
270			}
271			}
272			}
273
274			/**
275	duarte	804	* Computes upper bounds for all pairs from a set of upper bounds (sparse or full) based
276	duarte	796	* on the triangle inequality.
277			* The computation is actually just an all pairs shortest path using Dijkstra's
278			* shortest path algorithm (as the set of distance coming from contact maps is
279			* very sparse this algorithm should be more efficient than Floyd's: Dijkstra's is
280			* o(nm logn) and Floyd's is o(n3))
281	duarte	822	* @return a 2-dimensional array with the lower bounds for all pairs of residues, the
282			* indices of the array are guaranteed to be in the same order as the residue serials.
283	duarte	796	*/
284	duarte	804	private double[][] getUpperBoundsAllPairs() {
285	duarte	796	double[][] upperBoundsMatrix = new double[conformationSize][conformationSize];
286	duarte	804	SparseGraph<Integer,SimpleEdge> upperBoundGraph = convertBoundsMatrixToUpperBoundGraph();
287	duarte	796	DijkstraDistance<Integer, SimpleEdge> dd = new DijkstraDistance<Integer, SimpleEdge>(upperBoundGraph,WeightTransformer);
288	duarte	802
289			for (int i=0;i<conformationSize;i++) {
290			for (int j=i+1;j<conformationSize;j++) {
291			upperBoundsMatrix[i][j] = dd.getDistance(i, j).doubleValue();
292	duarte	796	}
293			}
294			return upperBoundsMatrix;
295			}
296
297			/**
298	duarte	804	* Computes lower bounds for all pairs given a set of upper/lower bounds (sparse or full)
299			* based on the triangle inequality.
300	duarte	796	*
301	duarte	797	* NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative
302			* weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Because of this the boundsDigraph result
303	duarte	804	* of calling {@link #convertBoundsMatrixToBoundsDigraph()}} have values offset with the maximum lower bound (lmax).
304	duarte	797	* Thus after computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
305			*
306	duarte	822	* @return a 2-dimensional array with the lower bounds for all pairs of residues, the
307			* indices of the array are guaranteed to be in the same order as the residue serials.
308	duarte	804	* @see {@link #convertBoundsMatrixToBoundsDigraph()} and {@link #getUpperBoundsAllPairs()}
309	duarte	796	*/
310	duarte	804	private double[][] getLowerBoundsAllPairs() {
311	duarte	797	double[][] lowerBoundsMatrix = new double[conformationSize][conformationSize];
312			// this is the bounds digraph as described by Crippen and Havel
313	duarte	804	SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = convertBoundsMatrixToBoundsDigraph();
314	duarte	797	DijkstraShortestPath<BoundsDigraphNode, SimpleEdge> dd = new DijkstraShortestPath<BoundsDigraphNode, SimpleEdge>(boundsDigraph,WeightTransformer);
315	duarte	802
316			for (int i=0;i<conformationSize;i++) {
317			for (int j=i+1;j<conformationSize;j++) {
318			int hops = dd.getPath(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i), nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)).size();
319			double lower = Math.abs(
320	duarte	797	(dd.getDistance(nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
321	duarte	802	nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j)
322			).doubleValue())
323	duarte	797	- (hopslmax)); // the lower limit for the triangle inequality is: Math.abs(shortestpath-(hopslmax))
324	duarte	802	lowerBoundsMatrix[i][j] = Math.max(lower, HARD_SPHERES_BOUND); // we only set the new lower bound to the one found if is above the HARD_SPHERES_BOUND
325	duarte	796	}
326			}
327	duarte	797	return lowerBoundsMatrix;
328	duarte	796
329			}
330
331			/**
332	duarte	802	* Converts the bounds matrix to a graph with only the upper bounds: nodes indices of bounds
333			* matrix, edges upper bounds (in SimpleEdge objects containing the upper bound value
334	duarte	796	* in their weight field).
335	duarte	802	* @param bounds
336	duarte	796	* @return
337			*/
338	duarte	804	private SparseGraph<Integer,SimpleEdge> convertBoundsMatrixToUpperBoundGraph() {
339	duarte	796	SparseGraph<Integer,SimpleEdge> upperBoundGraph = new SparseGraph<Integer, SimpleEdge>();
340	duarte	802	for (int i=0;i<conformationSize;i++) {
341			for (int j=0;j<conformationSize;j++) {
342			if (bounds[i][j]!=null) {
343			upperBoundGraph.addEdge(new SimpleEdge(bounds[i][j].upper), i, j, EdgeType.UNDIRECTED);
344			}
345			}
346	duarte	796	}
347			return upperBoundGraph;
348			}
349	duarte	797
350	duarte	796	/**
351	duarte	797	* Constructs a bounds digraph (as described by Crippen and Havel) to compute the triangle inequality limits
352			* for the lower bounds.
353			* The graph is composed by 2 subgraphs (we call them left and right) each of them containing the set of all atoms.
354			* Within the subgraphs there is an undirected edge between atoms i,j with weight the upper bounds for i,j
355			* Between the subgraphs there is a directed edge from left to right between atoms i(left) to j(right)
356			* with weight the negative of the lower bound i,j
357			* NOTE that because we use Dijkstra's algorithm for the computation of the shortest paths, we can't use negative
358			* weights (see http://en.wikipedia.org/wiki/Dijkstra%27s_algorithm). Thus we offset the values here to the maximum lower bound.
359			* After computing the shortest paths we have to revert back that offset by counting the number of hops the shortest path has.
360			* @return
361			*/
362	duarte	804	private SparseGraph<BoundsDigraphNode,SimpleEdge> convertBoundsMatrixToBoundsDigraph() {
363	duarte	797	// to do the offset thing (see docs above) we need to know first of all the max lower bound
364			ArrayList<Double> lowerBounds = new ArrayList<Double>();
365	duarte	802	for (int i=0;i<conformationSize;i++) {
366			for (int j=0;j<conformationSize;j++) {
367			if (bounds[i][j]!=null) {
368			lowerBounds.add(bounds[i][j].lower);
369			}
370			}
371	duarte	797	}
372			lmax = Collections.max(lowerBounds); // this is the offset value
373
374			SparseGraph<BoundsDigraphNode,SimpleEdge> boundsDigraph = new SparseGraph<BoundsDigraphNode, SimpleEdge>();
375			// we have to store all nodes in a HashMap, so we can retrieve them by residue serial and side after
376			nodesBoundsDigraph = new HashMap<Boolean, HashMap<Integer,BoundsDigraphNode>>();
377			nodesBoundsDigraph.put(BoundsDigraphNode.LEFT , new HashMap<Integer, BoundsDigraphNode>());
378			nodesBoundsDigraph.put(BoundsDigraphNode.RIGHT, new HashMap<Integer, BoundsDigraphNode>());
379			// first we create the nodes and store them into the HashMap
380	duarte	802	for (int i=0;i<conformationSize;i++) {
381	duarte	797	BoundsDigraphNode leftNode = new BoundsDigraphNode(i, BoundsDigraphNode.LEFT);
382			BoundsDigraphNode rightNode = new BoundsDigraphNode(i, BoundsDigraphNode.RIGHT);
383			boundsDigraph.addVertex(leftNode);
384			boundsDigraph.addVertex(rightNode);
385			nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).put(i,leftNode);
386			nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).put(i,rightNode);
387			}
388
389	duarte	802	for (int i=0;i<conformationSize;i++) {
390			for (int j=0;j<conformationSize;j++) {
391			if (bounds[i][j]!=null) {
392			// first we add the upper bounds as undirected edges to the 2 subgraphs (left and right)
393			boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper),
394			nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
395			nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(j),
396			EdgeType.UNDIRECTED);
397			boundsDigraph.addEdge(new SimpleEdge(lmax+bounds[i][j].upper),
398			nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(i),
399			nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j),
400			EdgeType.UNDIRECTED);
401			// then we add the negative of the lower bounds as directed edges connecting nodes of subgraph left to subgraph right
402			boundsDigraph.addEdge(new SimpleEdge(lmax-bounds[i][j].lower),
403			nodesBoundsDigraph.get(BoundsDigraphNode.LEFT).get(i),
404			nodesBoundsDigraph.get(BoundsDigraphNode.RIGHT).get(j),
405			EdgeType.DIRECTED);
406			}
407			}
408	duarte	797	}
409			return boundsDigraph;
410			}
411
412	duarte	821	protected void printBounds() {
413	duarte	804	printBounds(this.bounds);
414			}
415	duarte	802
416	duarte	804	/------------------------ statics ------------------------------/
417
418	duarte	802	/**
419			* Deep copies given array of bounds
420			* @param bounds
421			* @return
422			*/
423	duarte	821	protected static Bound[][] copyBounds(Bound[][] bounds) {
424	duarte	802	Bound[][] newBounds = new Bound[bounds.length][bounds.length];
425			for (int i=0;i<bounds.length;i++) {
426			for (int j=0;j<bounds[i].length;j++) {
427			if (bounds[i][j]!=null) {
428			newBounds[i][j] = new Bound(bounds[i][j].lower,bounds[i][j].upper);
429			}
430	duarte	799	}
431			}
432	duarte	802	return newBounds;
433	duarte	796	}
434	duarte	804
435	duarte	821	protected static void printBounds(Bound[][] bounds) {
436	duarte	802	for (int i=0;i<bounds.length;i++) {
437			for (int j=0;j<bounds[i].length;j++) {
438			if (bounds[i][j]==null) {
439			System.out.printf("%11s","null");
440			} else {
441			System.out.print(bounds[i][j]);
442			}
443			}
444			System.out.println();
445			}
446			System.out.println();
447			}
448
449	duarte	796
450			}