ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Graph.java
Revision: 159
Committed: Thu May 24 09:37:25 2007 UTC (17 years, 4 months ago) by duarte
File size: 15874 byte(s)
Log Message:
Moved ContactMap and EdgeState out of this repository. 
Extra functionality of ContactMap now in Graph:
- new member variables fullLength, obsLength, numContacts. New method getIntMatrix
- removed getCM method
Line User Rev File contents
1 duarte 123 package proteinstructure;
2 duarte 134 import java.io.BufferedReader;
3     import java.io.File;
4     import java.io.FileNotFoundException;
5 duarte 123 import java.io.FileOutputStream;
6 duarte 134 import java.io.FileReader;
7 duarte 123 import java.io.PrintStream;
8     import java.io.IOException;
9 duarte 135 import java.sql.ResultSet;
10     import java.sql.SQLException;
11     import java.sql.Statement;
12 duarte 123 import java.util.ArrayList;
13 duarte 159 import java.util.Collections;
14 duarte 129 import java.util.TreeMap;
15 duarte 144 import java.util.regex.Matcher;
16     import java.util.regex.Pattern;
17 duarte 135 import tools.MySQLConnection;
18 duarte 123
19    
20     public class Graph {
21    
22 duarte 135 public final static String MYSQLSERVER="white";
23     public final static String MYSQLUSER=getUserName();
24     public final static String MYSQLPWD="nieve";
25 duarte 144
26     public final static String GRAPHFILEFORMATVERSION = "1.0";
27 duarte 135
28 duarte 123 ArrayList<Contact> contacts;
29 duarte 135 // nodes is a TreeMap of residue serials to residue types (3 letter code)
30 duarte 129 TreeMap<Integer,String> nodes;
31 duarte 159 public String sequence; // the full sequence (with unobserved residues and non-standard aas ='X')
32 duarte 143 public String accode;
33     public String chain;
34 duarte 159 public String chaincode="";
35 duarte 146 public double cutoff;
36     public String ct;
37 duarte 135 boolean directed=false;
38 duarte 123
39 duarte 159 // fullLength is length of full sequence or:
40     // -if sequence not provided (when reading from db): length of everything except possible unobserved residues at end of chain
41     // -if sequence and nodes not provided (when reading from file and sequence field missing): length except possible unobserved residues at end of chain and possible nodes without contacts at end of chain
42     public int fullLength;
43     public int obsLength; // length without unobserved, non standard aas
44    
45     public int numContacts;
46    
47 duarte 135 // these 2 fields only used when reading from db
48     int graphid=0;
49     int sm_id=0;
50    
/**
 * Builds a Graph directly from in-memory data: a list of contacts and a map of
 * nodes (residue serials to residue types), together with the descriptive
 * attributes of the graph.
 * The lengths are derived from the arguments: fullLength from the sequence,
 * obsLength from the number of nodes and numContacts from the number of contacts.
 * The graph is flagged as directed when the contact type contains "/".
 * @param contacts the contacts (edges) of the graph
 * @param nodes residue serials mapped to residue types
 * @param sequence the full sequence; must not be null since its length is taken
 * @param cutoff the distance cutoff
 * @param ct the contact type
 * @param accode the accession code
 * @param chain the internal chain identifier
 * @param chaincode the pdb chain code
 */
public Graph (ArrayList<Contact> contacts, TreeMap<Integer,String> nodes, String sequence, double cutoff,String ct, String accode, String chain, String chaincode) {
    // graph data
    this.contacts = contacts;
    this.nodes = nodes;
    this.sequence = sequence;
    // descriptive attributes
    this.accode = accode;
    this.chain = chain;
    this.chaincode = chaincode;
    this.ct = ct;
    this.cutoff = cutoff;
    // derived sizes
    this.fullLength = sequence.length();
    this.obsLength = nodes.size();
    this.numContacts = contacts.size();
    // a "/" in the contact type marks a directed graph
    this.directed = ct.contains("/");
}
78 duarte 135
/**
 * Constructs Graph object from graph db, given the dbname, accode, chaincode (classic pdb chain code), ct and cutoff.
 * The graph id is looked up with getgraphid and contacts and nodes are then loaded with read_graph_from_db.
 * The sequence is left empty because the graph db does not hold the full sequence, so
 * fullLength is taken from the highest residue serial present in the nodes (0 if there are no nodes).
 * @param dbname name of the graph database to connect to
 * @param accode the accession code
 * @param chaincode the classic pdb chain code (the string "NULL" selects a database NULL)
 * @param cutoff the distance cutoff
 * @param ct the contact type
 * @throws GraphIdNotFoundError if getgraphid can't find a unique graph id for the given parameters
 */
public Graph(String dbname, String accode, String chaincode, double cutoff, String ct) throws GraphIdNotFoundError{
    this.cutoff=cutoff;
    this.accode=accode;
    // previously the chaincode parameter was only used for the query and the field stayed empty
    this.chaincode=chaincode;
    this.ct=ct;
    // we set the sequence to empty when we read from graph db. We don't have the full sequence in graph db
    this.sequence="";
    //TODO graphs in db are never directed, so this doesn't really apply here. Must solve all this!
    if (ct.contains("/")){
        directed=true;
    }
    MySQLConnection conn = new MySQLConnection(MYSQLSERVER,MYSQLUSER,MYSQLPWD,dbname);
    try {
        getgraphid(conn, chaincode); // initialises graphid, sm_id and chain
        read_graph_from_db(conn); // gets contacts and nodes
    } finally {
        // close the connection also when the graph id lookup fails
        conn.close();
    }
    this.obsLength=nodes.size();
    if (!sequence.equals("")){
        this.fullLength=sequence.length();
    } else {
        // if nodes TreeMap has correct residue numbering then this should get the right full length,
        // we will only miss: gaps (unobserved residues) at the end of the sequence. Those we can't know unless full sequence is given
        // nodes is a sorted map so its last key is the maximum serial; an empty map gives length 0 instead of an exception
        this.fullLength = nodes.isEmpty() ? 0 : nodes.lastKey();
    }
    this.numContacts=contacts.size();
}
112 duarte 152
113     /**
114     * Constructs Graph object from graph db, given the graphid
115     * @param dbname
116     * @param graphid
117     */
118     public Graph(String dbname,int graphid) throws GraphIdNotFoundError{
119     this.graphid=graphid;
120     // we set the sequence to empty when we read from graph db. We don't have the full sequence in graph db
121     // when we pass the sequence in getCM to the ContactMap constructor we want to have either a full sequence (with unobserveds) or a blank in case we don't have the info
122     this.sequence="";
123     MySQLConnection conn = new MySQLConnection(MYSQLSERVER,MYSQLUSER,MYSQLPWD,dbname);
124     read_graph_from_db(conn); // gets contacts, nodes and sequence
125     get_db_graph_info(conn); // gets accode, chaincode, chain, ct and cutoff from db (from graph_id)
126     conn.close();
127     //TODO graphs in db are never directed, so this doesn't really apply here. Must solve all this!
128     if (ct.contains("/")){
129     directed=true;
130     }
131 duarte 159 this.obsLength=nodes.size();
132     if (!sequence.equals("")){
133     this.fullLength=sequence.length();
134     } else {
135     // if nodes TreeMap has correct residue numbering then this should get the right full length,
136     // we will only miss: gaps (unobserved residues) at the end of the sequence. Those we can't know unless full sequence is given
137     this.fullLength=Collections.max(nodes.keySet());
138     }
139     this.numContacts=contacts.size();
140 duarte 152 }
141 duarte 129
/**
 * Constructs Graph object by reading a file with contacts.
 * Header fields (sequence, accession code, chain codes, contact type, cutoff) are taken from the
 * file when present; otherwise they stay at empty string / 0.0.
 * If the contacts file doesn't have the sequence then the graph object will have an empty
 * sequence and an empty nodes map. In that case fullLength is the highest residue serial
 * appearing in any contact (0 if there are no contacts) and obsLength is set to the same value.
 * @param contactsfile path of the contacts file to read
 * @throws IOException if the file can't be read
 * @throws FileNotFoundException if the file doesn't exist
 */
public Graph (String contactsfile) throws IOException, FileNotFoundException{
    // we set the sequence to blank when we read from file as we don't have the full sequence
    // if sequence is present in contactsfile then is read from there
    this.sequence="";
    this.ct="";
    this.cutoff=0.0;
    // we initialise accode, chain and chaincode to empty strings in case the file doesn't specify them
    this.accode="";
    this.chain="";
    this.chaincode="";
    read_graph_from_file(contactsfile); // initialises contacts and nodes (nodes stay empty if no sequence is given)
    // ct is only known once the file header has been parsed, so the directed flag
    // must be evaluated after reading (before reading ct is always empty)
    if (ct.contains("/")){
        directed=true;
    }
    if (!sequence.equals("")){
        this.fullLength=sequence.length();
        this.obsLength=nodes.size();
    } else {
        // if contacts have correct residue numbering then this should get the right full length up to the maximum node that makes a contact,
        // we will miss: nodes without contacts at the end of sequence and gaps (unobserved residues) at the end of the sequence.
        // We don't know more without nodes and sequence
        // We scan all contacts instead of relying on the ordering of Contact, so that the
        // result really is the largest serial and an empty contact list gives 0 instead of an exception
        int maxSerial = 0;
        for (Contact cont:contacts){
            maxSerial = Math.max(maxSerial, Math.max(cont.i, cont.j));
        }
        this.fullLength=maxSerial;
        // in this case we have no node information so we set obsLength=fullLength
        this.obsLength=fullLength;
    }
    this.numContacts=contacts.size();
}
178    
179 duarte 135 //TODO implement (from python) write_graph_to_db, do we really need it here??
180    
181     /** get user name from operating system (for use as database username) */
182     private static String getUserName() {
183     String user = null;
184     user = System.getProperty("user.name");
185     if(user == null) {
186     System.err.println("Could not get user name from operating system. Exiting");
187     System.exit(1);
188     }
189     return user;
190     }
191 duarte 144
192     public void read_graph_from_file (String contactsfile) throws FileNotFoundException, IOException {
193 duarte 134 contacts = new ArrayList<Contact>();
194     System.out.println("Reading contacts from file "+contactsfile);
195     BufferedReader fcont = new BufferedReader(new FileReader(new File(contactsfile)));
196     String line;
197     while ((line = fcont.readLine() ) != null ) {
198 duarte 144 Pattern p = Pattern.compile("^#");
199     Matcher m = p.matcher(line);
200     if (m.find()){
201     // Pattern ps = Pattern.compile("^#VER: (\\d\\.\\d)");
202     // Matcher ms = ps.matcher(line);
203     // if (ms.find()){
204     // if (!ms.group(1).equals(GRAPHFILEFORMATVERSION)){
205     // throw new GraphFileFormatError("The graph file "+contactsfile+" can't be read, wrong file format version");
206     // }
207     // }
208     Pattern ps = Pattern.compile("^#SEQUENCE:\\s*(\\w+)$");
209     Matcher ms = ps.matcher(line);
210     if (ms.find()){
211     sequence=ms.group(1);
212     }
213     ps = Pattern.compile("^#PDB:\\s*(\\w+)");
214     ms = ps.matcher(line);
215     if (ms.find()){
216     accode=ms.group(1);
217     }
218     ps = Pattern.compile("^#PDB CHAIN CODE:\\s*(\\w)");
219     ms = ps.matcher(line);
220     if (ms.find()){
221     chaincode=ms.group(1);
222     }
223     ps = Pattern.compile("^#CHAIN:\\s*(\\w)");
224     ms = ps.matcher(line);
225     if (ms.find()){
226     chain=ms.group(1);
227     }
228     ps = Pattern.compile("^#CT:\\s*([a-zA-Z/]+)");
229     ms = ps.matcher(line);
230     if (ms.find()){
231     ct=ms.group(1);
232     }
233     ps = Pattern.compile("^#CUTOFF:\\s*(\\d+\\.\\d+)");
234     ms = ps.matcher(line);
235     if (ms.find()){
236     cutoff=Double.parseDouble(ms.group(1));
237     }
238     }
239     else{
240     int i = Integer.parseInt(line.split("\\s+")[0]);
241     int j = Integer.parseInt(line.split("\\s+")[1]);
242     contacts.add(new Contact(i,j));
243     }
244 duarte 134 }
245     fcont.close();
246 duarte 152 // if sequence was given we take nodes from it
247 duarte 144 nodes = new TreeMap<Integer, String>();
248     for (int i=0;i<sequence.length();i++){
249     String letter = String.valueOf(sequence.charAt(i));
250 duarte 152 nodes.put(i+1, AA.oneletter2threeletter(letter));
251 duarte 144 }
252    
253 duarte 134 }
254    
255 duarte 135 /**
256     * Reads contacts and nodes from db.
257     * The db must be a graph db following our standard format, i.e. must have tables:
258     * chain_graph, single_model_graph, single_model_node, single_model_edge
259     * We don't care here about the origin of the data (msdsd, pdbase, predicted) for the generation of the graph as long as it follows our data format
260     * We read both edges and nodes from single_model_edge and single_model_node.
261 duarte 152 * The sequence is set to blank, as we can't get the full sequence from graph db
262 duarte 135 * @param conn
263     */
264     public void read_graph_from_db(MySQLConnection conn){
265     contacts = new ArrayList<Contact>();
266     nodes = new TreeMap<Integer, String>();
267     try {
268 duarte 142 // we read only half of the matrix (contacts in one direction only) so that we have the same type of contacts as when creating Graph from Pdb object
269 duarte 135 String sql="SELECT i_num,j_num FROM single_model_edge WHERE graph_id="+graphid+" AND j_num>i_num ORDER BY i_num,j_num ";
270     Statement stmt = conn.createStatement();
271     ResultSet rsst = stmt.executeQuery(sql);
272     while (rsst.next()) {
273     int i=rsst.getInt(1);
274     int j=rsst.getInt(2);
275     contacts.add(new Contact(i,j));
276     }
277     rsst.close();
278     stmt.close();
279     sql="SELECT num,res FROM single_model_node WHERE graph_id="+graphid+" ORDER BY num ";
280     stmt = conn.createStatement();
281     rsst = stmt.executeQuery(sql);
282     while (rsst.next()){
283     int num=rsst.getInt(1);
284     String res=rsst.getString(2);
285     nodes.put(num, AA.oneletter2threeletter(res));
286     }
287     rsst.close();
288     stmt.close();
289     } catch (SQLException e) {
290     e.printStackTrace();
291     }
292    
293     }
294    
295 duarte 141 public void getgraphid (MySQLConnection conn, String chaincode) throws GraphIdNotFoundError{
296     // input is chaincode i.e. pdb chain code
297     // we take chain (internal chain identifier, pchain_code for msdsd and asym_id for pdbase) from pchain_code field in chain_graph
298     // (in the chain_graph table the internal chain identifier is called 'pchain_code')
299 duarte 135 int pgraphid=0;
300 duarte 141 String chainstr="='"+chaincode+"' ";
301     if (chaincode.equals("NULL")){
302     chainstr=" IS NULL ";
303     }
304 duarte 135 try {
305 duarte 141 String sql="SELECT graph_id, pchain_code FROM chain_graph WHERE accession_code='"+accode+"' AND chain_pdb_code"+chainstr+" AND dist="+cutoff;
306 duarte 135 Statement stmt = conn.createStatement();
307     ResultSet rsst = stmt.executeQuery(sql);
308     int check=0;
309     while (rsst.next()) {
310     check++;
311     pgraphid=rsst.getInt(1);
312 duarte 141 chain=rsst.getString(2);
313 duarte 135 }
314     if (check!=1){
315 duarte 141 System.err.println("No pgraph_id match or more than 1 match for accession_code="+accode+", chain_pdb_code="+chaincode+", dist="+cutoff);
316 duarte 135 }
317     rsst.close();
318     stmt.close();
319     // we set the ctstr to the same as ct except in ALL case, where it is BB+SC+BB/SC
320     String ctstr=ct;
321     if (ct.equals("ALL")){
322     ctstr="BB+SC+BB/SC";
323     }
324     sql="SELECT graph_id,single_model_id FROM single_model_graph WHERE pgraph_id="+pgraphid+" AND CT='"+ctstr+"' AND dist="+cutoff+" AND CR='(true)' AND CW=1";
325     stmt = conn.createStatement();
326     rsst = stmt.executeQuery(sql);
327     check=0;
328     while (rsst.next()){
329     check++;
330     graphid=rsst.getInt(1);
331     sm_id=rsst.getInt(2);
332     }
333     if (check!=1){
334     System.err.println("No graph_id match or more than 1 match for pgraph_id="+pgraphid+", CT="+ctstr+" and cutoff="+cutoff);
335     throw new GraphIdNotFoundError("No graph_id match or more than 1 match for pgraph_id="+pgraphid+", CT="+ctstr+" and cutoff="+cutoff);
336     }
337     } catch (SQLException e) {
338     e.printStackTrace();
339     }
340    
341     }
342    
343 duarte 152 public void get_db_graph_info(MySQLConnection conn) throws GraphIdNotFoundError {
344     try {
345     int pgraphid=0;
346     String sql="SELECT pgraph_id,CT,dist FROM single_model_graph WHERE graph_id="+graphid;
347     Statement stmt = conn.createStatement();
348     ResultSet rsst = stmt.executeQuery(sql);
349     int check=0;
350     while (rsst.next()) {
351     check++;
352     pgraphid=rsst.getInt(1);
353     ct=rsst.getString(2);
354     if (ct.equals("BB+SC+BB/SC")) ct="ALL";
355     cutoff=rsst.getDouble(3);
356     }
357     if (check!=1){
358     System.err.println("No pgraph_id match or more than 1 match for graph_id="+graphid);
359     throw new GraphIdNotFoundError("No pgraph_id match or more than 1 match for graph_id="+graphid+" in db"+conn.getDbname());
360     }
361     rsst.close();
362     stmt.close();
363     sql="SELECT accession_code, chain_pdb_code, pchain_code FROM chain_graph WHERE graph_id="+pgraphid;
364     stmt = conn.createStatement();
365     rsst = stmt.executeQuery(sql);
366     check=0;
367     while (rsst.next()){
368     check++;
369     accode=rsst.getString(1);
370     chaincode=rsst.getString(2);
371 duarte 158 // java returns a null if the field is a database null, we want actually the "NULL" string in that case
372     if (chaincode==null) chaincode="NULL";
373 duarte 152 chain=rsst.getString(3);
374     }
375     if (check!=1){
376     System.err.println("No accession_code+chain_pdb_code+pchain_code match or more than 1 match for graph_id="+pgraphid+" in chain_graph table");
377     }
378     rsst.close();
379     stmt.close();
380     } catch (SQLException e) {
381     e.printStackTrace();
382     }
383    
384     }
385    
386 duarte 123 public void write_contacts_to_file (String outfile) throws IOException {
387     PrintStream Out = new PrintStream(new FileOutputStream(outfile));
388     for (Contact pair:contacts){
389     int i_resser=pair.i;
390     int j_resser=pair.j;
391     Out.println(i_resser+"\t"+j_resser);
392     }
393     Out.close();
394     }
395 duarte 144
396     public void write_graph_to_file (String outfile) throws IOException {
397     PrintStream Out = new PrintStream(new FileOutputStream(outfile));
398     Out.println("#VER: "+GRAPHFILEFORMATVERSION);
399     Out.println("#SEQUENCE: "+sequence);
400     Out.println("#PDB: "+accode);
401     Out.println("#PDB CHAIN CODE: "+chaincode);
402     Out.println("#CHAIN: "+chain);
403     Out.println("#CT: "+ct);
404     Out.println("#CUTOFF: "+cutoff);
405     for (Contact pair:contacts){
406     int i_resser=pair.i;
407     int j_resser=pair.j;
408     Out.println(i_resser+"\t"+j_resser);
409     }
410     Out.close();
411     }
412    
413 duarte 159 /**
414     * Returns an int matrix with 1s for contacts and 0s for non contacts, i.e. the contact map
415     * In non-crossed cases this should give us the upper half matrix (contacts are only j>i)
416     * In crossed cases this gives us a full matrix (contacts are both j>i and i>j since they are directed)
417     * @return
418     */
419     public int[][] getIntMatrix(){
420     // this initialises the matrix to 0 (i.e. no contact)
421     int[][] cm = new int[fullLength][fullLength];
422     // we put a 1 for all given contacts
423     for (Contact cont:contacts){
424     int i_resser = cont.i;
425     int j_resser = cont.j;
426     cm[i_resser-1][j_resser-1]=1;
427 duarte 129 }
428     return cm;
429     }
430 duarte 159
431 duarte 123 }