ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/proteinstructure/Graph.java
Revision: 179
Committed: Thu May 31 09:41:41 2007 UTC (17 years, 8 months ago) by duarte
File size: 18901 byte(s)
Log Message:
FIXED BUG: fixed restrictContactsToMax/MinRange. Couldn't delete contacts at the same time as looping in them
NEW CLASSES NodeNbh and EdgeNbh, now used in getNodeNbh and getEdgeNbh
New method getResidueType
Line User Rev File contents
1 duarte 123 package proteinstructure;
2 duarte 134 import java.io.BufferedReader;
3     import java.io.File;
4     import java.io.FileNotFoundException;
5 duarte 123 import java.io.FileOutputStream;
6 duarte 134 import java.io.FileReader;
7 duarte 123 import java.io.PrintStream;
8     import java.io.IOException;
9 duarte 135 import java.sql.ResultSet;
10     import java.sql.SQLException;
11     import java.sql.Statement;
12 duarte 159 import java.util.Collections;
13 duarte 129 import java.util.TreeMap;
14 duarte 144 import java.util.regex.Matcher;
15     import java.util.regex.Pattern;
16 duarte 135 import tools.MySQLConnection;
17 duarte 123
18    
19     public class Graph {
20    
21 duarte 135 public final static String MYSQLSERVER="white";
22     public final static String MYSQLUSER=getUserName();
23     public final static String MYSQLPWD="nieve";
24 duarte 144
25     public final static String GRAPHFILEFORMATVERSION = "1.0";
26 duarte 135
27 duarte 175 ContactList contacts;
28 duarte 135 // nodes is a TreeMap of residue serials to residue types (3 letter code)
29 duarte 129 TreeMap<Integer,String> nodes;
30 duarte 159 public String sequence; // the full sequence (with unobserved residues and non-standard aas ='X')
31 duarte 143 public String accode;
32     public String chain;
33 duarte 159 public String chaincode="";
34 duarte 146 public double cutoff;
35     public String ct;
36 stehr 161 public boolean directed=false;
37 duarte 123
38 duarte 159 // fullLength is length of full sequence or:
39     // -if sequence not provided (when reading from db): length of everything except possible unobserved residues at end of chain
40     // -if sequence and nodes not provided (when reading from file and sequence field missing): length except possible unobserved residues at end of chain and possible nodes without contacts at end of chain
41     public int fullLength;
42     public int obsLength; // length without unobserved, non standard aas
43    
44     public int numContacts;
45    
46 duarte 175 public boolean modified;
47    
48 duarte 135 // these 2 fields only used when reading from db
49     int graphid=0;
50     int sm_id=0;
51    
52 duarte 134 /**
53     * Constructs Graph object by passing ArrayList with contacts and TreeMap with nodes (res serials and types)
54     * Must also pass contact type, cutoff, accession code and chain
55     * @param contacts
56     * @param nodes
57     * @param sequence
58     * @param cutoff
59     * @param ct
60     * @param accode
61     * @param chain
62     */
63 duarte 175 public Graph (ContactList contacts, TreeMap<Integer,String> nodes, String sequence, double cutoff,String ct, String accode, String chain, String chaincode) {
64 duarte 123 this.contacts=contacts;
65     this.cutoff=cutoff;
66 duarte 129 this.nodes=nodes;
67     this.sequence=sequence;
68     this.accode=accode;
69     this.chain=chain;
70 duarte 156 this.chaincode=chaincode;
71 duarte 123 this.ct=ct;
72 duarte 159 this.fullLength=sequence.length();
73     this.obsLength=nodes.size();
74     this.numContacts=contacts.size();
75 duarte 175 this.modified=false;
76 duarte 129 if (ct.contains("/")){
77     directed=true;
78     }
79 duarte 123 }
80 duarte 135
81     /**
82 duarte 141 * Constructs Graph object from graph db, given the dbname, accode, chaincode (classic pdb chain code), ct and cutoff
83 duarte 135 * @param dbname
84     * @param accode
85 duarte 141 * @param chaincode
86 duarte 135 * @param cutoff
87     * @param ct
88     */
89 duarte 141 public Graph(String dbname, String accode, String chaincode, double cutoff, String ct) throws GraphIdNotFoundError{
90 duarte 135 this.cutoff=cutoff;
91     this.accode=accode;
92     this.ct=ct;
93 duarte 142 // we set the sequence to empty when we read from graph db. We don't have the full sequence in graph db
94     // when we pass the sequence in getCM to the ContactMap constructor we want to have either a full sequence (with unobserveds) or a blank in case we don't have the info
95     this.sequence="";
96 duarte 135 //TODO graphs in db are never directed, so this doesn't really apply here. Must solve all this!
97     if (ct.contains("/")){
98     directed=true;
99     }
100     MySQLConnection conn = new MySQLConnection(MYSQLSERVER,MYSQLUSER,MYSQLPWD,dbname);
101 duarte 141 getgraphid(conn, chaincode); // initialises graphid, sm_id and chain
102     read_graph_from_db(conn); // gets contacts, nodes and sequence
103 duarte 135 conn.close();
104 duarte 159 this.obsLength=nodes.size();
105     if (!sequence.equals("")){
106     this.fullLength=sequence.length();
107     } else {
108     // if nodes TreeMap has correct residue numbering then this should get the right full length,
109     // we will only miss: gaps (unobserved residues) at the end of the sequence. Those we can't know unless full sequence is given
110     this.fullLength=Collections.max(nodes.keySet());
111     }
112     this.numContacts=contacts.size();
113 duarte 175 this.modified=false;
114 duarte 135 }
115 duarte 152
116     /**
117     * Constructs Graph object from graph db, given the graphid
118     * @param dbname
119     * @param graphid
120     */
121     public Graph(String dbname,int graphid) throws GraphIdNotFoundError{
122     this.graphid=graphid;
123     // we set the sequence to empty when we read from graph db. We don't have the full sequence in graph db
124     // when we pass the sequence in getCM to the ContactMap constructor we want to have either a full sequence (with unobserveds) or a blank in case we don't have the info
125     this.sequence="";
126     MySQLConnection conn = new MySQLConnection(MYSQLSERVER,MYSQLUSER,MYSQLPWD,dbname);
127     read_graph_from_db(conn); // gets contacts, nodes and sequence
128     get_db_graph_info(conn); // gets accode, chaincode, chain, ct and cutoff from db (from graph_id)
129     conn.close();
130     //TODO graphs in db are never directed, so this doesn't really apply here. Must solve all this!
131     if (ct.contains("/")){
132     directed=true;
133     }
134 duarte 159 this.obsLength=nodes.size();
135     if (!sequence.equals("")){
136     this.fullLength=sequence.length();
137     } else {
138     // if nodes TreeMap has correct residue numbering then this should get the right full length,
139     // we will only miss: gaps (unobserved residues) at the end of the sequence. Those we can't know unless full sequence is given
140     this.fullLength=Collections.max(nodes.keySet());
141     }
142     this.numContacts=contacts.size();
143 duarte 175 this.modified=false;
144 duarte 152 }
145 duarte 129
146 duarte 134 /**
147     * Constructs Graph object by reading a file with contacts
148 duarte 144 * If the contacts file doesn't have the sequence then the graph object won't have sequence or nodes
149     * That means it won't be possible to get a ContactMap from it using getCM because CM needs both sequence and nodes
150 duarte 134 * @param contactsfile
151     * @throws IOException
152     * @throws FileNotFoundException
153     */
154 duarte 144 public Graph (String contactsfile) throws IOException, FileNotFoundException{
155 duarte 142 // we set the sequence to blank when we read from file as we don't have the full sequence
156 duarte 144 // if sequence is present in contactsfile then is read from there
157 duarte 142 this.sequence="";
158 duarte 144 this.ct="";
159     this.cutoff=0.0;
160 duarte 155 // we initialise accode, chain and chaincode to empty strings in case the file doesn't specify then
161     this.accode="";
162     this.chain="";
163     this.chaincode="";
164 duarte 134 if (ct.contains("/")){
165     directed=true;
166     }
167 duarte 159 read_graph_from_file(contactsfile); // initialises contacts, and nodes (only if sequence is given)
168     if (!sequence.equals("")){
169     this.fullLength=sequence.length();
170     this.obsLength=nodes.size();
171     } else {
172     // if contacts have correct residue numbering then this should get the right full length up to the maximum node that makes a contact,
173     // we will miss: nodes without contacts at the end of sequence and gaps (unobserved residues) at the end of the sequence.
174     // We don't know more without nodes and sequence
175 duarte 175 this.fullLength=contacts.getMaxNode();
176 duarte 159 // in this case nodes has not been initialised so we set obsLength=fullLength as we don't have the information
177     this.obsLength=fullLength;
178     }
179     this.numContacts=contacts.size();
180 duarte 175 this.modified=false;
181 duarte 134 }
182    
183 duarte 135 //TODO implement (from python) write_graph_to_db, do we really need it here??
184    
185     /** get user name from operating system (for use as database username) */
186     private static String getUserName() {
187     String user = null;
188     user = System.getProperty("user.name");
189     if(user == null) {
190     System.err.println("Could not get user name from operating system. Exiting");
191     System.exit(1);
192     }
193     return user;
194     }
195 duarte 144
196     public void read_graph_from_file (String contactsfile) throws FileNotFoundException, IOException {
197 duarte 175 contacts = new ContactList();
198 duarte 134 System.out.println("Reading contacts from file "+contactsfile);
199     BufferedReader fcont = new BufferedReader(new FileReader(new File(contactsfile)));
200     String line;
201     while ((line = fcont.readLine() ) != null ) {
202 duarte 144 Pattern p = Pattern.compile("^#");
203     Matcher m = p.matcher(line);
204     if (m.find()){
205     // Pattern ps = Pattern.compile("^#VER: (\\d\\.\\d)");
206     // Matcher ms = ps.matcher(line);
207     // if (ms.find()){
208     // if (!ms.group(1).equals(GRAPHFILEFORMATVERSION)){
209     // throw new GraphFileFormatError("The graph file "+contactsfile+" can't be read, wrong file format version");
210     // }
211     // }
212     Pattern ps = Pattern.compile("^#SEQUENCE:\\s*(\\w+)$");
213     Matcher ms = ps.matcher(line);
214     if (ms.find()){
215     sequence=ms.group(1);
216     }
217     ps = Pattern.compile("^#PDB:\\s*(\\w+)");
218     ms = ps.matcher(line);
219     if (ms.find()){
220     accode=ms.group(1);
221     }
222     ps = Pattern.compile("^#PDB CHAIN CODE:\\s*(\\w)");
223     ms = ps.matcher(line);
224     if (ms.find()){
225     chaincode=ms.group(1);
226     }
227     ps = Pattern.compile("^#CHAIN:\\s*(\\w)");
228     ms = ps.matcher(line);
229     if (ms.find()){
230     chain=ms.group(1);
231     }
232     ps = Pattern.compile("^#CT:\\s*([a-zA-Z/]+)");
233     ms = ps.matcher(line);
234     if (ms.find()){
235     ct=ms.group(1);
236     }
237     ps = Pattern.compile("^#CUTOFF:\\s*(\\d+\\.\\d+)");
238     ms = ps.matcher(line);
239     if (ms.find()){
240     cutoff=Double.parseDouble(ms.group(1));
241     }
242     }
243     else{
244     int i = Integer.parseInt(line.split("\\s+")[0]);
245     int j = Integer.parseInt(line.split("\\s+")[1]);
246     contacts.add(new Contact(i,j));
247     }
248 duarte 134 }
249     fcont.close();
250 duarte 152 // if sequence was given we take nodes from it
251 duarte 144 nodes = new TreeMap<Integer, String>();
252     for (int i=0;i<sequence.length();i++){
253     String letter = String.valueOf(sequence.charAt(i));
254 duarte 152 nodes.put(i+1, AA.oneletter2threeletter(letter));
255 duarte 144 }
256    
257 duarte 134 }
258    
259 duarte 135 /**
260     * Reads contacts and nodes from db.
261     * The db must be a graph db following our standard format, i.e. must have tables:
262     * chain_graph, single_model_graph, single_model_node, single_model_edge
263     * We don't care here about the origin of the data (msdsd, pdbase, predicted) for the generation of the graph as long as it follows our data format
264     * We read both edges and nodes from single_model_edge and single_model_node.
265 duarte 152 * The sequence is set to blank, as we can't get the full sequence from graph db
266 duarte 135 * @param conn
267     */
268     public void read_graph_from_db(MySQLConnection conn){
269 duarte 175 contacts = new ContactList();
270 duarte 135 nodes = new TreeMap<Integer, String>();
271     try {
272 duarte 142 // we read only half of the matrix (contacts in one direction only) so that we have the same type of contacts as when creating Graph from Pdb object
273 duarte 135 String sql="SELECT i_num,j_num FROM single_model_edge WHERE graph_id="+graphid+" AND j_num>i_num ORDER BY i_num,j_num ";
274     Statement stmt = conn.createStatement();
275     ResultSet rsst = stmt.executeQuery(sql);
276     while (rsst.next()) {
277     int i=rsst.getInt(1);
278     int j=rsst.getInt(2);
279     contacts.add(new Contact(i,j));
280     }
281     rsst.close();
282     stmt.close();
283     sql="SELECT num,res FROM single_model_node WHERE graph_id="+graphid+" ORDER BY num ";
284     stmt = conn.createStatement();
285     rsst = stmt.executeQuery(sql);
286     while (rsst.next()){
287     int num=rsst.getInt(1);
288     String res=rsst.getString(2);
289     nodes.put(num, AA.oneletter2threeletter(res));
290     }
291     rsst.close();
292     stmt.close();
293     } catch (SQLException e) {
294     e.printStackTrace();
295     }
296    
297     }
298    
299 duarte 141 public void getgraphid (MySQLConnection conn, String chaincode) throws GraphIdNotFoundError{
300     // input is chaincode i.e. pdb chain code
301     // we take chain (internal chain identifier, pchain_code for msdsd and asym_id for pdbase) from pchain_code field in chain_graph
302     // (in the chain_graph table the internal chain identifier is called 'pchain_code')
303 duarte 135 int pgraphid=0;
304 duarte 141 String chainstr="='"+chaincode+"' ";
305     if (chaincode.equals("NULL")){
306     chainstr=" IS NULL ";
307     }
308 duarte 135 try {
309 duarte 141 String sql="SELECT graph_id, pchain_code FROM chain_graph WHERE accession_code='"+accode+"' AND chain_pdb_code"+chainstr+" AND dist="+cutoff;
310 duarte 135 Statement stmt = conn.createStatement();
311     ResultSet rsst = stmt.executeQuery(sql);
312     int check=0;
313     while (rsst.next()) {
314     check++;
315     pgraphid=rsst.getInt(1);
316 duarte 141 chain=rsst.getString(2);
317 duarte 135 }
318     if (check!=1){
319 duarte 141 System.err.println("No pgraph_id match or more than 1 match for accession_code="+accode+", chain_pdb_code="+chaincode+", dist="+cutoff);
320 duarte 135 }
321     rsst.close();
322     stmt.close();
323     // we set the ctstr to the same as ct except in ALL case, where it is BB+SC+BB/SC
324     String ctstr=ct;
325     if (ct.equals("ALL")){
326     ctstr="BB+SC+BB/SC";
327     }
328     sql="SELECT graph_id,single_model_id FROM single_model_graph WHERE pgraph_id="+pgraphid+" AND CT='"+ctstr+"' AND dist="+cutoff+" AND CR='(true)' AND CW=1";
329     stmt = conn.createStatement();
330     rsst = stmt.executeQuery(sql);
331     check=0;
332     while (rsst.next()){
333     check++;
334     graphid=rsst.getInt(1);
335     sm_id=rsst.getInt(2);
336     }
337     if (check!=1){
338     System.err.println("No graph_id match or more than 1 match for pgraph_id="+pgraphid+", CT="+ctstr+" and cutoff="+cutoff);
339     throw new GraphIdNotFoundError("No graph_id match or more than 1 match for pgraph_id="+pgraphid+", CT="+ctstr+" and cutoff="+cutoff);
340     }
341     } catch (SQLException e) {
342     e.printStackTrace();
343     }
344    
345     }
346    
347 duarte 152 public void get_db_graph_info(MySQLConnection conn) throws GraphIdNotFoundError {
348     try {
349     int pgraphid=0;
350     String sql="SELECT pgraph_id,CT,dist FROM single_model_graph WHERE graph_id="+graphid;
351     Statement stmt = conn.createStatement();
352     ResultSet rsst = stmt.executeQuery(sql);
353     int check=0;
354     while (rsst.next()) {
355     check++;
356     pgraphid=rsst.getInt(1);
357     ct=rsst.getString(2);
358     if (ct.equals("BB+SC+BB/SC")) ct="ALL";
359     cutoff=rsst.getDouble(3);
360     }
361     if (check!=1){
362     System.err.println("No pgraph_id match or more than 1 match for graph_id="+graphid);
363     throw new GraphIdNotFoundError("No pgraph_id match or more than 1 match for graph_id="+graphid+" in db"+conn.getDbname());
364     }
365     rsst.close();
366     stmt.close();
367     sql="SELECT accession_code, chain_pdb_code, pchain_code FROM chain_graph WHERE graph_id="+pgraphid;
368     stmt = conn.createStatement();
369     rsst = stmt.executeQuery(sql);
370     check=0;
371     while (rsst.next()){
372     check++;
373     accode=rsst.getString(1);
374     chaincode=rsst.getString(2);
375 duarte 158 // java returns a null if the field is a database null, we want actually the "NULL" string in that case
376     if (chaincode==null) chaincode="NULL";
377 duarte 152 chain=rsst.getString(3);
378     }
379     if (check!=1){
380     System.err.println("No accession_code+chain_pdb_code+pchain_code match or more than 1 match for graph_id="+pgraphid+" in chain_graph table");
381     }
382     rsst.close();
383     stmt.close();
384     } catch (SQLException e) {
385     e.printStackTrace();
386     }
387    
388     }
389    
390 duarte 123 public void write_contacts_to_file (String outfile) throws IOException {
391     PrintStream Out = new PrintStream(new FileOutputStream(outfile));
392     for (Contact pair:contacts){
393     int i_resser=pair.i;
394     int j_resser=pair.j;
395     Out.println(i_resser+"\t"+j_resser);
396     }
397     Out.close();
398     }
399 duarte 144
400     public void write_graph_to_file (String outfile) throws IOException {
401     PrintStream Out = new PrintStream(new FileOutputStream(outfile));
402     Out.println("#VER: "+GRAPHFILEFORMATVERSION);
403     Out.println("#SEQUENCE: "+sequence);
404     Out.println("#PDB: "+accode);
405     Out.println("#PDB CHAIN CODE: "+chaincode);
406     Out.println("#CHAIN: "+chain);
407     Out.println("#CT: "+ct);
408     Out.println("#CUTOFF: "+cutoff);
409     for (Contact pair:contacts){
410     int i_resser=pair.i;
411     int j_resser=pair.j;
412     Out.println(i_resser+"\t"+j_resser);
413     }
414     Out.close();
415     }
416 duarte 175
417 duarte 159 /**
418 duarte 175 * Gets list of contacts as a new ContactList (deep copied)
419     *
420     */
421     public ContactList getContacts(){
422     ContactList newContacts = new ContactList();
423     for (Contact cont:contacts){
424     newContacts.add(new Contact(cont.i,cont.j));
425     }
426     return newContacts;
427     }
428    
429     /**
430     * Gets TreeMap of nodes, deep copying
431     *
432     */
433     public TreeMap<Integer,String> getNodes(){
434     TreeMap<Integer,String> newNodes = new TreeMap<Integer,String>();
435     for (int resser:nodes.keySet()){
436     newNodes.put(resser, nodes.get(resser));
437     }
438     return newNodes;
439     }
440    
441     /**
442     * Deep copies this Graph object returning new one
443     * @return
444     */
445     public Graph copy(){
446     return new Graph(getContacts(),getNodes(),sequence,cutoff,ct,accode,chain,chaincode);
447     }
448    
449     /**
450 duarte 159 * Returns an int matrix with 1s for contacts and 0s for non contacts, i.e. the contact map
451     * In non-crossed cases this should give us the upper half matrix (contacts are only j>i)
452     * In crossed cases this gives us a full matrix (contacts are both j>i and i>j since they are directed)
453     * @return
454     */
455     public int[][] getIntMatrix(){
456     // this initialises the matrix to 0 (i.e. no contact)
457     int[][] cm = new int[fullLength][fullLength];
458     // we put a 1 for all given contacts
459     for (Contact cont:contacts){
460     int i_resser = cont.i;
461     int j_resser = cont.j;
462     cm[i_resser-1][j_resser-1]=1;
463 duarte 129 }
464     return cm;
465     }
466 duarte 159
467 duarte 165 /**
468 duarte 179 * Gets a node's residue type given the residue serial
469     * @param resser
470     * @return
471     */
472     public String getResType(int resser){
473     return nodes.get(resser);
474     }
475    
476     /**
477 duarte 165 * Gets node neighbourhood given a residue serial
478     * @param resser
479     * @return
480     */
481 duarte 179 public NodeNbh getNodeNbh(int resser){
482     NodeNbh nbh = new NodeNbh(resser, getResType(resser));
483 duarte 165 //this could be implemented using the contact map matrix and scanning through 1 column/row
484     //it would be just slightly faster, here we do 2*numContacts iterations, using matrix would be only fullLength iterations
485 duarte 179 //however we would then have the overhead of creating the matrix
486 duarte 165 for (Contact cont:contacts){
487     if (cont.i==resser) nbh.put(cont.j, nodes.get(cont.j));
488     if (cont.j==resser) nbh.put(cont.i, nodes.get(cont.i));
489     }
490     return nbh;
491     }
492    
493     /**
494     * Gets edge neighbourhood (common neighbourhood) given a residue serial pair
495     * @param i_resser
496     * @param j_resser
497     * @return
498     */
499 duarte 179 public EdgeNbh getEdgeNbh(int i_resser, int j_resser){
500     EdgeNbh nbh = new EdgeNbh(i_resser, getResType(i_resser), j_resser, getResType(j_resser));
501     NodeNbh i_nbhd = getNodeNbh(i_resser);
502     NodeNbh j_nbhd = getNodeNbh(j_resser);
503 duarte 175 if (j_nbhd.size()>=i_nbhd.size()) { //with this we will be slightly faster, always iterating through smallest TreeMap
504     for (int resser:i_nbhd.keySet()) {
505     if (j_nbhd.containsKey(resser)) nbh.put(resser, i_nbhd.get(resser));
506     }
507     } else {
508     for (int resser:j_nbhd.keySet()) {
509     if (i_nbhd.containsKey(resser)) nbh.put(resser, j_nbhd.get(resser));
510     }
511 duarte 165 }
512     return nbh;
513     }
514    
515 duarte 175 public void addEdge(Contact cont){
516     contacts.add(cont);
517     numContacts++;
518     modified=true;
519     }
520    
521     public void delEdge(Contact cont){
522     contacts.remove(cont);
523     numContacts--;
524     modified=true;
525     }
526    
527     public void restrictContactsToMaxRange(int range){
528 duarte 179 ContactList edgesToDelete = new ContactList();
529 duarte 175 for (Contact cont:contacts){
530 duarte 179 if (cont.getRange()>range) edgesToDelete.add(cont);
531 duarte 175 }
532 duarte 179 for (Contact cont:edgesToDelete){
533     delEdge(cont);
534     }
535 duarte 175 }
536    
537     public void restrictContactsToMinRange(int range){
538 duarte 179 ContactList edgesToDelete = new ContactList();
539 duarte 175 for (Contact cont:contacts){
540 duarte 179 if (cont.getRange()<range) edgesToDelete.add(cont);
541 duarte 175 }
542 duarte 179 for (Contact cont:edgesToDelete){
543     delEdge(cont);
544     }
545 duarte 175 }
546    
547 duarte 123 }