ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/tools/Msdsd2Pdb.java
Revision: 40
Committed: Tue Mar 21 18:21:00 2006 UTC (18 years, 6 months ago) by filippis
File size: 10883 byte(s)
Log Message:
Adding Msdsd2Pdb and PyMol classes
Line File contents
1 package tools;
2
3 import java.io.*;
4
5 /**
6 * Package: tools
7 * Class: Msdsd2Pdb
8 * Author: Ioannis Filippis, filippis@molgen.mpg.de
9 * Date: 21/03/2006
10 *
11 * Msdsd2Pdb's static export2File method creates a pdb file by exporting the atom
12 * lines directly from the msdsd. This is needed for the visualization of the
13 * biological units since pdb files contain the ASUs. Moreover, contact graphs
14 * are defined using msdsd-custom fields (like chain code and residue serial)
15 * and mapping to pdb fields would be necessary for the graph visualisation
16 * if the original pdb files were preferred.
17 *
18 * Notes:
19 * - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of
20 * amino acids, only the default location is considered (alt_code = "A")
21 * (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/)
22 * - The filename is either accessionCode_assemblyId_modelId.pdb (biological unit)
23 * or accessionCode.pdb (asu)
24 * - There is also the opportunity to send the atom lines directly to PyMol and
25 * load the structure without intermediate files. Look at the PyMol class and
26 * its sendAtomLines method.
27 *
28 * Changelog:
29 * 21/03/06 first created by IF
30 */
31
32 public class Msdsd2Pdb {
33
34 /**
35 * exports to file the atom lines of a model (modelId) of a biological unit (assemblyId)
36 * of a protein (accessionCode) directly from msdsd. The filename is returned.
37 *
38 * Notes:
39 * - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of
40 * amino acids, only the default location is considered (alt_code = "A")
41 * (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/)
42 * - The filename is accessionCode_assemblyId_modelId.pdb (biological unit).
43 * - The chain_pdb_code is used in the chainID field in the atom line, while the chain_code is used
44 * the segID field (due to its length). Therefore, "segi" and not "chain" must be used in pymol
45 * selections.
46 * - There are two versions of export2File. One that takes the atomic coordinates from the
47 * partial atom_data tables (needs the table number e.g. 1 for atom_data_1, but is faster),
48 * while the other uses the merged table (really slow - should be avoided)
49 */
50 public static String export2File(String accessionCode, int assemblyId, int modelId, String pdbDir) {
51
52 String pdbFileName = accessionCode+"_"+assemblyId+"_"+modelId+".pdb";
53 String query = "SELECT CONCAT("+
54 "RPAD(\"ATOM\", 6, \" \"), "+
55 "LPAD(serial, 5, \" \"), "+
56 "\" \", "+
57 "LPAD(chem_atom_name, 4, \" \"), "+
58 "IF(alt_code IS NULL, \" \", alt_code), "+
59 "code_3_letter, "+
60 "\" \", "+
61 "IF(chain_pdb_code IS NULL, \" \", chain_pdb_code), "+
62 "LPAD(residue_serial, 4, \" \"), "+
63 "IF(residue_pdb_insert_code IS NULL, \" \", residue_pdb_insert_code), "+
64 "REPEAT(\" \", 3), "+
65 "LPAD(x, 8, \" \"), "+
66 "LPAD(y, 8, \" \"), "+
67 "LPAD(z, 8, \" \"), "+
68 "LPAD(occupancy, 6, \" \"), "+
69 "REPEAT(\" \", 6), "+
70 "REPEAT(\" \", 6), "+
71 "RPAD(chain_code, 4, \" \") "+
72 ") AS atom_lines FROM msdsd.atom_data WHERE "+
73 "(assembly_id = "+assemblyId+") AND "+
74 "(model_id = "+modelId+") AND "+
75 "((alt_code = \"A\") OR (alt_code IS NULL)) AND "+
76 "(pdb_group = \"A\") "+
77 "ORDER BY chain_code, residue_serial, serial;";
78
79 gen(pdbDir+"/"+pdbFileName, query);
80
81 return pdbFileName;
82
83 }
84
85 /**
86 * exports to file the atom lines of a model (modelId) of a biological unit (assemblyId)
87 * of a protein (accessionCode) directly from msdsd. The filename is returned.
88 *
89 * Notes:
90 * - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of
91 * amino acids, only the default location is considered (alt_code = "A")
92 * (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/)
93 * - The filename is accessionCode_assemblyId_modelId.pdb (biological unit).
94 * - The chain_pdb_code is used in the chainID field in the atom line, while the chain_code is used
95 * the segID field (due to its length). Therefore, "segi" and not "chain" must be used in pymol
96 * selections.
97 * - There are two versions of export2File. One that takes the atomic coordinates from the
98 * partial atom_data tables (needs the table number e.g. 1 for atom_data_1, but is faster),
99 * while the other uses the merged table (really slow - should be avoided)
100 */
101 public static String export2File(String accessionCode, int assemblyId, int modelId, int atomDataTblNum, String pdbDir) {
102
103 String pdbFileName = accessionCode+"_"+assemblyId+"_"+modelId+".pdb";
104 String query = "SELECT CONCAT("+
105 "RPAD(\"ATOM\", 6, \" \"), "+
106 "LPAD(serial, 5, \" \"), "+
107 "\" \", "+
108 "LPAD(chem_atom_name, 4, \" \"), "+
109 "IF(alt_code IS NULL, \" \", alt_code), "+
110 "code_3_letter, "+
111 "\" \", "+
112 "IF(chain_pdb_code IS NULL, \" \", chain_pdb_code), "+
113 "LPAD(residue_serial, 4, \" \"), "+
114 "IF(residue_pdb_insert_code IS NULL, \" \", residue_pdb_insert_code), "+
115 "REPEAT(\" \", 3), "+
116 "LPAD(x, 8, \" \"), "+
117 "LPAD(y, 8, \" \"), "+
118 "LPAD(z, 8, \" \"), "+
119 "LPAD(occupancy, 6, \" \"), "+
120 "REPEAT(\" \", 6), "+
121 "REPEAT(\" \", 6), "+
122 "RPAD(chain_code, 4, \" \") "+
123 ") AS atom_lines FROM msdsd.atom_data_"+atomDataTblNum+" WHERE "+
124 "(assembly_id = "+assemblyId+") AND "+
125 "(model_id = "+modelId+") AND "+
126 "((alt_code = \"A\") OR (alt_code IS NULL)) AND "+
127 "(pdb_group = \"A\") "+
128 "ORDER BY chain_code, residue_serial, serial;";
129
130 gen(pdbDir+"/"+pdbFileName, query);
131
132 return pdbFileName;
133
134 }
135
136 /**
137 * exports to file the atom lines of the assymetric unit of a protein (accessionCode)
138 * directly from msdsd. The filename is returned.
139 *
140 * Notes:
141 * - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of
142 * amino acids, only the default location is considered (alt_code = "A")
143 * (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/)
144 * - The filename is accessionCode.pdb (asu).
145 * - The chain_pdb_code is used in the chainID field in the atom line, while the chain_code is used
146 * the segID field (due to its length). Therefore, "segi" and not "chain" must be used in pymol
147 * selections.
148 * - There are two versions of export2File. One that takes the atomic coordinates from the
149 * partial atom_data tables (needs the table number e.g. 1 for atom_data_1, but is faster),
150 * while the other uses the merged table (really slow - should be avoided)
151 */
152 public static String export2File(String accessionCode, String pdbDir) {
153
154 String pdbFileName = accessionCode+".pdb";
155 String query = "SELECT CONCAT("+
156 "RPAD(\"ATOM\", 6, \" \"), "+
157 "LPAD(serial, 5, \" \"), "+
158 "\" \", "+
159 "LPAD(chem_atom_name, 4, \" \"), "+
160 "IF(alt_code IS NULL, \" \", alt_code), "+
161 "code_3_letter, "+
162 "\" \", "+
163 "IF(chain_pdb_code IS NULL, \" \", chain_pdb_code), "+
164 "LPAD(residue_serial, 4, \" \"), "+
165 "IF(residue_pdb_insert_code IS NULL, \" \", residue_pdb_insert_code), "+
166 "REPEAT(\" \", 3), "+
167 "LPAD(x, 8, \" \"), "+
168 "LPAD(y, 8, \" \"), "+
169 "LPAD(z, 8, \" \"), "+
170 "LPAD(occupancy, 6, \" \"), "+
171 "REPEAT(\" \", 6), "+
172 "REPEAT(\" \", 6), "+
173 "RPAD(chain_code, 4, \" \") "+
174 ") AS atom_lines FROM msdsd.atom_data WHERE "+
175 "(accession_code = \""+accessionCode+"\") AND "+
176 "(non_assembly_valid = \"Y\") AND "+
177 "((alt_code = \"A\") OR (alt_code IS NULL)) AND "+
178 "(pdb_group = \"A\") "+
179 "ORDER BY chain_code, residue_serial, serial;";
180
181 gen(pdbDir+"/"+pdbFileName, query);
182
183 return pdbFileName;
184
185 }
186
187 /**
188 * exports to file the atom lines of the assymetric unit of a protein (accessionCode)
189 * directly from msdsd. The filename is returned.
190 *
191 * Notes:
192 * - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of
193 * amino acids, only the default location is considered (alt_code = "A")
194 * (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/)
195 * - The filename is accessionCode.pdb (asu).
196 * - The chain_pdb_code is used in the chainID field in the atom line, while the chain_code is used
197 * the segID field (due to its length). Therefore, "segi" and not "chain" must be used in pymol
198 * selections.
199 * - There are two versions of export2File. One that takes the atomic coordinates from the
200 * partial atom_data tables (needs the table number e.g. 1 for atom_data_1, but is faster),
201 * while the other uses the merged table (really slow - should be avoided)
202 */
203 public static String export2File(String accessionCode, int atomDataTblNum, String pdbDir) {
204
205 String pdbFileName = accessionCode+".pdb";
206 String query = "SELECT CONCAT("+
207 "RPAD(\"ATOM\", 6, \" \"), "+
208 "LPAD(serial, 5, \" \"), "+
209 "\" \", "+
210 "LPAD(chem_atom_name, 4, \" \"), "+
211 "IF(alt_code IS NULL, \" \", alt_code), "+
212 "code_3_letter, "+
213 "\" \", "+
214 "IF(chain_pdb_code IS NULL, \" \", chain_pdb_code), "+
215 "LPAD(residue_serial, 4, \" \"), "+
216 "IF(residue_pdb_insert_code IS NULL, \" \", residue_pdb_insert_code), "+
217 "REPEAT(\" \", 3), "+
218 "LPAD(x, 8, \" \"), "+
219 "LPAD(y, 8, \" \"), "+
220 "LPAD(z, 8, \" \"), "+
221 "LPAD(occupancy, 6, \" \"), "+
222 "REPEAT(\" \", 6), "+
223 "REPEAT(\" \", 6), "+
224 "RPAD(chain_code, 4, \" \") "+
225 ") AS atom_lines FROM msdsd.atom_data_"+atomDataTblNum+" WHERE "+
226 "(accession_code = \""+accessionCode+"\") AND "+
227 "(non_assembly_valid = \"Y\") AND "+
228 "((alt_code = \"A\") OR (alt_code IS NULL)) AND "+
229 "(pdb_group = \"A\") "+
230 "ORDER BY chain_code, residue_serial, serial;";
231
232 gen(pdbDir+"/"+pdbFileName, query);
233
234 return pdbFileName;
235
236 }
237
238 /**
239 * creates a temporary sql script called export.sql with the necessary sql query to dump the atom lines,
240 * executes the script redirecting the output to a file and deletes the sql script
241 */
242 private static void gen(String pdbFileName, String query) {
243
244 try {
245
246 File sqlScript = new File("export.sql");
247 PrintWriter scriptOut = new PrintWriter(new FileWriter(sqlScript));
248 scriptOut.println(query);
249 if (scriptOut != null) { scriptOut.close(); }
250
251 System.out.println(SystemCmd.exec(new String[] {"/bin/sh", "-c", "my_lila < export.sql > "+pdbFileName}));
252
253 sqlScript.delete();
254
255 } catch (Exception e) {
256 System.out.println(e);
257 }
258
259 }
260
261 } // end of class Msdsd2Pdb