1 |
package tools; |
2 |
|
3 |
import java.io.*; |
4 |
|
5 |
/**
 * Package: tools
 * Class: Msdsd2Pdb
 * Author: Ioannis Filippis, filippis@molgen.mpg.de
 * Date: 21/03/2006
 *
 * Msdsd2Pdb's static export2File method creates a pdb file by exporting the atom
 * lines directly from the msdsd. This is needed for the visualization of the
 * biological units, since pdb files contain the ASUs. Moreover, contact graphs
 * are defined using msdsd-custom fields (like chain code and residue serial),
 * so mapping to pdb fields would be necessary for the graph visualisation
 * if the original pdb files were preferred.
 *
 * Notes:
 * - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of
 *   amino acids, only the default location is considered (alt_code = "A")
 *   (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/)
 * - The filename is either accessionCode_assemblyId_modelId.pdb (biological unit)
 *   or accessionCode.pdb (asu)
 * - There is also the opportunity to send the atom lines directly to PyMol and
 *   load the structure without intermediate files. Look at the PyMol class and
 *   its sendAtomLines method.
 *
 * Changelog:
 * 21/03/06 first created by IF
 */
31 |
|
32 |
public class Msdsd2Pdb { |
33 |
|
34 |
/** |
35 |
* exports to file the atom lines of a model (modelId) of a biological unit (assemblyId) |
36 |
* of a protein (accessionCode) directly from msdsd. The filename is returned. |
37 |
* |
38 |
* Notes: |
39 |
* - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of |
40 |
* amino acids, only the default location is considered (alt_code = "A") |
41 |
* (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/) |
42 |
* - The filename is accessionCode_assemblyId_modelId.pdb (biological unit). |
43 |
* - The chain_pdb_code is used in the chainID field in the atom line, while the chain_code is used |
44 |
* the segID field (due to its length). Therefore, "segi" and not "chain" must be used in pymol |
45 |
* selections. |
46 |
* - There are two versions of export2File. One that takes the atomic coordinates from the |
47 |
* partial atom_data tables (needs the table number e.g. 1 for atom_data_1, but is faster), |
48 |
* while the other uses the merged table (really slow - should be avoided) |
49 |
*/ |
50 |
public static String export2File(String accessionCode, int assemblyId, int modelId, String pdbDir) { |
51 |
|
52 |
String pdbFileName = accessionCode+"_"+assemblyId+"_"+modelId+".pdb"; |
53 |
String query = "SELECT CONCAT("+ |
54 |
"RPAD(\"ATOM\", 6, \" \"), "+ |
55 |
"LPAD(serial, 5, \" \"), "+ |
56 |
"\" \", "+ |
57 |
"LPAD(chem_atom_name, 4, \" \"), "+ |
58 |
"IF(alt_code IS NULL, \" \", alt_code), "+ |
59 |
"code_3_letter, "+ |
60 |
"\" \", "+ |
61 |
"IF(chain_pdb_code IS NULL, \" \", chain_pdb_code), "+ |
62 |
"LPAD(residue_serial, 4, \" \"), "+ |
63 |
"IF(residue_pdb_insert_code IS NULL, \" \", residue_pdb_insert_code), "+ |
64 |
"REPEAT(\" \", 3), "+ |
65 |
"LPAD(x, 8, \" \"), "+ |
66 |
"LPAD(y, 8, \" \"), "+ |
67 |
"LPAD(z, 8, \" \"), "+ |
68 |
"LPAD(occupancy, 6, \" \"), "+ |
69 |
"REPEAT(\" \", 6), "+ |
70 |
"REPEAT(\" \", 6), "+ |
71 |
"RPAD(chain_code, 4, \" \") "+ |
72 |
") AS atom_lines FROM msdsd.atom_data WHERE "+ |
73 |
"(assembly_id = "+assemblyId+") AND "+ |
74 |
"(model_id = "+modelId+") AND "+ |
75 |
"((alt_code = \"A\") OR (alt_code IS NULL)) AND "+ |
76 |
"(pdb_group = \"A\") "+ |
77 |
"ORDER BY chain_code, residue_serial, serial;"; |
78 |
|
79 |
gen(pdbDir+"/"+pdbFileName, query); |
80 |
|
81 |
return pdbFileName; |
82 |
|
83 |
} |
84 |
|
85 |
/** |
86 |
* exports to file the atom lines of a model (modelId) of a biological unit (assemblyId) |
87 |
* of a protein (accessionCode) directly from msdsd. The filename is returned. |
88 |
* |
89 |
* Notes: |
90 |
* - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of |
91 |
* amino acids, only the default location is considered (alt_code = "A") |
92 |
* (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/) |
93 |
* - The filename is accessionCode_assemblyId_modelId.pdb (biological unit). |
94 |
* - The chain_pdb_code is used in the chainID field in the atom line, while the chain_code is used |
95 |
* the segID field (due to its length). Therefore, "segi" and not "chain" must be used in pymol |
96 |
* selections. |
97 |
* - There are two versions of export2File. One that takes the atomic coordinates from the |
98 |
* partial atom_data tables (needs the table number e.g. 1 for atom_data_1, but is faster), |
99 |
* while the other uses the merged table (really slow - should be avoided) |
100 |
*/ |
101 |
public static String export2File(String accessionCode, int assemblyId, int modelId, int atomDataTblNum, String pdbDir) { |
102 |
|
103 |
String pdbFileName = accessionCode+"_"+assemblyId+"_"+modelId+".pdb"; |
104 |
String query = "SELECT CONCAT("+ |
105 |
"RPAD(\"ATOM\", 6, \" \"), "+ |
106 |
"LPAD(serial, 5, \" \"), "+ |
107 |
"\" \", "+ |
108 |
"LPAD(chem_atom_name, 4, \" \"), "+ |
109 |
"IF(alt_code IS NULL, \" \", alt_code), "+ |
110 |
"code_3_letter, "+ |
111 |
"\" \", "+ |
112 |
"IF(chain_pdb_code IS NULL, \" \", chain_pdb_code), "+ |
113 |
"LPAD(residue_serial, 4, \" \"), "+ |
114 |
"IF(residue_pdb_insert_code IS NULL, \" \", residue_pdb_insert_code), "+ |
115 |
"REPEAT(\" \", 3), "+ |
116 |
"LPAD(x, 8, \" \"), "+ |
117 |
"LPAD(y, 8, \" \"), "+ |
118 |
"LPAD(z, 8, \" \"), "+ |
119 |
"LPAD(occupancy, 6, \" \"), "+ |
120 |
"REPEAT(\" \", 6), "+ |
121 |
"REPEAT(\" \", 6), "+ |
122 |
"RPAD(chain_code, 4, \" \") "+ |
123 |
") AS atom_lines FROM msdsd.atom_data_"+atomDataTblNum+" WHERE "+ |
124 |
"(assembly_id = "+assemblyId+") AND "+ |
125 |
"(model_id = "+modelId+") AND "+ |
126 |
"((alt_code = \"A\") OR (alt_code IS NULL)) AND "+ |
127 |
"(pdb_group = \"A\") "+ |
128 |
"ORDER BY chain_code, residue_serial, serial;"; |
129 |
|
130 |
gen(pdbDir+"/"+pdbFileName, query); |
131 |
|
132 |
return pdbFileName; |
133 |
|
134 |
} |
135 |
|
136 |
/** |
137 |
* exports to file the atom lines of the assymetric unit of a protein (accessionCode) |
138 |
* directly from msdsd. The filename is returned. |
139 |
* |
140 |
* Notes: |
141 |
* - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of |
142 |
* amino acids, only the default location is considered (alt_code = "A") |
143 |
* (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/) |
144 |
* - The filename is accessionCode.pdb (asu). |
145 |
* - The chain_pdb_code is used in the chainID field in the atom line, while the chain_code is used |
146 |
* the segID field (due to its length). Therefore, "segi" and not "chain" must be used in pymol |
147 |
* selections. |
148 |
* - There are two versions of export2File. One that takes the atomic coordinates from the |
149 |
* partial atom_data tables (needs the table number e.g. 1 for atom_data_1, but is faster), |
150 |
* while the other uses the merged table (really slow - should be avoided) |
151 |
*/ |
152 |
public static String export2File(String accessionCode, String pdbDir) { |
153 |
|
154 |
String pdbFileName = accessionCode+".pdb"; |
155 |
String query = "SELECT CONCAT("+ |
156 |
"RPAD(\"ATOM\", 6, \" \"), "+ |
157 |
"LPAD(serial, 5, \" \"), "+ |
158 |
"\" \", "+ |
159 |
"LPAD(chem_atom_name, 4, \" \"), "+ |
160 |
"IF(alt_code IS NULL, \" \", alt_code), "+ |
161 |
"code_3_letter, "+ |
162 |
"\" \", "+ |
163 |
"IF(chain_pdb_code IS NULL, \" \", chain_pdb_code), "+ |
164 |
"LPAD(residue_serial, 4, \" \"), "+ |
165 |
"IF(residue_pdb_insert_code IS NULL, \" \", residue_pdb_insert_code), "+ |
166 |
"REPEAT(\" \", 3), "+ |
167 |
"LPAD(x, 8, \" \"), "+ |
168 |
"LPAD(y, 8, \" \"), "+ |
169 |
"LPAD(z, 8, \" \"), "+ |
170 |
"LPAD(occupancy, 6, \" \"), "+ |
171 |
"REPEAT(\" \", 6), "+ |
172 |
"REPEAT(\" \", 6), "+ |
173 |
"RPAD(chain_code, 4, \" \") "+ |
174 |
") AS atom_lines FROM msdsd.atom_data WHERE "+ |
175 |
"(accession_code = \""+accessionCode+"\") AND "+ |
176 |
"(non_assembly_valid = \"Y\") AND "+ |
177 |
"((alt_code = \"A\") OR (alt_code IS NULL)) AND "+ |
178 |
"(pdb_group = \"A\") "+ |
179 |
"ORDER BY chain_code, residue_serial, serial;"; |
180 |
|
181 |
gen(pdbDir+"/"+pdbFileName, query); |
182 |
|
183 |
return pdbFileName; |
184 |
|
185 |
} |
186 |
|
187 |
/** |
188 |
* exports to file the atom lines of the assymetric unit of a protein (accessionCode) |
189 |
* directly from msdsd. The filename is returned. |
190 |
* |
191 |
* Notes: |
192 |
* - Hetatoms are excluded (pdb_group = "A") and in case of multiple locations of |
193 |
* amino acids, only the default location is considered (alt_code = "A") |
194 |
* (VendruscoloM_00_PSFG.pdf currently found in LitNet/incomingPDF/LAST_ROUND/) |
195 |
* - The filename is accessionCode.pdb (asu). |
196 |
* - The chain_pdb_code is used in the chainID field in the atom line, while the chain_code is used |
197 |
* the segID field (due to its length). Therefore, "segi" and not "chain" must be used in pymol |
198 |
* selections. |
199 |
* - There are two versions of export2File. One that takes the atomic coordinates from the |
200 |
* partial atom_data tables (needs the table number e.g. 1 for atom_data_1, but is faster), |
201 |
* while the other uses the merged table (really slow - should be avoided) |
202 |
*/ |
203 |
public static String export2File(String accessionCode, int atomDataTblNum, String pdbDir) { |
204 |
|
205 |
String pdbFileName = accessionCode+".pdb"; |
206 |
String query = "SELECT CONCAT("+ |
207 |
"RPAD(\"ATOM\", 6, \" \"), "+ |
208 |
"LPAD(serial, 5, \" \"), "+ |
209 |
"\" \", "+ |
210 |
"LPAD(chem_atom_name, 4, \" \"), "+ |
211 |
"IF(alt_code IS NULL, \" \", alt_code), "+ |
212 |
"code_3_letter, "+ |
213 |
"\" \", "+ |
214 |
"IF(chain_pdb_code IS NULL, \" \", chain_pdb_code), "+ |
215 |
"LPAD(residue_serial, 4, \" \"), "+ |
216 |
"IF(residue_pdb_insert_code IS NULL, \" \", residue_pdb_insert_code), "+ |
217 |
"REPEAT(\" \", 3), "+ |
218 |
"LPAD(x, 8, \" \"), "+ |
219 |
"LPAD(y, 8, \" \"), "+ |
220 |
"LPAD(z, 8, \" \"), "+ |
221 |
"LPAD(occupancy, 6, \" \"), "+ |
222 |
"REPEAT(\" \", 6), "+ |
223 |
"REPEAT(\" \", 6), "+ |
224 |
"RPAD(chain_code, 4, \" \") "+ |
225 |
") AS atom_lines FROM msdsd.atom_data_"+atomDataTblNum+" WHERE "+ |
226 |
"(accession_code = \""+accessionCode+"\") AND "+ |
227 |
"(non_assembly_valid = \"Y\") AND "+ |
228 |
"((alt_code = \"A\") OR (alt_code IS NULL)) AND "+ |
229 |
"(pdb_group = \"A\") "+ |
230 |
"ORDER BY chain_code, residue_serial, serial;"; |
231 |
|
232 |
gen(pdbDir+"/"+pdbFileName, query); |
233 |
|
234 |
return pdbFileName; |
235 |
|
236 |
} |
237 |
|
238 |
/** |
239 |
* creates a temporary sql script called export.sql with the necessary sql query to dump the atom lines, |
240 |
* executes the script redirecting the output to a file and deletes the sql script |
241 |
*/ |
242 |
private static void gen(String pdbFileName, String query) { |
243 |
|
244 |
try { |
245 |
|
246 |
File sqlScript = new File("export.sql"); |
247 |
PrintWriter scriptOut = new PrintWriter(new FileWriter(sqlScript)); |
248 |
scriptOut.println(query); |
249 |
if (scriptOut != null) { scriptOut.close(); } |
250 |
|
251 |
System.out.println(SystemCmd.exec(new String[] {"/bin/sh", "-c", "my_lila < export.sql > "+pdbFileName})); |
252 |
|
253 |
sqlScript.delete(); |
254 |
|
255 |
} catch (Exception e) { |
256 |
System.out.println(e); |
257 |
} |
258 |
|
259 |
} |
260 |
|
261 |
} // end of class Msdsd2Pdb |