1 |
package proteinstructure; |
2 |
|
3 |
import java.io.BufferedReader; |
4 |
import java.io.File; |
5 |
import java.io.FileNotFoundException; |
6 |
import java.io.FileReader; |
7 |
import java.io.IOException; |
8 |
import java.util.ArrayList; |
9 |
import java.util.Collections; |
10 |
import java.util.HashMap; |
11 |
import java.util.regex.Matcher; |
12 |
import java.util.regex.Pattern; |
13 |
|
14 |
public class PdbfilePdb extends Pdb { |
15 |
|
16 |
private static final String UNKNOWN_STRING ="Unknown"; |
17 |
private static final String NULL_chainCode = "A"; |
18 |
|
19 |
private String pdbfile; |
20 |
|
21 |
/** |
22 |
* Constructs Pdb object reading from pdb file given pdb chain code. Model will be DEFAULT_MODEL |
23 |
* @param pdbfile |
24 |
* @param pdbChainCode |
25 |
* @throws FileNotFoundException |
26 |
* @throws IOException |
27 |
* @throws PdbfileFormatError |
28 |
* @throws PdbChainCodeNotFoundError |
29 |
*/ |
30 |
public PdbfilePdb (String pdbfile, String pdbChainCode) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError { |
31 |
this(pdbfile,pdbChainCode,DEFAULT_MODEL); |
32 |
} |
33 |
|
34 |
/** |
35 |
* Constructs Pdb object reading from pdb file given pdb chain code and model serial |
36 |
* @param pdbfile |
37 |
* @param pdbChainCode |
38 |
* @param model_serial |
39 |
* @throws FileNotFoundException |
40 |
* @throws IOException |
41 |
* @throws PdbfileFormatError |
42 |
* @throws PdbChainCodeNotFoundError |
43 |
*/ |
44 |
public PdbfilePdb (String pdbfile, String pdbChainCode, int model_serial) throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError { |
45 |
this.pdbfile = pdbfile; |
46 |
this.model=model_serial; |
47 |
this.pdbCode=UNKNOWN_STRING; // we initialise to unknown in case we don't find it in pdb file |
48 |
this.pdbChainCode=pdbChainCode; |
49 |
// we set chainCode to pdbChainCode except for case NULL where we use "A" |
50 |
this.chainCode=pdbChainCode; |
51 |
if (pdbChainCode.equals("NULL")) this.chainCode=NULL_chainCode; |
52 |
|
53 |
this.sequence=""; // we initialise it to empty string, then is set inread_pdb_data_from_file |
54 |
|
55 |
read_pdb_data_from_file(); |
56 |
|
57 |
// when reading from pdb file we have no information of residue numbers or author's (original) pdb residue number, so we fill the mapping with the residue numbers we know |
58 |
//TODO eventually we could assign our own internal residue numbers when reading from pdb and thus this map would be used |
59 |
this.resser2pdbresser = new HashMap<Integer, String>(); |
60 |
this.pdbresser2resser = new HashMap<String, Integer>(); |
61 |
for (int resser:resser2restype.keySet()){ |
62 |
resser2pdbresser.put(resser, String.valueOf(resser)); |
63 |
pdbresser2resser.put(String.valueOf(resser), resser); |
64 |
} |
65 |
} |
66 |
|
67 |
|
68 |
|
69 |
/** |
70 |
* To read the pdb data (atom coordinates, residue serials, atom serials) from file |
71 |
* chainCode gets set to internal identifier: if input chain code NULL then chainCode will be ' ' |
72 |
* pdbCode gets set to the one parsed in HEADER or to 'Unknown' if not found |
73 |
* sequence gets set to the sequence read from ATOM lines (i.e. observed resdiues only) |
74 |
* @param pdbfile |
75 |
* @throws FileNotFoundException |
76 |
* @throws IOException |
77 |
* @throws PdbfileFormatError |
78 |
* @throws PdbChainCodeNotFoundError |
79 |
*/ |
80 |
private void read_pdb_data_from_file() throws FileNotFoundException, IOException, PdbfileFormatError, PdbChainCodeNotFoundError { |
81 |
resser_atom2atomserial = new HashMap<String,Integer>(); |
82 |
resser2restype = new HashMap<Integer,String>(); |
83 |
atomser2coord = new HashMap<Integer,Double[]>(); |
84 |
atomser2resser = new HashMap<Integer,Integer>(); |
85 |
boolean empty = true; // controls whether we don't find any atom line for given pdbChainCode and model |
86 |
// we set chainCodeStr (for regex) to pdbChainCode except for case NULL where we use " " (NULL is a blank chain code in pdb files) |
87 |
String chainCodeStr=pdbChainCode; |
88 |
if (pdbChainCode.equals("NULL")) chainCodeStr=" "; |
89 |
|
90 |
int thismodel=DEFAULT_MODEL; // we initialise to DEFAULT_MODEL, in case file doesn't have MODEL lines |
91 |
BufferedReader fpdb = new BufferedReader(new FileReader(new File(pdbfile))); |
92 |
int linecount=0; |
93 |
String line; |
94 |
while ((line = fpdb.readLine() ) != null ) { |
95 |
linecount++; |
96 |
Pattern p = Pattern.compile("^HEADER"); |
97 |
Matcher m = p.matcher(line); |
98 |
if (m.find()){ |
99 |
Pattern ph = Pattern.compile("^HEADER.{56}(\\d\\w{3})"); |
100 |
Matcher mh = ph.matcher(line); |
101 |
if (mh.find()) { |
102 |
pdbCode=mh.group(1).toLowerCase(); |
103 |
} |
104 |
} else if (linecount==1) { // header not found and we are in line 1 |
105 |
// a HEADER is the minimum we ask at the moment for a pdb file to have, if we don't find it in line 1 we throw an exception |
106 |
throw new PdbfileFormatError("The pdb file "+pdbfile+" doesn't seem to have the right format"); |
107 |
} |
108 |
p = Pattern.compile("^MODEL\\s+(\\d+)"); |
109 |
m = p.matcher(line); |
110 |
if (m.find()){ |
111 |
thismodel=Integer.parseInt(m.group(1)); |
112 |
} |
113 |
if (thismodel!=model) continue; // we skip reading of atom lines if we are not in the desired model |
114 |
p = Pattern.compile("^ATOM"); |
115 |
m = p.matcher(line); |
116 |
if (m.find()){ |
117 |
// serial atom res_type chain res_ser x y z |
118 |
Pattern pl = Pattern.compile(".{6}(.....).{2}(...).{1}(...).{1}"+chainCodeStr+"(.{4}).{4}(.{8})(.{8})(.{8})",Pattern.CASE_INSENSITIVE); |
119 |
Matcher ml = pl.matcher(line); |
120 |
if (ml.find()) { |
121 |
empty=false; |
122 |
int atomserial=Integer.parseInt(ml.group(1).trim()); |
123 |
String atom = ml.group(2).trim(); |
124 |
String res_type = ml.group(3).trim(); |
125 |
int res_serial = Integer.parseInt(ml.group(4).trim()); |
126 |
double x = Double.parseDouble(ml.group(5).trim()); |
127 |
double y = Double.parseDouble(ml.group(6).trim()); |
128 |
double z = Double.parseDouble(ml.group(7).trim()); |
129 |
Double[] coords = {x, y, z}; |
130 |
ArrayList<String> aalist=AA.aas(); |
131 |
if (aalist.contains(res_type)) { |
132 |
atomser2coord.put(atomserial, coords); |
133 |
atomser2resser.put(atomserial, res_serial); |
134 |
resser2restype.put(res_serial, res_type); |
135 |
ArrayList<String> atomlist = aas2atoms.get(res_type); |
136 |
if (atomlist.contains(atom)){ |
137 |
resser_atom2atomserial.put(res_serial+"_"+atom, atomserial); |
138 |
} |
139 |
} |
140 |
} |
141 |
} |
142 |
} |
143 |
fpdb.close(); |
144 |
if (empty) { |
145 |
//System.err.println("Couldn't find any atom line for given pdbChainCode: "+pdbChainCode+", model: "+model); |
146 |
throw new PdbChainCodeNotFoundError("Couldn't find any ATOM line for given pdbChainCode: "+pdbChainCode+", model: "+model); |
147 |
} |
148 |
// now we read the sequence from the resser2restype HashMap |
149 |
// NOTE: we must make sure elsewhere that there are no unobserved residues, we can't check that here! |
150 |
ArrayList<Integer> ressers = new ArrayList<Integer>(); |
151 |
for (int resser:resser2restype.keySet()) { |
152 |
ressers.add(resser); |
153 |
} |
154 |
Collections.sort(ressers); |
155 |
for (int resser:ressers){ |
156 |
String oneletter = AA.threeletter2oneletter(resser2restype.get(resser)); |
157 |
sequence += oneletter; |
158 |
} |
159 |
} |
160 |
|
161 |
} |