1 |
duarte |
618 |
package sequence; |
2 |
|
|
|
3 |
|
|
import java.sql.SQLException; |
4 |
|
|
import java.util.TreeMap; |
5 |
duarte |
619 |
import java.util.regex.Matcher; |
6 |
|
|
import java.util.regex.Pattern; |
7 |
duarte |
618 |
|
8 |
|
|
import proteinstructure.PdbCodeNotFoundError; |
9 |
|
|
import proteinstructure.PdbLoadError; |
10 |
|
|
import proteinstructure.PdbasePdb; |
11 |
|
|
import tools.MySQLConnection; |
12 |
|
|
|
13 |
|
|
/** |
14 |
|
|
* A GTG output hit |
15 |
|
|
* |
16 |
|
|
*/ |
17 |
|
|
public class GTGHit { |
18 |
|
|
|
19 |
duarte |
619 |
private static final String NULL_PDBCHAINCODE = "A"; |
20 |
|
|
|
21 |
|
|
private static final String ID_REGEX = "(\\d\\w\\w\\w)(\\w)"; |
22 |
duarte |
618 |
|
23 |
|
|
String queryId; |
24 |
|
|
String subjectId; |
25 |
|
|
|
26 |
|
|
int queryStart; |
27 |
|
|
int queryEnd; |
28 |
|
|
int subjectStart; |
29 |
|
|
int subjectEnd; |
30 |
|
|
|
31 |
|
|
int identities; |
32 |
|
|
//int missmatches; // do we want this? |
33 |
|
|
int alnLength; |
34 |
|
|
// alignment // do we want to store the full alignment or is it enough with the above? |
35 |
|
|
|
36 |
|
|
int totalScore; |
37 |
|
|
int consistencyScore; |
38 |
|
|
int motifScore; |
39 |
|
|
|
40 |
|
|
String subjectSequence; |
41 |
|
|
|
42 |
|
|
|
43 |
|
|
|
44 |
|
|
public GTGHit(String queryId, String subjectId, int totalScore, int consistencyScore, int motifScore) { |
45 |
|
|
this.queryId = queryId; |
46 |
|
|
this.subjectId = subjectId; |
47 |
|
|
|
48 |
|
|
// We asume here that the subjectId is a PDB id in the form pdbCode+pdbChainCode |
49 |
|
|
// It seems that GTG uses pre-remediated PDB data: we have to add an "A" when length of subjectId is only 4, i.e. missing (NULL) chain code |
50 |
|
|
if (this.subjectId.length()==4) { |
51 |
|
|
this.subjectId += NULL_PDBCHAINCODE; |
52 |
|
|
} |
53 |
|
|
|
54 |
|
|
this.totalScore = totalScore; |
55 |
|
|
this.consistencyScore = consistencyScore; |
56 |
|
|
this.motifScore = motifScore; |
57 |
|
|
|
58 |
|
|
} |
59 |
|
|
|
60 |
duarte |
619 |
public int getTotalScore() { |
61 |
|
|
return this.totalScore; |
62 |
|
|
} |
63 |
|
|
|
64 |
duarte |
618 |
public void setQueryStart(int queryStart) { |
65 |
|
|
this.queryStart = queryStart; |
66 |
|
|
} |
67 |
|
|
|
68 |
|
|
public void setQueryEnd(int queryEnd) { |
69 |
|
|
this.queryEnd = queryEnd; |
70 |
|
|
} |
71 |
|
|
|
72 |
|
|
public void setSubjectStart(int subjectStart) { |
73 |
|
|
this.subjectStart = subjectStart; |
74 |
|
|
} |
75 |
|
|
|
76 |
|
|
public void setSubjectEnd(int subjectEnd) { |
77 |
|
|
this.subjectEnd = subjectEnd; |
78 |
|
|
} |
79 |
|
|
|
80 |
|
|
public void setSubjectSequence(String subjectSequence) { |
81 |
|
|
this.subjectSequence = subjectSequence; |
82 |
|
|
} |
83 |
|
|
|
84 |
|
|
public String getSubjectSequence() { |
85 |
|
|
return subjectSequence; |
86 |
|
|
} |
87 |
|
|
|
88 |
|
|
public void setIdentities(int identities) { |
89 |
|
|
this.identities = identities; |
90 |
|
|
} |
91 |
|
|
|
92 |
|
|
public void setAlnLength(int alnLength) { |
93 |
|
|
this.alnLength = alnLength; |
94 |
|
|
} |
95 |
|
|
|
96 |
|
|
public double getPercentIdentity() { |
97 |
|
|
return ((double) identities/alnLength)*100; |
98 |
|
|
} |
99 |
|
|
|
100 |
|
|
public void print() { |
101 |
|
|
System.out.printf("%8s %8s %6d %6d %6d %6d %6d %6d %5.1f %6d %6d %6d \n", |
102 |
|
|
queryId, subjectId, queryStart, queryEnd, subjectStart, subjectEnd, identities, alnLength, getPercentIdentity(),totalScore, consistencyScore, motifScore); |
103 |
|
|
} |
104 |
|
|
|
105 |
|
|
/** |
106 |
|
|
* Checks the subject sequence and compares it to the pdbase sequence, |
107 |
|
|
* provided that the subjectId is a PDB identifier in the form pdbCode+pdbChainCode |
108 |
|
|
* @param conn |
109 |
|
|
* @param pdbaseDb |
110 |
|
|
* @return false if sequences don't match, true if they do |
111 |
|
|
* @throws PdbCodeNotFoundError |
112 |
|
|
* @throws SQLException |
113 |
|
|
* @throws PdbLoadError |
114 |
|
|
*/ |
115 |
|
|
public boolean checkSubjectSequence(MySQLConnection conn, String pdbaseDb) throws PdbCodeNotFoundError, SQLException, PdbLoadError { |
116 |
|
|
if (subjectSequence.equals("")) { |
117 |
|
|
System.err.println("Subject sequence is empty"); |
118 |
|
|
return false; |
119 |
|
|
} |
120 |
|
|
String pdbCode = subjectId.substring(0, 4); |
121 |
|
|
String pdbChainCode = subjectId.substring(4); |
122 |
|
|
PdbasePdb pdb = new PdbasePdb(pdbCode, pdbaseDb, conn); |
123 |
|
|
pdb.load(pdbChainCode); |
124 |
|
|
TreeMap<Integer, Integer> mapObsSerials2resser = pdb.getObservedResMapping(); |
125 |
|
|
String pdbaseSequence = pdb.getSequence(); |
126 |
|
|
|
127 |
|
|
for (int i=0; i<subjectSequence.length();i++) { |
128 |
|
|
int indexInFullSeq = mapObsSerials2resser.get(i+1)-1; |
129 |
|
|
if (subjectSequence.charAt(i)!=pdbaseSequence.charAt(indexInFullSeq)) { |
130 |
|
|
System.err.println("GTG output subject's sequence doesn't match sequence from pdbase at position "+(i+1)+"(internal res. serial: "+(indexInFullSeq+1)+"). subject: "+subjectSequence.charAt(i)+", pdbase: "+pdbaseSequence.charAt(indexInFullSeq)); |
131 |
|
|
return false; |
132 |
|
|
} |
133 |
|
|
} |
134 |
|
|
|
135 |
|
|
// the following would have worked as well. But our getObservedSequence() was getting observed sequences without non-standard aas, so the matching of sequences was failing a lot |
136 |
|
|
// String pdbaseSequence = pdb.getObservedSequence(); |
137 |
|
|
// |
138 |
|
|
// // 1st check: subject sequence must not be longer than full sequence from pdbase |
139 |
|
|
// // this also prevents for getting an out of bound index error in following loop |
140 |
|
|
// if (subjectSequence.length()>pdbaseSequence.length()) { |
141 |
|
|
// System.err.println("GTG output subject's sequence is longer than sequence from pdbase."); |
142 |
|
|
// return false; |
143 |
|
|
// } |
144 |
|
|
// |
145 |
|
|
// for (int i=0; i<subjectSequence.length();i++) { |
146 |
|
|
// // 2nd check: all residues match |
147 |
|
|
// if (subjectSequence.charAt(i)!=pdbaseSequence.charAt(i)) { |
148 |
|
|
// System.err.println("GTG output subject's sequence doesn't match sequence from pdbase at position "+(i+1)+". subject: "+subjectSequence.charAt(i)+", pdbase: "+pdbaseSequence.charAt(i)); |
149 |
|
|
// return false; |
150 |
|
|
// } |
151 |
|
|
// } |
152 |
|
|
return true; |
153 |
|
|
} |
154 |
|
|
|
155 |
|
|
/** |
156 |
|
|
* Reassigns the serials of subjectStart and subjectEnd positions based on the |
157 |
|
|
* pdbase full SEQRES sequence, i.e. converts GTG serials to our internal residue |
158 |
|
|
* serials (cif serials) |
159 |
|
|
* NOTE: there are some data compatibility issues with the output of GTG. Sometimes |
160 |
|
|
* our mapping of observed sequence serials (numbering used by GTG) and pdbase full |
161 |
|
|
* sequence is not perfect, so it might be off a few residues at most. Anyway this |
162 |
|
|
* happens in extremely rare cases where a residue is considered observed in pdbase |
163 |
|
|
* and not by GTG (examples are: 1fdpB residue 136, 1tonA residue 79) |
164 |
|
|
* @param conn |
165 |
|
|
* @param pdbaseDb |
166 |
|
|
* @throws SQLException |
167 |
|
|
* @throws PdbCodeNotFoundError |
168 |
|
|
* @throws PdbLoadError |
169 |
|
|
*/ |
170 |
|
|
public void reassignSubjectSerials(MySQLConnection conn, String pdbaseDb) throws SQLException, PdbCodeNotFoundError, PdbLoadError { |
171 |
|
|
if (subjectStart==0 && subjectEnd ==0 ) return; // subject start and end not known: nothing to do |
172 |
|
|
|
173 |
|
|
String pdbCode = subjectId.substring(0, 4); |
174 |
|
|
String pdbChainCode = subjectId.substring(4); |
175 |
|
|
PdbasePdb pdb = new PdbasePdb(pdbCode, pdbaseDb, conn); |
176 |
|
|
pdb.load(pdbChainCode); |
177 |
|
|
TreeMap<Integer, Integer> mapObsSerials2resser = pdb.getObservedResMapping(); |
178 |
|
|
|
179 |
duarte |
623 |
if (!mapObsSerials2resser.containsKey(subjectStart) || !mapObsSerials2resser.containsKey(subjectEnd)) { |
180 |
|
|
System.err.println("Cannot map observed residue sequence serials to internal residue serials for "+subjectId+": either subject start or end serial are out of range"); |
181 |
|
|
return; // can't map: return |
182 |
|
|
} |
183 |
duarte |
618 |
subjectStart = mapObsSerials2resser.get(subjectStart); |
184 |
|
|
subjectEnd = mapObsSerials2resser.get(subjectEnd); |
185 |
|
|
} |
186 |
duarte |
619 |
|
187 |
|
|
/** |
188 |
|
|
* Prints a few selected fields for this GTG hit plus a graphical representation of the match. |
189 |
|
|
* The match is scaled by the given scale factor and rounded to screen columns. |
190 |
|
|
* @param scaleFactor |
191 |
|
|
*/ |
192 |
|
|
public void printWithOverview(double scaleFactor) { |
193 |
|
|
System.out.printf("%5s\t%5s\t%5.1f\t%7d ", queryId, subjectId, getPercentIdentity(), totalScore); |
194 |
|
|
int beg = (int) Math.floor(scaleFactor * this.queryStart); |
195 |
|
|
int end = (int) Math.ceil(scaleFactor * this.queryEnd); |
196 |
|
|
printOverviewLine(beg, end); |
197 |
|
|
} |
198 |
|
|
|
199 |
|
|
/** |
200 |
|
|
* Print the column headers corresponding to the printWithOverview() method. |
201 |
|
|
* Additionally prints a graphical overview of the query (queryLength scaled by scaleFactor). |
202 |
|
|
* @param queryLength |
203 |
|
|
* @param scaleFactor |
204 |
|
|
*/ |
205 |
|
|
public static void printHeaderWithOverview(int queryLength, double scaleFactor) { |
206 |
|
|
System.out.printf("%5s\t%5s\t%5s\t%7s ", "query", "sbjct", "id%", "sc"); |
207 |
|
|
int beg = 1; |
208 |
|
|
int end = (int) Math.ceil(scaleFactor * queryLength); |
209 |
|
|
printOverviewLine(beg, end); |
210 |
|
|
} |
211 |
|
|
|
212 |
|
|
/** |
213 |
|
|
* Print one line of the match overview. |
214 |
|
|
* @param beg the beginning of the match in screen columns |
215 |
|
|
* @param end the end of the match in screen columns |
216 |
|
|
*/ |
217 |
|
|
private static void printOverviewLine(int beg, int end) { |
218 |
|
|
for (int i = 1; i < beg; i++) { |
219 |
|
|
System.out.print(" "); |
220 |
|
|
} |
221 |
|
|
for (int i = beg; i <= end; i++) { |
222 |
|
|
System.out.print("-"); |
223 |
|
|
} |
224 |
|
|
System.out.println(); |
225 |
|
|
} |
226 |
|
|
|
227 |
|
|
/** |
228 |
duarte |
622 |
* Returns the template id as concatenated pdbCode+pdbChainCode e.g. 1abcA |
229 |
duarte |
619 |
* @return the template id or null if queryId is not in the right format |
230 |
|
|
*/ |
231 |
|
|
public String getTemplateId() { |
232 |
|
|
Pattern p = Pattern.compile(ID_REGEX); |
233 |
|
|
Matcher m = p.matcher(subjectId); |
234 |
|
|
if (m.matches()) { |
235 |
|
|
return m.group(1).toLowerCase()+m.group(2).toUpperCase(); |
236 |
|
|
} |
237 |
|
|
return null; |
238 |
|
|
} |
239 |
|
|
|
240 |
duarte |
618 |
} |