ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/owl/trunk/sequence/GTGHit.java
Revision: 623
Committed: Fri May 9 14:10:27 2008 UTC (16 years, 4 months ago) by duarte
File size: 8271 byte(s)
Log Message:
Fixed bug: now catching when the mapping of observed residue sequence serials to internal residue serials is out of range, which happens in some extremely rare cases (because GTG uses pre-remediation data)
Line User Rev File contents
1 duarte 618 package sequence;
2    
3     import java.sql.SQLException;
4     import java.util.TreeMap;
5 duarte 619 import java.util.regex.Matcher;
6     import java.util.regex.Pattern;
7 duarte 618
8     import proteinstructure.PdbCodeNotFoundError;
9     import proteinstructure.PdbLoadError;
10     import proteinstructure.PdbasePdb;
11     import tools.MySQLConnection;
12    
13     /**
14     * A GTG output hit
15     *
16     */
17     public class GTGHit {
18    
19 duarte 619 private static final String NULL_PDBCHAINCODE = "A";
20    
21     private static final String ID_REGEX = "(\\d\\w\\w\\w)(\\w)";
22 duarte 618
23     String queryId;
24     String subjectId;
25    
26     int queryStart;
27     int queryEnd;
28     int subjectStart;
29     int subjectEnd;
30    
31     int identities;
32     //int missmatches; // do we want this?
33     int alnLength;
34     // alignment // do we want to store the full alignment or is it enough with the above?
35    
36     int totalScore;
37     int consistencyScore;
38     int motifScore;
39    
40     String subjectSequence;
41    
42    
43    
44     public GTGHit(String queryId, String subjectId, int totalScore, int consistencyScore, int motifScore) {
45     this.queryId = queryId;
46     this.subjectId = subjectId;
47    
48     // We asume here that the subjectId is a PDB id in the form pdbCode+pdbChainCode
49     // It seems that GTG uses pre-remediated PDB data: we have to add an "A" when length of subjectId is only 4, i.e. missing (NULL) chain code
50     if (this.subjectId.length()==4) {
51     this.subjectId += NULL_PDBCHAINCODE;
52     }
53    
54     this.totalScore = totalScore;
55     this.consistencyScore = consistencyScore;
56     this.motifScore = motifScore;
57    
58     }
59    
60 duarte 619 public int getTotalScore() {
61     return this.totalScore;
62     }
63    
64 duarte 618 public void setQueryStart(int queryStart) {
65     this.queryStart = queryStart;
66     }
67    
68     public void setQueryEnd(int queryEnd) {
69     this.queryEnd = queryEnd;
70     }
71    
72     public void setSubjectStart(int subjectStart) {
73     this.subjectStart = subjectStart;
74     }
75    
76     public void setSubjectEnd(int subjectEnd) {
77     this.subjectEnd = subjectEnd;
78     }
79    
80     public void setSubjectSequence(String subjectSequence) {
81     this.subjectSequence = subjectSequence;
82     }
83    
84     public String getSubjectSequence() {
85     return subjectSequence;
86     }
87    
88     public void setIdentities(int identities) {
89     this.identities = identities;
90     }
91    
92     public void setAlnLength(int alnLength) {
93     this.alnLength = alnLength;
94     }
95    
96     public double getPercentIdentity() {
97     return ((double) identities/alnLength)*100;
98     }
99    
100     public void print() {
101     System.out.printf("%8s %8s %6d %6d %6d %6d %6d %6d %5.1f %6d %6d %6d \n",
102     queryId, subjectId, queryStart, queryEnd, subjectStart, subjectEnd, identities, alnLength, getPercentIdentity(),totalScore, consistencyScore, motifScore);
103     }
104    
105     /**
106     * Checks the subject sequence and compares it to the pdbase sequence,
107     * provided that the subjectId is a PDB identifier in the form pdbCode+pdbChainCode
108     * @param conn
109     * @param pdbaseDb
110     * @return false if sequences don't match, true if they do
111     * @throws PdbCodeNotFoundError
112     * @throws SQLException
113     * @throws PdbLoadError
114     */
115     public boolean checkSubjectSequence(MySQLConnection conn, String pdbaseDb) throws PdbCodeNotFoundError, SQLException, PdbLoadError {
116     if (subjectSequence.equals("")) {
117     System.err.println("Subject sequence is empty");
118     return false;
119     }
120     String pdbCode = subjectId.substring(0, 4);
121     String pdbChainCode = subjectId.substring(4);
122     PdbasePdb pdb = new PdbasePdb(pdbCode, pdbaseDb, conn);
123     pdb.load(pdbChainCode);
124     TreeMap<Integer, Integer> mapObsSerials2resser = pdb.getObservedResMapping();
125     String pdbaseSequence = pdb.getSequence();
126    
127     for (int i=0; i<subjectSequence.length();i++) {
128     int indexInFullSeq = mapObsSerials2resser.get(i+1)-1;
129     if (subjectSequence.charAt(i)!=pdbaseSequence.charAt(indexInFullSeq)) {
130     System.err.println("GTG output subject's sequence doesn't match sequence from pdbase at position "+(i+1)+"(internal res. serial: "+(indexInFullSeq+1)+"). subject: "+subjectSequence.charAt(i)+", pdbase: "+pdbaseSequence.charAt(indexInFullSeq));
131     return false;
132     }
133     }
134    
135     // the following would have worked as well. But our getObservedSequence() was getting observed sequences without non-standard aas, so the matching of sequences was failing a lot
136     // String pdbaseSequence = pdb.getObservedSequence();
137     //
138     // // 1st check: subject sequence must not be longer than full sequence from pdbase
139     // // this also prevents for getting an out of bound index error in following loop
140     // if (subjectSequence.length()>pdbaseSequence.length()) {
141     // System.err.println("GTG output subject's sequence is longer than sequence from pdbase.");
142     // return false;
143     // }
144     //
145     // for (int i=0; i<subjectSequence.length();i++) {
146     // // 2nd check: all residues match
147     // if (subjectSequence.charAt(i)!=pdbaseSequence.charAt(i)) {
148     // System.err.println("GTG output subject's sequence doesn't match sequence from pdbase at position "+(i+1)+". subject: "+subjectSequence.charAt(i)+", pdbase: "+pdbaseSequence.charAt(i));
149     // return false;
150     // }
151     // }
152     return true;
153     }
154    
155     /**
156     * Reassigns the serials of subjectStart and subjectEnd positions based on the
157     * pdbase full SEQRES sequence, i.e. converts GTG serials to our internal residue
158     * serials (cif serials)
159     * NOTE: there are some data compatibility issues with the output of GTG. Sometimes
160     * our mapping of observed sequence serials (numbering used by GTG) and pdbase full
161     * sequence is not perfect, so it might be off a few residues at most. Anyway this
162     * happens in extremely rare cases where a residue is considered observed in pdbase
163     * and not by GTG (examples are: 1fdpB residue 136, 1tonA residue 79)
164     * @param conn
165     * @param pdbaseDb
166     * @throws SQLException
167     * @throws PdbCodeNotFoundError
168     * @throws PdbLoadError
169     */
170     public void reassignSubjectSerials(MySQLConnection conn, String pdbaseDb) throws SQLException, PdbCodeNotFoundError, PdbLoadError {
171     if (subjectStart==0 && subjectEnd ==0 ) return; // subject start and end not known: nothing to do
172    
173     String pdbCode = subjectId.substring(0, 4);
174     String pdbChainCode = subjectId.substring(4);
175     PdbasePdb pdb = new PdbasePdb(pdbCode, pdbaseDb, conn);
176     pdb.load(pdbChainCode);
177     TreeMap<Integer, Integer> mapObsSerials2resser = pdb.getObservedResMapping();
178    
179 duarte 623 if (!mapObsSerials2resser.containsKey(subjectStart) || !mapObsSerials2resser.containsKey(subjectEnd)) {
180     System.err.println("Cannot map observed residue sequence serials to internal residue serials for "+subjectId+": either subject start or end serial are out of range");
181     return; // can't map: return
182     }
183 duarte 618 subjectStart = mapObsSerials2resser.get(subjectStart);
184     subjectEnd = mapObsSerials2resser.get(subjectEnd);
185     }
186 duarte 619
187     /**
188     * Prints a few selected fields for this GTG hit plus a graphical representation of the match.
189     * The match is scaled by the given scale factor and rounded to screen columns.
190     * @param scaleFactor
191     */
192     public void printWithOverview(double scaleFactor) {
193     System.out.printf("%5s\t%5s\t%5.1f\t%7d ", queryId, subjectId, getPercentIdentity(), totalScore);
194     int beg = (int) Math.floor(scaleFactor * this.queryStart);
195     int end = (int) Math.ceil(scaleFactor * this.queryEnd);
196     printOverviewLine(beg, end);
197     }
198    
199     /**
200     * Print the column headers corresponding to the printWithOverview() method.
201     * Additionally prints a graphical overview of the query (queryLength scaled by scaleFactor).
202     * @param queryLength
203     * @param scaleFactor
204     */
205     public static void printHeaderWithOverview(int queryLength, double scaleFactor) {
206     System.out.printf("%5s\t%5s\t%5s\t%7s ", "query", "sbjct", "id%", "sc");
207     int beg = 1;
208     int end = (int) Math.ceil(scaleFactor * queryLength);
209     printOverviewLine(beg, end);
210     }
211    
212     /**
213     * Print one line of the match overview.
214     * @param beg the beginning of the match in screen columns
215     * @param end the end of the match in screen columns
216     */
217     private static void printOverviewLine(int beg, int end) {
218     for (int i = 1; i < beg; i++) {
219     System.out.print(" ");
220     }
221     for (int i = beg; i <= end; i++) {
222     System.out.print("-");
223     }
224     System.out.println();
225     }
226    
227     /**
228 duarte 622 * Returns the template id as concatenated pdbCode+pdbChainCode e.g. 1abcA
229 duarte 619 * @return the template id or null if queryId is not in the right format
230     */
231     public String getTemplateId() {
232     Pattern p = Pattern.compile(ID_REGEX);
233     Matcher m = p.matcher(subjectId);
234     if (m.matches()) {
235     return m.group(1).toLowerCase()+m.group(2).toUpperCase();
236     }
237     return null;
238     }
239    
240 duarte 618 }