1 |
#!/usr/bin/env python |
2 |
# encoding: utf-8 |
3 |
""" |
4 |
symmetricNN.py |
5 |
|
6 |
This is the implementation of the weighted symmetric nearest neighbour algorithms described in "Missing Value Imputation for Epistatic MAPs" |
7 |
|
8 |
Those only interested in understanding the algorithm should look at the performWNN function. |
9 |
|
10 |
Further information and updates to this implementation will be available at : http://www.bioinformatics.org/emapimputation |
11 |
|
12 |
For any queries please contact colm.ryan@ucd.ie |
13 |
|
14 |
Version : 1.1 |
15 |
|
16 |
$Rev:: $: Revision of last commit |
17 |
$Author:: $: Author of last commit |
18 |
$Date:: $: Date of last commit |
19 |
|
20 |
Copyright 2009 Colm Ryan |
21 |
|
22 |
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. |
23 |
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 |
24 |
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, |
25 |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions |
26 |
and limitations under the License. |
27 |
|
28 |
""" |
29 |
|
30 |
import sys |
31 |
import getopt |
32 |
import csv |
33 |
import math |
34 |
import numpy as np |
35 |
import numpy.ma as ma |
36 |
|
37 |
DEFAULT_K = 20 |
38 |
MISSING = "" |
39 |
SEPARATOR = "\t" |
40 |
|
41 |
def get_ordered_tuple(item_a, item_b) : |
42 |
""" |
43 |
Takes two items as input & returns an ordered tuple as the result. |
44 |
|
45 |
Useful for interactions so that (A,B) & (B,A) are not treated seperately |
46 |
|
47 |
""" |
48 |
if cmp(item_a,item_b) <= 0: |
49 |
return (item_a, item_b) |
50 |
else : |
51 |
return (item_b, item_a) |
52 |
|
53 |
def read_square_dataset(file_location, missing_string, separator) : |
54 |
""" |
55 |
Reads in a square data matrix |
56 |
|
57 |
Returns a map of present interactions to scores, interaction profiles for each gene, |
58 |
and a list of names of genes present in the dataset. |
59 |
|
60 |
""" |
61 |
interactions = {} |
62 |
|
63 |
gene_profiles = {} |
64 |
csv.register_dialect('custom', delimiter=separator) |
65 |
lines = list(csv.reader(open(file_location, 'r'),'custom')) |
66 |
|
67 |
gene_list = lines[0] |
68 |
gene_count = len(gene_list) |
69 |
|
70 |
|
71 |
for x in range(1,gene_count) : |
72 |
gene_x = lines[x][0] |
73 |
|
74 |
for y in range(x+1,gene_count) : |
75 |
gene_y = gene_list[y] |
76 |
interaction = lines[x][y] |
77 |
if interaction != missing_string : |
78 |
score = float(interaction) |
79 |
xy = get_ordered_tuple(gene_x,gene_y) |
80 |
interactions[xy] = score |
81 |
|
82 |
if gene_x not in gene_profiles : |
83 |
gene_profiles[gene_x] = {} |
84 |
|
85 |
if gene_y not in gene_profiles : |
86 |
gene_profiles[gene_y] = {} |
87 |
|
88 |
gene_profiles[gene_y][gene_x] = score |
89 |
gene_profiles[gene_x][gene_y] = score |
90 |
|
91 |
gene_list = gene_list[1:] |
92 |
|
93 |
return [interactions, gene_profiles, gene_list] |
94 |
|
95 |
|
96 |
def LLSWrapper(data, k): |
97 |
""" |
98 |
Reads in a square data matrix, removes row and column headers |
99 |
and converts the matrix to numpy.array format. |
100 |
|
101 |
|
102 |
""" |
103 |
interactions = data[0] |
104 |
interaction_profiles = data[1] |
105 |
gene_list = data[2] |
106 |
|
107 |
gene_map = {} |
108 |
for i in range(len(gene_list)) : |
109 |
gene_map[gene_list[i]] = i |
110 |
|
111 |
imputed = {} |
112 |
matrix = [] |
113 |
counts = {} |
114 |
missing_values = set() |
115 |
|
116 |
#set missing values to 999.0 |
117 |
for geneI in gene_list : |
118 |
gene_count = 0 |
119 |
Iarray = [] |
120 |
for geneJ in gene_list : |
121 |
interaction = get_ordered_tuple(geneI,geneJ) |
122 |
if interaction in interactions : |
123 |
Iarray.append(interactions[interaction]) |
124 |
gene_count+=1 |
125 |
else : |
126 |
Iarray.append(999.0) |
127 |
missing_values.add(interaction) |
128 |
matrix.append(Iarray) |
129 |
|
130 |
filled = performLLS(np.asarray(matrix),999.0, k) |
131 |
|
132 |
for pair in missing_values: |
133 |
x = gene_map[pair[0]] |
134 |
y = gene_map[pair[1]] |
135 |
imputed[pair] = filled[x][y] |
136 |
|
137 |
return imputed |
138 |
|
139 |
def performLLS(input_matrix, missing_string, k) : |
140 |
""" |
141 |
Reads in a square data matrix |
142 |
|
143 |
Returns a numpy.ma array of interactions |
144 |
|
145 |
""" |
146 |
#Read data & create masked array |
147 |
masked_input = ma.masked_equal(input_matrix,missing_string) |
148 |
gene_count = len(input_matrix) |
149 |
missing_values = np.where(input_matrix==missing_string) |
150 |
|
151 |
#Get similarity matrix |
152 |
similarity = ma.corrcoef(masked_input) |
153 |
similarity = ma.filled(similarity,fill_value=0.0) |
154 |
|
155 |
#Find sorted neighbour list for each gene |
156 |
neighbours = {} |
157 |
for i in range(gene_count) : |
158 |
#get list of neighbours in reverse order |
159 |
neighbours[i] = np.argsort(np.absolute(similarity[i]))[::-1] |
160 |
#remove self from neighbour list |
161 |
neighbours[i] = np.delete(neighbours[i],np.where(neighbours[i]==i)) |
162 |
|
163 |
# Calculate average for each gene |
164 |
gene_averages = np.zeros(gene_count) |
165 |
for i in range(len(masked_input)) : |
166 |
gene = masked_input[i] |
167 |
mean = ma.mean(gene) |
168 |
if mean != mean : #if mean not a real number |
169 |
mean = 0 |
170 |
gene_averages[i] = mean |
171 |
|
172 |
# Initial estimate of missing value(row & column average) |
173 |
filled = input_matrix.copy() |
174 |
for i in range(len(missing_values[0])) : |
175 |
x = missing_values[0][i] |
176 |
y = missing_values[1][i] |
177 |
filled[x][y] = (gene_averages[x] + gene_averages[y]) / 2 |
178 |
|
179 |
estimates = {} |
180 |
unused_rows = [] |
181 |
for i in range(gene_count) : |
182 |
missing = np.where(input_matrix[i]==missing_string)[0] |
183 |
non_missing = np.where(input_matrix[i]!=missing_string)[0] |
184 |
if (len(missing) > 0) and (len(non_missing) > 0): |
185 |
estimates[i] = {} |
186 |
ineighbours = np.take(filled,neighbours[i][0:k],axis=0) |
187 |
A = np.take(ineighbours,non_missing,axis=1).T |
188 |
B = np.take(ineighbours,missing, axis=1).T |
189 |
w = np.take(input_matrix[i],non_missing) |
190 |
A_inv = np.linalg.pinv(A) |
191 |
X = np.dot(A_inv,w) |
192 |
values = np.dot(B,X) |
193 |
for j in range(len(missing)) : |
194 |
estimates[i][missing[j]] = values[j] |
195 |
else : |
196 |
unused_rows.append(i) |
197 |
|
198 |
for i in range(len(missing_values[0])) : |
199 |
x = missing_values[0][i] |
200 |
y = missing_values[1][i] |
201 |
if (x not in unused_rows) and (y not in unused_rows) : |
202 |
filled[x][y] = (estimates[x][y] + estimates[y][x]) / 2 |
203 |
|
204 |
#Self interactions are set to 0 |
205 |
for i in range(len(filled)) : |
206 |
filled[i][i] = 0 |
207 |
|
208 |
return filled |
209 |
|
210 |
def output_results(filename, separator, gene_list, scores,imputed) : |
211 |
""" |
212 |
Outputs a square data matrix |
213 |
|
214 |
""" |
215 |
f = open(filename,'w') |
216 |
|
217 |
for i in range(len(gene_list)) : |
218 |
f.write("%s%s" % (separator,gene_list[i])) |
219 |
|
220 |
f.write('\n') |
221 |
|
222 |
for i in range(len(gene_list)) : |
223 |
f.write("%s" % gene_list[i]) |
224 |
for j in range(len(gene_list)) : |
225 |
f.write(separator) |
226 |
interaction = get_ordered_tuple(gene_list[i],gene_list[j]) |
227 |
if interaction in scores : |
228 |
score = scores[interaction] |
229 |
|
230 |
elif interaction in imputed : |
231 |
score = imputed[interaction] |
232 |
else : # Only happens for self interactions(where geneI == geneJ) |
233 |
score = 0.0 |
234 |
f.write("%s" % score) |
235 |
f.write('\n') |
236 |
f.close() |
237 |
|
238 |
return 0 |
239 |
|
240 |
|
241 |
#Below is the executable part of the program |
242 |
|
243 |
help_message = ''' |
244 |
This is an implementation of the weighted local least squares algorithm, |
245 |
as described in "Missing Value Imputation for Epistatic MAPs" |
246 |
|
247 |
Requires Numpy - tested on version 1.4.0 |
248 |
|
249 |
Further information, and updates to this implementation will be available at : http://mlg.ucd.ie/emapimputation |
250 |
|
251 |
Required parameters are as follows: |
252 |
-i --input (filename): the E-MAP file to use as input |
253 |
-o --output (filename): the file to output the resulting complete matrix to |
254 |
|
255 |
Optional parameters as follows: |
256 |
-k --neighbours (number): the number of neighbours to use for imputation, defaults to 20 |
257 |
-s --separator (character): the separator used in the input file, defaults to tab |
258 |
-m --missing (string): the string to indicate a missing value, defaults to "" |
259 |
-h --help: displays this message |
260 |
|
261 |
''' |
262 |
|
263 |
class Usage(Exception): |
264 |
def __init__(self, msg): |
265 |
self.msg = msg |
266 |
|
267 |
def main(argv=None): |
268 |
k = DEFAULT_K |
269 |
weighted = True |
270 |
separator = SEPARATOR |
271 |
missing = MISSING |
272 |
|
273 |
if argv is None: |
274 |
argv = sys.argv |
275 |
try: |
276 |
try: |
277 |
opts, args = getopt.getopt(argv[1:], "hi:o:k:us:m:v", ["help","input=","output=","neighbours=","unweighted","separator=","missing="]) |
278 |
except getopt.error, msg: |
279 |
raise Usage(msg) |
280 |
|
281 |
# option processing |
282 |
for option, value in opts: |
283 |
if option == "-v": |
284 |
verbose = True |
285 |
elif option in ("-h", "--help"): |
286 |
raise Usage(help_message) |
287 |
elif option in ("-i", "--input"): |
288 |
input_file = value |
289 |
elif option in ("-o", "--output"): |
290 |
output = value |
291 |
elif option in ("-k", "--neighbours"): |
292 |
k = int(value) |
293 |
elif option in ("-u", "--unweighted"): |
294 |
weighted = False |
295 |
elif option in ("-s", "--separator"): |
296 |
separator = value |
297 |
elif option in ("-m", "--missing"): |
298 |
missing = value |
299 |
|
300 |
try : |
301 |
output |
302 |
input_file |
303 |
except NameError : |
304 |
print "Input and output filenames must be specified\n" |
305 |
raise Usage(help_message) |
306 |
else : |
307 |
#The runnable part of the program... |
308 |
data = read_square_dataset(input_file,missing,separator) |
309 |
imputed = LLSWrapper(data, k) |
310 |
output_results(output,separator,data[2],data[0],imputed) |
311 |
|
312 |
except Usage, err: |
313 |
print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg) |
314 |
print >> sys.stderr, "\t for help use --help" |
315 |
return 2 |
316 |
|
317 |
|
318 |
return 0 |
319 |
|
320 |
if __name__ == "__main__": |
321 |
sys.exit(main()) |