ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/emapimputation/trunk/symmetricLLS.py
Revision: 6
Committed: Mon Sep 13 21:14:26 2010 UTC (14 years, 4 months ago) by colmryan
File size: 10199 byte(s)
Log Message:
Adding versioning information
Line File contents
1 #!/usr/bin/env python
2 # encoding: utf-8
3 """
4 symmetricNN.py
5
6 This is the implementation of the weighted symmetric nearest neighbour algorithms described in "Missing Value Imputation for Epistatic MAPs"
7
8 Those only interested in understanding the algorithm should look at the performWNN function.
9
10 Further information and updates to this implementation will be available at : http://www.bioinformatics.org/emapimputation
11
12 For any queries please contact colm.ryan@ucd.ie
13
14 Version : 1.1
15
16 $Rev:: $: Revision of last commit
17 $Author:: $: Author of last commit
18 $Date:: $: Date of last commit
19
20 Copyright 2009 Colm Ryan
21
22 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
23 You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
24 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
25 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
26 and limitations under the License.
27
28 """
29
30 import sys
31 import getopt
32 import csv
33 import math
34 import numpy as np
35 import numpy.ma as ma
36
37 DEFAULT_K = 20
38 MISSING = ""
39 SEPARATOR = "\t"
40
41 def get_ordered_tuple(item_a, item_b) :
42 """
43 Takes two items as input & returns an ordered tuple as the result.
44
45 Useful for interactions so that (A,B) & (B,A) are not treated seperately
46
47 """
48 if cmp(item_a,item_b) <= 0:
49 return (item_a, item_b)
50 else :
51 return (item_b, item_a)
52
53 def read_square_dataset(file_location, missing_string, separator) :
54 """
55 Reads in a square data matrix
56
57 Returns a map of present interactions to scores, interaction profiles for each gene,
58 and a list of names of genes present in the dataset.
59
60 """
61 interactions = {}
62
63 gene_profiles = {}
64 csv.register_dialect('custom', delimiter=separator)
65 lines = list(csv.reader(open(file_location, 'r'),'custom'))
66
67 gene_list = lines[0]
68 gene_count = len(gene_list)
69
70
71 for x in range(1,gene_count) :
72 gene_x = lines[x][0]
73
74 for y in range(x+1,gene_count) :
75 gene_y = gene_list[y]
76 interaction = lines[x][y]
77 if interaction != missing_string :
78 score = float(interaction)
79 xy = get_ordered_tuple(gene_x,gene_y)
80 interactions[xy] = score
81
82 if gene_x not in gene_profiles :
83 gene_profiles[gene_x] = {}
84
85 if gene_y not in gene_profiles :
86 gene_profiles[gene_y] = {}
87
88 gene_profiles[gene_y][gene_x] = score
89 gene_profiles[gene_x][gene_y] = score
90
91 gene_list = gene_list[1:]
92
93 return [interactions, gene_profiles, gene_list]
94
95
96 def LLSWrapper(data, k):
97 """
98 Reads in a square data matrix, removes row and column headers
99 and converts the matrix to numpy.array format.
100
101
102 """
103 interactions = data[0]
104 interaction_profiles = data[1]
105 gene_list = data[2]
106
107 gene_map = {}
108 for i in range(len(gene_list)) :
109 gene_map[gene_list[i]] = i
110
111 imputed = {}
112 matrix = []
113 counts = {}
114 missing_values = set()
115
116 #set missing values to 999.0
117 for geneI in gene_list :
118 gene_count = 0
119 Iarray = []
120 for geneJ in gene_list :
121 interaction = get_ordered_tuple(geneI,geneJ)
122 if interaction in interactions :
123 Iarray.append(interactions[interaction])
124 gene_count+=1
125 else :
126 Iarray.append(999.0)
127 missing_values.add(interaction)
128 matrix.append(Iarray)
129
130 filled = performLLS(np.asarray(matrix),999.0, k)
131
132 for pair in missing_values:
133 x = gene_map[pair[0]]
134 y = gene_map[pair[1]]
135 imputed[pair] = filled[x][y]
136
137 return imputed
138
139 def performLLS(input_matrix, missing_string, k) :
140 """
141 Reads in a square data matrix
142
143 Returns a numpy.ma array of interactions
144
145 """
146 #Read data & create masked array
147 masked_input = ma.masked_equal(input_matrix,missing_string)
148 gene_count = len(input_matrix)
149 missing_values = np.where(input_matrix==missing_string)
150
151 #Get similarity matrix
152 similarity = ma.corrcoef(masked_input)
153 similarity = ma.filled(similarity,fill_value=0.0)
154
155 #Find sorted neighbour list for each gene
156 neighbours = {}
157 for i in range(gene_count) :
158 #get list of neighbours in reverse order
159 neighbours[i] = np.argsort(np.absolute(similarity[i]))[::-1]
160 #remove self from neighbour list
161 neighbours[i] = np.delete(neighbours[i],np.where(neighbours[i]==i))
162
163 # Calculate average for each gene
164 gene_averages = np.zeros(gene_count)
165 for i in range(len(masked_input)) :
166 gene = masked_input[i]
167 mean = ma.mean(gene)
168 if mean != mean : #if mean not a real number
169 mean = 0
170 gene_averages[i] = mean
171
172 # Initial estimate of missing value(row & column average)
173 filled = input_matrix.copy()
174 for i in range(len(missing_values[0])) :
175 x = missing_values[0][i]
176 y = missing_values[1][i]
177 filled[x][y] = (gene_averages[x] + gene_averages[y]) / 2
178
179 estimates = {}
180 unused_rows = []
181 for i in range(gene_count) :
182 missing = np.where(input_matrix[i]==missing_string)[0]
183 non_missing = np.where(input_matrix[i]!=missing_string)[0]
184 if (len(missing) > 0) and (len(non_missing) > 0):
185 estimates[i] = {}
186 ineighbours = np.take(filled,neighbours[i][0:k],axis=0)
187 A = np.take(ineighbours,non_missing,axis=1).T
188 B = np.take(ineighbours,missing, axis=1).T
189 w = np.take(input_matrix[i],non_missing)
190 A_inv = np.linalg.pinv(A)
191 X = np.dot(A_inv,w)
192 values = np.dot(B,X)
193 for j in range(len(missing)) :
194 estimates[i][missing[j]] = values[j]
195 else :
196 unused_rows.append(i)
197
198 for i in range(len(missing_values[0])) :
199 x = missing_values[0][i]
200 y = missing_values[1][i]
201 if (x not in unused_rows) and (y not in unused_rows) :
202 filled[x][y] = (estimates[x][y] + estimates[y][x]) / 2
203
204 #Self interactions are set to 0
205 for i in range(len(filled)) :
206 filled[i][i] = 0
207
208 return filled
209
210 def output_results(filename, separator, gene_list, scores,imputed) :
211 """
212 Outputs a square data matrix
213
214 """
215 f = open(filename,'w')
216
217 for i in range(len(gene_list)) :
218 f.write("%s%s" % (separator,gene_list[i]))
219
220 f.write('\n')
221
222 for i in range(len(gene_list)) :
223 f.write("%s" % gene_list[i])
224 for j in range(len(gene_list)) :
225 f.write(separator)
226 interaction = get_ordered_tuple(gene_list[i],gene_list[j])
227 if interaction in scores :
228 score = scores[interaction]
229
230 elif interaction in imputed :
231 score = imputed[interaction]
232 else : # Only happens for self interactions(where geneI == geneJ)
233 score = 0.0
234 f.write("%s" % score)
235 f.write('\n')
236 f.close()
237
238 return 0
239
240
241 #Below is the executable part of the program
242
243 help_message = '''
244 This is an implementation of the weighted local least squares algorithm,
245 as described in "Missing Value Imputation for Epistatic MAPs"
246
247 Requires Numpy - tested on version 1.4.0
248
249 Further information, and updates to this implementation will be available at : http://mlg.ucd.ie/emapimputation
250
251 Required parameters are as follows:
252 -i --input (filename): the E-MAP file to use as input
253 -o --output (filename): the file to output the resulting complete matrix to
254
255 Optional parameters as follows:
256 -k --neighbours (number): the number of neighbours to use for imputation, defaults to 20
257 -s --separator (character): the separator used in the input file, defaults to tab
258 -m --missing (string): the string to indicate a missing value, defaults to ""
259 -h --help: displays this message
260
261 '''
262
263 class Usage(Exception):
264 def __init__(self, msg):
265 self.msg = msg
266
267 def main(argv=None):
268 k = DEFAULT_K
269 weighted = True
270 separator = SEPARATOR
271 missing = MISSING
272
273 if argv is None:
274 argv = sys.argv
275 try:
276 try:
277 opts, args = getopt.getopt(argv[1:], "hi:o:k:us:m:v", ["help","input=","output=","neighbours=","unweighted","separator=","missing="])
278 except getopt.error, msg:
279 raise Usage(msg)
280
281 # option processing
282 for option, value in opts:
283 if option == "-v":
284 verbose = True
285 elif option in ("-h", "--help"):
286 raise Usage(help_message)
287 elif option in ("-i", "--input"):
288 input_file = value
289 elif option in ("-o", "--output"):
290 output = value
291 elif option in ("-k", "--neighbours"):
292 k = int(value)
293 elif option in ("-u", "--unweighted"):
294 weighted = False
295 elif option in ("-s", "--separator"):
296 separator = value
297 elif option in ("-m", "--missing"):
298 missing = value
299
300 try :
301 output
302 input_file
303 except NameError :
304 print "Input and output filenames must be specified\n"
305 raise Usage(help_message)
306 else :
307 #The runnable part of the program...
308 data = read_square_dataset(input_file,missing,separator)
309 imputed = LLSWrapper(data, k)
310 output_results(output,separator,data[2],data[0],imputed)
311
312 except Usage, err:
313 print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
314 print >> sys.stderr, "\t for help use --help"
315 return 2
316
317
318 return 0
319
320 if __name__ == "__main__":
321 sys.exit(main())

Properties

Name Value
svn:keywords Revision Author Date