emapimputation/trunk/symmetricLLS.py

#!/usr/bin/env python
# encoding: utf-8
"""
symmetricNN.py

This is the implementation of the weighted symmetric nearest neighbour algorithms described in "Missing Value Imputation for Epistatic MAPs"

Those only interested in understanding the algorithm should look at the performWNN function.

Further information and updates to this implementation will be available at : http://www.bioinformatics.org/emapimputation

For any queries please contact colm.ryan@ucd.ie

Version : 1.1 

$Rev::               $:  Revision of last commit
$Author::            $:  Author of last commit
$Date::              $:  Date of last commit

Copyright 2009 Colm Ryan 

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions 
and limitations under the License. 

"""

import sys
import getopt
import csv
import math
import numpy as np
import numpy.ma as ma

DEFAULT_K = 20
MISSING = ""
SEPARATOR = "\t"

def get_ordered_tuple(item_a, item_b) :
    """
    Takes two items as input & returns an ordered tuple as the result.
    
    Useful for interactions so that (A,B) & (B,A) are not treated seperately
    
    """
    if cmp(item_a,item_b) <= 0:
        return (item_a, item_b)
    else :
        return (item_b, item_a)

def read_square_dataset(file_location, missing_string, separator) :
    """
    Reads in a square data matrix
    
    Returns a map of present interactions to scores, interaction profiles for each gene,
    and a list of names of genes present in the dataset.
    
    """
    interactions = {}
    
    gene_profiles = {}
    csv.register_dialect('custom', delimiter=separator)
    lines = list(csv.reader(open(file_location, 'r'),'custom'))
    
    gene_list = lines[0]
    gene_count = len(gene_list)
    
    
    for x in range(1,gene_count) :      
        gene_x = lines[x][0]
        
        for y in range(x+1,gene_count) :
            gene_y = gene_list[y]
            interaction = lines[x][y]
            if interaction != missing_string :
                score = float(interaction)
                xy = get_ordered_tuple(gene_x,gene_y)
                interactions[xy] = score
                
                if gene_x not in gene_profiles :
                    gene_profiles[gene_x] = {}

                if gene_y not in gene_profiles :
                    gene_profiles[gene_y] = {}

                gene_profiles[gene_y][gene_x] = score
                gene_profiles[gene_x][gene_y] = score

    gene_list = gene_list[1:]
    
    return [interactions, gene_profiles, gene_list]


def LLSWrapper(data, k):
    """
    Reads in a square data matrix, removes row and column headers 
    and converts the matrix to numpy.array format.
    

    """
    interactions = data[0]
    interaction_profiles = data[1]
    gene_list = data[2]
    
    gene_map = {}
    for i in range(len(gene_list)) :
        gene_map[gene_list[i]] = i
        
    imputed = {}
    matrix = []
    counts = {}
    missing_values = set()
    
    #set missing values to 999.0
    for geneI in gene_list :
        gene_count = 0
        Iarray = []
        for geneJ in gene_list :
            interaction = get_ordered_tuple(geneI,geneJ)
            if interaction in interactions :
                Iarray.append(interactions[interaction])
                gene_count+=1
            else :
                Iarray.append(999.0)
                missing_values.add(interaction)   
        matrix.append(Iarray)
    
    filled = performLLS(np.asarray(matrix),999.0, k)
    
    for pair in missing_values:
        x = gene_map[pair[0]]
        y = gene_map[pair[1]]
        imputed[pair] = filled[x][y]
    
    return imputed    

def performLLS(input_matrix, missing_string, k) :
    """
    Reads in a square data matrix

    Returns a numpy.ma array of interactions

    """
    #Read data & create masked array
    masked_input = ma.masked_equal(input_matrix,missing_string)
    gene_count = len(input_matrix)
    missing_values = np.where(input_matrix==missing_string)

    #Get similarity matrix
    similarity = ma.corrcoef(masked_input)
    similarity = ma.filled(similarity,fill_value=0.0)

    #Find sorted neighbour list for each gene
    neighbours = {}
    for i in range(gene_count) :
        #get list of neighbours in reverse order
        neighbours[i] = np.argsort(np.absolute(similarity[i]))[::-1]
        #remove self from neighbour list
        neighbours[i] = np.delete(neighbours[i],np.where(neighbours[i]==i))

    # Calculate average for each gene
    gene_averages = np.zeros(gene_count)
    for i in range(len(masked_input)) :
        gene = masked_input[i]
        mean = ma.mean(gene)
        if mean != mean : #if mean not a real number
            mean = 0
        gene_averages[i] = mean

    # Initial estimate of missing value(row & column average)
    filled = input_matrix.copy()    
    for i in range(len(missing_values[0])) :
        x = missing_values[0][i]
        y = missing_values[1][i]
        filled[x][y] = (gene_averages[x] + gene_averages[y]) / 2

    estimates = {}
    unused_rows = []
    for i in range(gene_count) :
        missing = np.where(input_matrix[i]==missing_string)[0]
        non_missing = np.where(input_matrix[i]!=missing_string)[0]
        if (len(missing) > 0) and (len(non_missing) > 0):
            estimates[i] = {}
            ineighbours = np.take(filled,neighbours[i][0:k],axis=0)
            A = np.take(ineighbours,non_missing,axis=1).T
            B = np.take(ineighbours,missing, axis=1).T
            w = np.take(input_matrix[i],non_missing)
            A_inv = np.linalg.pinv(A)
            X = np.dot(A_inv,w)
            values = np.dot(B,X)
            for j in range(len(missing)) :
                estimates[i][missing[j]] = values[j]
        else :
             unused_rows.append(i)

    for i in range(len(missing_values[0])) :
        x = missing_values[0][i]
        y = missing_values[1][i]
        if (x not in unused_rows) and (y not in unused_rows) :
            filled[x][y] = (estimates[x][y] + estimates[y][x]) / 2

    #Self interactions are set to 0    
    for i in range(len(filled)) :
        filled[i][i] = 0   

    return filled    

def output_results(filename, separator, gene_list, scores,imputed) :
    """
    Outputs a square data matrix
    
    """    
    f = open(filename,'w')
    
    for i in range(len(gene_list)) :
        f.write("%s%s" % (separator,gene_list[i]))

    f.write('\n')   

    for i in range(len(gene_list)) :
        f.write("%s" % gene_list[i])
        for j in range(len(gene_list)) :
            f.write(separator)
            interaction = get_ordered_tuple(gene_list[i],gene_list[j])
            if interaction in scores :
                score = scores[interaction]             
                
            elif interaction in imputed :
                score = imputed[interaction] 
            else : # Only happens for self interactions(where geneI == geneJ)
                score = 0.0    
            f.write("%s" % score)    
        f.write('\n')   
    f.close()

    return 0


#Below is the executable part of the program
    
help_message = '''
This is an implementation of the weighted local least squares algorithm,
as described in "Missing Value Imputation for Epistatic MAPs"

Requires Numpy - tested on version 1.4.0

Further information, and updates to this implementation will be available at : http://mlg.ucd.ie/emapimputation

Required parameters are as follows:
-i --input (filename): the E-MAP file to use as input
-o --output (filename): the file to output the resulting complete matrix to

Optional parameters as follows:
-k --neighbours (number): the number of neighbours to use for imputation, defaults to 20
-s --separator (character): the separator used in the input file, defaults to tab
-m --missing (string): the string to indicate a missing value, defaults to ""
-h --help: displays this message  

'''

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

def main(argv=None):
    k = DEFAULT_K
    weighted = True
    separator = SEPARATOR
    missing = MISSING
    
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "hi:o:k:us:m:v", ["help","input=","output=","neighbours=","unweighted","separator=","missing="])
        except getopt.error, msg:
            raise Usage(msg)
    
        # option processing
        for option, value in opts:
            if option == "-v":
                verbose = True
            elif option in ("-h", "--help"):
                raise Usage(help_message)
            elif option in ("-i", "--input"):
                input_file = value
            elif option in ("-o", "--output"):
                output = value
            elif option in ("-k", "--neighbours"):
                k = int(value)
            elif option in ("-u", "--unweighted"):
                weighted = False
            elif option in ("-s", "--separator"):
                separator = value
            elif option in ("-m", "--missing"):
                missing = value   
        
        try :
            output
            input_file
        except NameError :
            print "Input and output filenames must be specified\n"
            raise Usage(help_message)
        else :
            #The runnable part of the program...
            data = read_square_dataset(input_file,missing,separator)
            imputed = LLSWrapper(data, k)
            output_results(output,separator,data[2],data[0],imputed)
            
    except Usage, err:
        print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
        print >> sys.stderr, "\t for help use --help"
        return 2              
    
    
    return 0

if __name__ == "__main__":
    sys.exit(main())
Revision:	6
Committed:	Mon Sep 13 21:14:26 2010 UTC (14 years, 4 months ago) by colmryan
File size:	10199 byte(s)
Log Message:	Adding versioning information
Line	File contents
1	#!/usr/bin/env python
2	# encoding: utf-8
3	"""
4	symmetricNN.py
5
6	This is the implementation of the weighted symmetric nearest neighbour algorithms described in "Missing Value Imputation for Epistatic MAPs"
7
8	Those only interested in understanding the algorithm should look at the performWNN function.
9
10	Further information and updates to this implementation will be available at : http://www.bioinformatics.org/emapimputation
11
12	For any queries please contact colm.ryan@ucd.ie
13
14	Version : 1.1
15
16	$Rev:: $: Revision of last commit
17	$Author:: $: Author of last commit
18	$Date:: $: Date of last commit
19
20	Copyright 2009 Colm Ryan
21
22	Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
23	You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
24	Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
25	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
26	and limitations under the License.
27
28	"""
29
30	import sys
31	import getopt
32	import csv
33	import math
34	import numpy as np
35	import numpy.ma as ma
36
37	DEFAULT_K = 20
38	MISSING = ""
39	SEPARATOR = "\t"
40
41	def get_ordered_tuple(item_a, item_b) :
42	"""
43	Takes two items as input & returns an ordered tuple as the result.
44
45	Useful for interactions so that (A,B) & (B,A) are not treated seperately
46
47	"""
48	if cmp(item_a,item_b) <= 0:
49	return (item_a, item_b)
50	else :
51	return (item_b, item_a)
52
53	def read_square_dataset(file_location, missing_string, separator) :
54	"""
55	Reads in a square data matrix
56
57	Returns a map of present interactions to scores, interaction profiles for each gene,
58	and a list of names of genes present in the dataset.
59
60	"""
61	interactions = {}
62
63	gene_profiles = {}
64	csv.register_dialect('custom', delimiter=separator)
65	lines = list(csv.reader(open(file_location, 'r'),'custom'))
66
67	gene_list = lines[0]
68	gene_count = len(gene_list)
69
70
71	for x in range(1,gene_count) :
72	gene_x = lines[x][0]
73
74	for y in range(x+1,gene_count) :
75	gene_y = gene_list[y]
76	interaction = lines[x][y]
77	if interaction != missing_string :
78	score = float(interaction)
79	xy = get_ordered_tuple(gene_x,gene_y)
80	interactions[xy] = score
81
82	if gene_x not in gene_profiles :
83	gene_profiles[gene_x] = {}
84
85	if gene_y not in gene_profiles :
86	gene_profiles[gene_y] = {}
87
88	gene_profiles[gene_y][gene_x] = score
89	gene_profiles[gene_x][gene_y] = score
90
91	gene_list = gene_list[1:]
92
93	return [interactions, gene_profiles, gene_list]
94
95
96	def LLSWrapper(data, k):
97	"""
98	Reads in a square data matrix, removes row and column headers
99	and converts the matrix to numpy.array format.
100
101
102	"""
103	interactions = data[0]
104	interaction_profiles = data[1]
105	gene_list = data[2]
106
107	gene_map = {}
108	for i in range(len(gene_list)) :
109	gene_map[gene_list[i]] = i
110
111	imputed = {}
112	matrix = []
113	counts = {}
114	missing_values = set()
115
116	#set missing values to 999.0
117	for geneI in gene_list :
118	gene_count = 0
119	Iarray = []
120	for geneJ in gene_list :
121	interaction = get_ordered_tuple(geneI,geneJ)
122	if interaction in interactions :
123	Iarray.append(interactions[interaction])
124	gene_count+=1
125	else :
126	Iarray.append(999.0)
127	missing_values.add(interaction)
128	matrix.append(Iarray)
129
130	filled = performLLS(np.asarray(matrix),999.0, k)
131
132	for pair in missing_values:
133	x = gene_map[pair[0]]
134	y = gene_map[pair[1]]
135	imputed[pair] = filled[x][y]
136
137	return imputed
138
139	def performLLS(input_matrix, missing_string, k) :
140	"""
141	Reads in a square data matrix
142
143	Returns a numpy.ma array of interactions
144
145	"""
146	#Read data & create masked array
147	masked_input = ma.masked_equal(input_matrix,missing_string)
148	gene_count = len(input_matrix)
149	missing_values = np.where(input_matrix==missing_string)
150
151	#Get similarity matrix
152	similarity = ma.corrcoef(masked_input)
153	similarity = ma.filled(similarity,fill_value=0.0)
154
155	#Find sorted neighbour list for each gene
156	neighbours = {}
157	for i in range(gene_count) :
158	#get list of neighbours in reverse order
159	neighbours[i] = np.argsort(np.absolute(similarity[i]))[::-1]
160	#remove self from neighbour list
161	neighbours[i] = np.delete(neighbours[i],np.where(neighbours[i]==i))
162
163	# Calculate average for each gene
164	gene_averages = np.zeros(gene_count)
165	for i in range(len(masked_input)) :
166	gene = masked_input[i]
167	mean = ma.mean(gene)
168	if mean != mean : #if mean not a real number
169	mean = 0
170	gene_averages[i] = mean
171
172	# Initial estimate of missing value(row & column average)
173	filled = input_matrix.copy()
174	for i in range(len(missing_values[0])) :
175	x = missing_values[0][i]
176	y = missing_values[1][i]
177	filled[x][y] = (gene_averages[x] + gene_averages[y]) / 2
178
179	estimates = {}
180	unused_rows = []
181	for i in range(gene_count) :
182	missing = np.where(input_matrix[i]==missing_string)[0]
183	non_missing = np.where(input_matrix[i]!=missing_string)[0]
184	if (len(missing) > 0) and (len(non_missing) > 0):
185	estimates[i] = {}
186	ineighbours = np.take(filled,neighbours[i][0:k],axis=0)
187	A = np.take(ineighbours,non_missing,axis=1).T
188	B = np.take(ineighbours,missing, axis=1).T
189	w = np.take(input_matrix[i],non_missing)
190	A_inv = np.linalg.pinv(A)
191	X = np.dot(A_inv,w)
192	values = np.dot(B,X)
193	for j in range(len(missing)) :
194	estimates[i][missing[j]] = values[j]
195	else :
196	unused_rows.append(i)
197
198	for i in range(len(missing_values[0])) :
199	x = missing_values[0][i]
200	y = missing_values[1][i]
201	if (x not in unused_rows) and (y not in unused_rows) :
202	filled[x][y] = (estimates[x][y] + estimates[y][x]) / 2
203
204	#Self interactions are set to 0
205	for i in range(len(filled)) :
206	filled[i][i] = 0
207
208	return filled
209
210	def output_results(filename, separator, gene_list, scores,imputed) :
211	"""
212	Outputs a square data matrix
213
214	"""
215	f = open(filename,'w')
216
217	for i in range(len(gene_list)) :
218	f.write("%s%s" % (separator,gene_list[i]))
219
220	f.write('\n')
221
222	for i in range(len(gene_list)) :
223	f.write("%s" % gene_list[i])
224	for j in range(len(gene_list)) :
225	f.write(separator)
226	interaction = get_ordered_tuple(gene_list[i],gene_list[j])
227	if interaction in scores :
228	score = scores[interaction]
229
230	elif interaction in imputed :
231	score = imputed[interaction]
232	else : # Only happens for self interactions(where geneI == geneJ)
233	score = 0.0
234	f.write("%s" % score)
235	f.write('\n')
236	f.close()
237
238	return 0
239
240
241	#Below is the executable part of the program
242
243	help_message = '''
244	This is an implementation of the weighted local least squares algorithm,
245	as described in "Missing Value Imputation for Epistatic MAPs"
246
247	Requires Numpy - tested on version 1.4.0
248
249	Further information, and updates to this implementation will be available at : http://mlg.ucd.ie/emapimputation
250
251	Required parameters are as follows:
252	-i --input (filename): the E-MAP file to use as input
253	-o --output (filename): the file to output the resulting complete matrix to
254
255	Optional parameters as follows:
256	-k --neighbours (number): the number of neighbours to use for imputation, defaults to 20
257	-s --separator (character): the separator used in the input file, defaults to tab
258	-m --missing (string): the string to indicate a missing value, defaults to ""
259	-h --help: displays this message
260
261	'''
262
263	class Usage(Exception):
264	def __init__(self, msg):
265	self.msg = msg
266
267	def main(argv=None):
268	k = DEFAULT_K
269	weighted = True
270	separator = SEPARATOR
271	missing = MISSING
272
273	if argv is None:
274	argv = sys.argv
275	try:
276	try:
277	opts, args = getopt.getopt(argv[1:], "hi:o:k:us:m:v", ["help","input=","output=","neighbours=","unweighted","separator=","missing="])
278	except getopt.error, msg:
279	raise Usage(msg)
280
281	# option processing
282	for option, value in opts:
283	if option == "-v":
284	verbose = True
285	elif option in ("-h", "--help"):
286	raise Usage(help_message)
287	elif option in ("-i", "--input"):
288	input_file = value
289	elif option in ("-o", "--output"):
290	output = value
291	elif option in ("-k", "--neighbours"):
292	k = int(value)
293	elif option in ("-u", "--unweighted"):
294	weighted = False
295	elif option in ("-s", "--separator"):
296	separator = value
297	elif option in ("-m", "--missing"):
298	missing = value
299
300	try :
301	output
302	input_file
303	except NameError :
304	print "Input and output filenames must be specified\n"
305	raise Usage(help_message)
306	else :
307	#The runnable part of the program...
308	data = read_square_dataset(input_file,missing,separator)
309	imputed = LLSWrapper(data, k)
310	output_results(output,separator,data[2],data[0],imputed)
311
312	except Usage, err:
313	print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
314	print >> sys.stderr, "\t for help use --help"
315	return 2
316
317
318	return 0
319
320	if __name__ == "__main__":
321	sys.exit(main())