-
Notifications
You must be signed in to change notification settings - Fork 3
/
kernels.py
217 lines (176 loc) · 6.74 KB
/
kernels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import numpy as np
import scipy.sparse as sparse
from tqdm import tqdm
class Kernel():
""" Abstract Kernel class"""
def __init__(self):
pass
def similarity(self, x, y):
""" Similarity between 2 feature vectors (depends on the type of kernel)"""
return -1
def gram(self, X1, X2=None):
""" Compute the gram matrix of a data vector X where the (i,j) entry is defined as <Xi,Xj>\\
X1: data vector (n_samples_1 x n_features)
X2: data vector (n_samples_2 x n_features), if None compute the gram matrix for (X1,X1)
"""
if X2 is None:
X2=X1
n_samples_1 = X1.shape[0]
n_samples_2 = X2.shape[0]
G = np.zeros((n_samples_1, n_samples_2))
for ii in tqdm(range(n_samples_1)):
for jj in range(n_samples_2):
G[ii,jj] = self.similarity(X1[ii], X2[jj])
return G
class SumKernel(Kernel):
def __init__(self, kernels, weights=None):
""" kernels: list of kernels """
self.kernels = kernels
self.weights = weights
if self.weights is None:
self.weights = [1.0 for _ in kernels]
super().__init__()
def similarity(self, x, y):
""" x, y: string """
s = self.kernels[0].similarity(x,y) * self.weights[0]
for ii, kernel in enumerate(self.kernels[1:]):
s += kernel.similarity(x,y) * self.weights[ii]
return s
def gram(self, X1, X2=None):
""" Compute the sum of the gram matrices of all kernels\\
X1: array of string (n_samples_1,)
X2: array of string (n_samples_2,), if None compute the gram matrix for (X1,X1)
"""
G = self.kernels[0].gram(X1,X2) * self.weights[0]
for ii, kernel in tqdm(enumerate(self.kernels[1:])):
G += kernel.gram(X1,X2) * self.weights[ii]
return G
class LinearKernel(Kernel):
def __init__(self):
super().__init__()
def similarity(self, x, y):
""" linear kernel : k(x,y) = <x,y> \\
x, y: array (n_features,)
"""
return np.dot(x,y)
class GaussianKernel(Kernel):
def __init__(self, sigma,normalize=True):
super().__init__()
self.sigma = sigma
self.normalize = normalize
def similarity(self, x, y):
""" gaussian kernel : k(x,y) = 1/ sqrt(2 pi sigma2)^n * exp( - ||x-y||^2 / 2 sigma^2 )\\
x, y: array (n_features,)
"""
if self.normalize:
norm_fact = (np.sqrt(2 * np.pi) * self.sigma) ** len(x)
return np.exp(-np.linalg.norm(x-y)**2 / (2 * self.sigma**2)) / norm_fact
else:
return np.exp(-np.linalg.norm(x-y)**2 / (2 * self.sigma**2))
class PolynomialKernel(Kernel):
def __init__(self, gamma=1, coef0=1, degree=3):
super().__init__()
self.gamma = gamma
self.coef0 = coef0
self.degree = degree
def similarity(self, x, y):
""" polynomial kernel : k(x,y) = (gamma <x,y> + r)^d \\
x, y: array (n_features,)
"""
return (self.gamma * np.dot(x,y) + self.coef0)**self.degree
class SpectrumKernel(Kernel):
def __init__(self, k):
super().__init__()
self.k = k
def similarity(self, x, y):
""" Spectrum kernel \\
x, y: string
"""
substr_x, counts_x = np.unique([x[i:i+self.k] for i in range(len(x)-self.k+1)], return_counts=True)
return np.sum(np.char.count(y, substr_x)*counts_x)
class MismatchKernel(Kernel):
def __init__(self, k, m, neighbours, kmer_set, normalize=False):
super().__init__()
self.k = k
self.m = m
self.kmer_set = kmer_set #kmer_set and neighbours have to be pre-computed (to save computational time when running multiple experiments)
self.neighbours = neighbours
self.normalize = normalize
def neighbour_embed_kmer(self, x):
"""
Embed kmer with neighbours.
x: str
"""
kmer_x = [x[j:j + self.k] for j in range(len(x) - self.k + 1)]
x_emb = {}
for kmer in kmer_x:
neigh_kmer = self.neighbours[kmer]
for neigh in neigh_kmer:
idx_neigh = self.kmer_set[neigh]
if idx_neigh in x_emb:
x_emb[idx_neigh] += 1
else:
x_emb[idx_neigh] = 1
return x_emb
def neighbour_embed_data(self, X):
"""
Embed data with neighbours.
X: array of string
"""
X_emb = []
for i in range(len(X)):
x = X[i]
x_emb = self.neighbour_embed_kmer(x)
X_emb.append(x_emb)
return X_emb
def to_sparse(self, X_emb):
"""
Embed data to sparse matrix.
X_emb: list of dict.
"""
data, row, col = [], [], []
for i in range(len(X_emb)):
x = X_emb[i]
data += list(x.values())
row += list(x.keys())
col += [i for j in range(len(x))]
X_sm = sparse.coo_matrix((data, (row, col)))
return X_sm
def similarity(self, x, y):
""" Mismatch kernel \\
x, y: string
"""
x_emb = self.neighbour_embed_kmer(x)
y_emb = self.neighbour_embed_kmer(y)
sp = 0
for idx_neigh in x_emb:
if idx_neigh in y_emb:
sp += x_emb[idx_neigh] * y_emb[idx_neigh]
if self.normalize:
sp /= np.sqrt(np.sum(np.array(list(x_emb.values()))**2))
sp /= np.sqrt(np.sum(np.array(list(y_emb.values()))**2))
return sp
def gram(self, X1, X2=None):
""" Compute the gram matrix of a data vector X where the (i,j) entry is defined as <Xi,Xj>\\
X1: array of string (n_samples_1,)
X2: array of string (n_samples_2,), if None compute the gram matrix for (X1,X1)
"""
X1_emb = self.neighbour_embed_data(X1)
X1_sm = self.to_sparse(X1_emb)
if X2 is None:
X2 = X1
X2_emb = self.neighbour_embed_data(X2)
X2_sm = self.to_sparse(X2_emb)
# Reshape matrices if the sizes are different
nadd_row = abs(X1_sm.shape[0] - X2_sm.shape[0])
if X1_sm.shape[0] > X2_sm.shape[0]:
add_row = sparse.coo_matrix(([0], ([nadd_row-1], [X2_sm.shape[1]-1])))
X2_sm = sparse.vstack((X2_sm, add_row))
elif X1_sm.shape[0] < X2_sm.shape[0]:
add_row = sparse.coo_matrix(([0], ([nadd_row - 1], [X1_sm.shape[1] - 1])))
X1_sm = sparse.vstack((X1_sm, add_row))
G = (X1_sm.T * X2_sm).todense().astype('float')
if self.normalize:
G /= np.array(np.sqrt(X1_sm.power(2).sum(0)))[0,:,None]
G /= np.array(np.sqrt(X2_sm.power(2).sum(0)))[0,None,:]
return G