# This file is part of GDR.
#
# Copyright (C) 2019, Alexis Arnaudon (alexis.arnaudon@imperial.ac.uk),
# Robert Peach (r.peach13@imperial.ac.uk)
# https://github.com/barahona-research-group/GDR
#
# GDR is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# GDR is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GDR. If not, see <http://www.gnu.org/licenses/>.
#
import numpy as np
import scipy as sc
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from utils import *  # provides load_data, load_directed_data, define_MLP_distribution,
                     # accuracy_predictions, directed_laplacian_matrix and
                     # directed_combinatorial_laplacian_matrix


class GDR(object):
"""
Class to compute the class for nodes given a semi-supervised graph dataset
Inputs:
dataset:
String that describes the dataset e.g. 'cora','citeseer','pubmed',
'wikipedia'
tpe_rw:
String that describes the type of Laplacian
'Combinatorial' or 'Normalised'
"""
    def __init__(self, dataset, tpe_rw):
        self.dataset = dataset  # string naming the dataset
        self.tpe_rw = tpe_rw  # type of random walk: 'Combinatorial', 'Normalized' or 'max_entropy'
        # Data to be loaded with load_dataset
        self.A = []  # adjacency matrix
        self.train_data = []
        self.test_data = []
        self.val_data = []
        self.feature_data = []
        self.feature_data_orig = []
        self.directed = False
        # Prior distribution, computed by define_similarity_distribution and
        # smoothed by precondition_distribution
        self.similarity = []  # similarity vectors after preconditioning
        self.similarity_orig = []  # original similarity vectors
    def load_dataset(self):
        """
        Extract the data from the chosen dataset and load it into the class
        """
if self.dataset == 'cora-dr' or self.dataset == 'cora-d':
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_directed_data()
self.directed = True
else:
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(self.dataset)
if self.dataset == 'cora-dr': #reverse the direction
self.A = sc.sparse.csr_matrix(adj, dtype=np.float64).T
else:
self.A = sc.sparse.csr_matrix(adj, dtype=np.float64)
self.feature_data = sc.sparse.csr_matrix(features, dtype=np.float64)
self.feature_data_orig = self.feature_data.copy()
self.train_data = sc.sparse.csr_matrix(y_train, dtype=np.float64)
self.test_data = sc.sparse.csr_matrix(y_test, dtype=np.float64)
self.val_data = sc.sparse.csr_matrix(y_val, dtype=np.float64)
def define_comm_feature(self):
"""
Embedding the community labels in feature space
"""
        self.comm_features = self.train_data.T.dot(self.feature_data)
        norm = self.comm_features.sum(axis=1)
        norm[norm == 0] = 1  # avoid dividing all-zero rows by zero
        self.comm_features = sc.sparse.csr_matrix(self.comm_features / norm)  # normalise each community's feature vector
    def define_similarity_distribution(self, tpe='cos'):
        """
        Compute the prior probability distribution from the training data and the features
        """
if tpe == 'mlp': #use MLP for the prior distribution
if self.dataset == 'cora-d' or self.dataset == 'cora-dr':
self.similarity_orig = define_MLP_distribution('cora', directed = True)
else:
self.similarity_orig = define_MLP_distribution(self.dataset)
elif tpe == 'cos':
#calculating the cosine similarity between each node and the features that describe each community
self.similarity_orig = cosine_similarity(self.feature_data.toarray(), self.comm_features.toarray())
            # Scale the similarities of each node to sum to 1, i.e. a probability distribution
            norm = self.similarity_orig.sum(axis=1, keepdims=True)
            norm[norm == 0] = 1  # leave all-zero rows untouched instead of dividing by zero
            self.similarity_orig = sc.sparse.csr_matrix(self.similarity_orig / norm)
    def precondition_laplacian(self, tpe='weighted', disp=True):
        """
        Compute a graph Laplacian on the class space, used to smooth the prior distribution
        """
G = nx.complete_graph(np.shape(self.similarity_orig)[1])
        if tpe == 'weighted':  # weight the preconditioning by the class similarities
            comm = self.comm_features.toarray()
            for i, j in G.edges():
                G[i][j]['weight'] = float(cosine_similarity(comm[[i], :], comm[[j], :]))
else:
for i, j in G.edges():
G[i][j]['weight'] = 1.
        self.L_precond = nx.laplacian_matrix(G)  # combinatorial Laplacian matrix
        # normalise by the second smallest eigenvalue, so we always get a relaxation time of 1
        if np.shape(self.L_precond.toarray())[0] > 3:
            self.L_precond /= abs(sc.sparse.linalg.eigs(self.L_precond, which='SM', k=2)[0][1])
        else:
            print('spectral gap rescaling not implemented for 3 or fewer classes (e.g. Pubmed)')
    def precondition_distribution(self, d_t, disp=True):
        """
        Precondition the prior distribution by smoothing it with the class-space graph Laplacian
        """
        exp = sc.linalg.expm(-d_t * self.L_precond.toarray())
        # NOTE: the smoothing step .dot(exp) is commented out below, so the prior
        # currently passes through unsmoothed and `exp` is unused
        node_label_dist = self.similarity_orig.toarray()  # .dot(exp)
H = self.train_data.toarray().copy()
#Final Distribution sets the training nodes to their known probability distribution
# setting non-training nodes to their feature probability
indices = np.where(~H.any(axis=1))[0]
for row in indices:
H[row] = node_label_dist[row]
#if any rows have nan then we set them to equal probability across labels
index_nan = np.where(np.isnan(H[:,0]))[0]
for row in index_nan:
H[row] = 1/np.shape(H)[1]
self.similarity = H.copy()
# Now automatically calculate how accurate this prior distribution is
class_H = np.zeros_like(H)
class_H[np.arange(len(H)), H.argmax(1)] = 1
(test_y_pred, test_y_real, test_accuracy) = accuracy_predictions(class_H, self.test_data)
(val_y_pred, val_y_real, val_accuracy) = accuracy_predictions(class_H, self.val_data)
if disp:
print('----------------------------------')
print('Accuracy of prior cosine similarity distribution:')
print('Val accuracy score: ' + str(val_accuracy))
print('Test accuracy score: ' + str(test_accuracy))
print('----------------------------------')
return val_accuracy, test_accuracy
def Laplacian(self):
"""
Create the Laplacian matrix for the continuous random walk
"""
if self.directed: #uses the networkx 2.3 functions copied in utils.py
if self.tpe_rw == 'Combinatorial':
L = directed_combinatorial_laplacian_matrix(nx.DiGraph(self.A), walk_type='pagerank', alpha=0.85)
elif self.tpe_rw == 'Normalized':
L = directed_laplacian_matrix(nx.DiGraph(self.A), walk_type='pagerank', alpha=0.1)
else:
G = nx.Graph(self.A)
graphs = [G.subgraph(c).copy() for c in nx.connected_components(G)]
if self.tpe_rw == 'Combinatorial':
#combinatorial random walk
L = nx.laplacian_matrix(G)
self.v = np.zeros(len(G))
for graph in graphs:
self.v[list(graph.nodes)] = np.ones(len(graph.nodes))/len(graph.nodes)
elif self.tpe_rw == 'Normalized':
                # normalized random walk
                degree = np.array(self.A.sum(1)).flatten()
                degree[degree == 0] = 1  # guard isolated nodes against division by zero
                L = sc.sparse.csr_matrix(nx.laplacian_matrix(G).toarray().dot(np.diag(1. / degree)))
self.v = np.zeros(len(G))
for graph in graphs:
self.v[list(graph.nodes)] = degree[list(graph.nodes)]/degree[list(graph.nodes)].sum()*0.5
elif self.tpe_rw == 'max_entropy':
#maximum entropy random walk
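                # Maximal-entropy walk: with psi the leading eigenvector and
                # lambda_0 the leading eigenvalue of each component's adjacency
                # matrix, L = I - diag(psi) A diag(1/psi) / lambda_0, and the
                # stationary weights are v = psi**2.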
self.v = np.zeros(len(G))
L = np.zeros([len(G),len(G)])
A = nx.adjacency_matrix(G).toarray()
for graph in graphs:
A_tmp = A[np.ix_(list(graph.nodes), list(graph.nodes))]
eigs = sc.sparse.linalg.eigsh(1.*A_tmp, which='LM', k=1)
psi = eigs[1][:,0]
self.v[list(graph.nodes)] = psi**2
lamb_0 = abs(eigs[0][0])
L[np.ix_(list(graph.nodes),list(graph.nodes))] = np.eye(len(graph.nodes)) - np.diag(psi).dot(A_tmp).dot(np.diag(1./psi))/lamb_0
L = sc.sparse.csc_matrix(L)
#normalise with the second smallest eigenvalue, so we always get relaxation time of 1 (for the largest connected component)
if self.directed:
L_sub = L
        else:  # get the largest connected component first
            graphs = sorted((G.subgraph(c).copy() for c in nx.connected_components(G)), key=len, reverse=True)
            print(len(graphs), 'connected components; we rescale L with the largest one only')
            L_sub = sc.sparse.csc_matrix(L.toarray()[np.ix_(list(graphs[0].nodes), list(graphs[0].nodes))])
l_2 = abs(sc.sparse.linalg.eigs(L_sub, which='SM', k=2)[0][1])
print("l_2 = ", l_2)
self.L = L / l_2
def apply_exponential(self, disp=True):
"""
Compute the propagation of labels across the network
"""
self.H_list = np.array(sc.sparse.linalg.expm_multiply( -self.L, self.similarity, self.t_min, self.t_max, self.N_t))
self.times = np.linspace(self.t_min, self.t_max, self.N_t)
    def label_prediction_prior(self, disp=False):
        """
        Predict node classes directly from the prior distribution, before the random walk
        """
        best_classes_prior = np.argmax(self.similarity, axis=1)
        class_H_prior = np.zeros_like(self.similarity)  # array to collect the predicted classes
        class_H_prior[np.arange(len(best_classes_prior)), best_classes_prior] = 1  # set the predicted classes
#compute the prior accuracies
(train_y_pred,train_y_real,train_accuracy) = accuracy_predictions(class_H_prior, self.train_data.toarray())
(val_y_pred,val_y_real,val_accuracy) = accuracy_predictions(class_H_prior, self.val_data.toarray())
(test_y_pred,test_y_real,test_accuracy) = accuracy_predictions(class_H_prior, self.test_data.toarray())
if disp:
print('Prior distribution before random-walk:')
print('Training accuracy score: ' + str(train_accuracy))
print('Validation accuracy score: ' + str(val_accuracy))
print('Testing accuracy score: ' + str(test_accuracy))
return val_accuracy, test_accuracy
    def label_prediction_overshoot(self, H_list, disp=True):
        """
        Find the best class of each node from the overshooting of the propagated labels
        """
#stationary state of each class
        G = nx.Graph(self.A)
        graphs = [G.subgraph(c).copy() for c in nx.connected_components(G)]
H_list_inf = np.zeros([len(G), np.shape(self.similarity)[1]])
for graph in graphs:
m = self.similarity[list(graph.nodes)].sum(0)
H_list_inf[list(graph.nodes)] = self.v[list(graph.nodes), np.newaxis]*m[np.newaxis,:]
        overshoots = np.max(H_list - H_list_inf[np.newaxis, :, :], axis=0)  # overshoot value of each class/node
        H_list_no_overshoots = H_list.copy()
        H_list_no_overshoots[:, overshoots <= 0] = 0  # zero out the non-overshooting classes
        no_os_id = np.argwhere(~H_list_no_overshoots[0].any(axis=1))  # nodes with no overshooting class at all
        class_0 = H_list[0].argmax(1)  # class prediction at time 0 (min time)
        H_list_no_overshoots[:, no_os_id, class_0[no_os_id]] = 1  # fall back to the initial best class
        node_class_tmp = np.max(H_list_no_overshoots, axis=0)  # max values across all times
        best_classes = np.argmax(node_class_tmp, axis=1)
class_H = np.zeros_like(H_list[0]) #array to collect the predicted classes
class_H[np.arange(len(best_classes)), best_classes] = 1 #finally, set the predicted classes
#set the training nodes back
class_H[self.train_data.toarray().any(axis=1)>0] = self.train_data.toarray()[self.train_data.toarray().any(axis=1)>0]
#compute the accuracies
(train_y_pred,train_y_real,train_accuracy) = accuracy_predictions(class_H, self.train_data.toarray())
(val_y_pred,val_y_real,val_accuracy) = accuracy_predictions(class_H, self.val_data.toarray())
(test_y_pred,test_y_real,test_accuracy) = accuracy_predictions(class_H, self.test_data.toarray())
if disp:
print('Cosine Similarity prior distribution after random-walk:')
print('Training accuracy score: ' + str(train_accuracy))
print('Validation accuracy score: ' + str(val_accuracy))
print('Testing accuracy score: ' + str(test_accuracy))
return val_accuracy, test_accuracy
    def scan_tmin(self, n_min, output_full=True):
        """
        Scan over t_min and select the best value on the validation set
        """
Val = np.zeros(n_min)
Test = np.zeros(n_min)
val_prior, test_prior = self.label_prediction_prior()
for i in range(n_min):
val, test = self.label_prediction_overshoot(self.H_list[i:], disp=False)
Val[i] = val
Test[i] = test
if output_full:
return np.argmax(Val), Val, Test, val_prior, test_prior
else:
return Val[np.argmax(Val)], Test[np.argmax(Val)], val_prior, test_prior
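

# ----------------------------------------------------------------------
# Minimal usage sketch (illustrative only): it shows the intended order of
# the calls above. The dataset name, d_t and the time-grid values t_min,
# t_max, N_t and n_min are hypothetical choices, not values prescribed by
# this file.
if __name__ == '__main__':
    gdr = GDR('cora', 'Normalized')
    gdr.load_dataset()
    gdr.define_comm_feature()
    gdr.define_similarity_distribution(tpe='cos')
    gdr.precondition_laplacian(tpe='weighted')
    gdr.precondition_distribution(d_t=1.0)
    gdr.Laplacian()
    gdr.t_min, gdr.t_max, gdr.N_t = 0.1, 10.0, 100  # time grid (hypothetical)
    gdr.apply_exponential()
    best_i, Val, Test, val_prior, test_prior = gdr.scan_tmin(n_min=50)
    print('best t_min index:', best_i)
    print('val accuracy:', Val[best_i], 'test accuracy:', Test[best_i])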