forked from RobeeF/M1DGMM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparameter_selection.py
236 lines (176 loc) · 9.14 KB
/
parameter_selection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 2 15:39:05 2020
@author: rfuchs
"""
from data_preprocessing import bin_to_bern
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, Lasso
# Dirty local hard copy of the Github bevel package
from bevel.linear_ordinal_regression import OrderedLogit
import autograd.numpy as np
def rl1_selection(y_bin, y_ord, y_categ, y_cont, zl1_ys, w_s):
''' Selects the number of factors on the first latent discrete layer
y_bin (n x p_bin ndarray): The binary and count data matrix
y_ord (n x p_ord ndarray): The ordinal data matrix
y_categ (n x p_categ ndarray): The categorical data matrix
y_cont (n x p_cont ndarray): The continuous data matrix
zl1_ys (k_1D x r_1D ndarray): The first layer latent variables
w_s (list): The path probabilities starting from the first layer
------------------------------------------------------------------
return (list of int): The dimensions to keep for the GLLVM layer
'''
M0 = zl1_ys.shape[0]
numobs = zl1_ys.shape[1]
r0 = zl1_ys.shape[2]
S0 = zl1_ys.shape[3]
nb_bin = y_bin.shape[1]
nb_ord = y_ord.shape[1]
nb_categ = y_categ.shape[1]
nb_cont = y_cont.shape[1]
PROP_ZERO_THRESHOLD = 0.25
PVALUE_THRESHOLD = 0.10
# Detemine the dimensions that are weakest for Binomial variables
zero_coef_mask = np.zeros(r0)
for j in range(nb_bin):
for s in range(S0):
Nj = int(np.max(y_bin[:,j])) # The support of the jth binomial is [1, Nj]
if Nj == 1: # If the variable is Bernoulli not binomial
yj = y_bin[:,j]
z = zl1_ys[:,:,:,s]
else: # If not, need to convert Binomial output to Bernoulli output
yj, z = bin_to_bern(Nj, y_bin[:,j], zl1_ys[:,:,:,s])
# Put all the M0 points in a series
X = z.flatten(order = 'C').reshape((M0 * numobs * Nj, r0), order = 'C')
y_repeat = np.repeat(yj, M0).astype(int) # Repeat rather than tile to check
lr = LogisticRegression(penalty = 'l1', solver = 'saga')
lr.fit(X, y_repeat)
zero_coef_mask += (lr.coef_[0] == 0) * w_s[s]
# Detemine the dimensions that are weakest for Ordinal variables
for j in range(nb_ord):
for s in range(S0):
ol = OrderedLogit()
X = zl1_ys[:,:,:,s].flatten(order = 'C').reshape((M0 * numobs, r0), order = 'C')
y_repeat = np.repeat(y_ord[:, j], M0).astype(int) # Repeat rather than tile to check
ol.fit(X, y_repeat)
zero_coef_mask += np.array(ol.summary['p'] > PVALUE_THRESHOLD) * w_s[s]
# Detemine the dimensions that are weakest for Categorical variables
for j in range(nb_categ):
for s in range(S0):
z = zl1_ys[:,:,:,s]
# Put all the M0 points in a series
X = z.flatten(order = 'C').reshape((M0 * numobs, r0), order = 'C')
y_repeat = np.repeat(y_categ[:,j], M0).astype(int) # Repeat rather than tile to check
lr = LogisticRegression(penalty = 'l1', solver = 'saga', \
multi_class = 'multinomial')
lr.fit(X, y_repeat)
zero_coef_mask += (lr.coef_[0] == 0) * w_s[s]
# Detemine the dimensions that are weakest for Continuous variables
for j in range(nb_cont):
for s in range(S0):
z = zl1_ys[:,:,:,s]
# Put all the M0 points in a series
X = z.flatten(order = 'C').reshape((M0 * numobs, r0), order = 'C')
y_repeat = np.repeat(y_cont[:,j], M0) # Repeat rather than tile to check
linr = Lasso()
linr.fit(X, y_repeat)
#coefs = np.concatenate([[linr.intercept_], linr.coef_])
#zero_coef_mask += (coefs == 0) * w_s[s]
zero_coef_mask += (linr.coef_[0] == 0) * w_s[s]
# Voting: Delete the dimensions which have been zeroed a majority of times
zeroed_coeff_prop = zero_coef_mask / ((nb_ord + nb_bin + nb_categ + nb_cont))
# Need at least r1 = 2 for algorithm to work
new_rl = np.sum(zeroed_coeff_prop <= PROP_ZERO_THRESHOLD)
if new_rl < 2:
dims_to_keep = np.argsort(zeroed_coeff_prop)[:2]
else:
dims_to_keep = list(set(range(r0)) - \
set(np.where(zeroed_coeff_prop > PROP_ZERO_THRESHOLD)[0].tolist()))
dims_to_keep = np.sort(dims_to_keep)
return dims_to_keep
def other_r_selection(rl1_select, z2_z1s):
''' Chose the meaningful dimensions from the second layer to the end of the network
rl1_select (list): The dimension kept over the first layer
z2_z1s (list of ndarrays): z^{(l + 1)}| z^{(l)}, s
--------------------------------------------------------------------------
return (list of int): The dimensions to keep from the second layer of the network
'''
S = [zz.shape[2] for zz in z2_z1s] + [1]
CORR_THRESHOLD = 0.20
L = len(z2_z1s)
M = np.array([zz.shape[0] for zz in z2_z1s] + [z2_z1s[-1].shape[1]])
prev_new_r = [len(rl1_select)]
dims_to_keep = []
for l in range(L):
# Will not keep the following layers if one of the previous layer is of dim 1
if prev_new_r[l] <= 1:
dims_to_keep.append([])
prev_new_r.append(0)
else:
old_rl = z2_z1s[l].shape[-1]
corr = np.zeros(old_rl)
for s in range(S[l]):
for m1 in range(M[l + 1]):
pca = PCA(n_components=1)
pca.fit_transform(z2_z1s[l][m1, :, s])
corr += np.abs(pca.components_[0])
average_corr = corr / (S[l] * M[l + 1])
new_rl = np.sum(average_corr > CORR_THRESHOLD)
if prev_new_r[l] > new_rl: # Respect r1 > r2 > r3 ....
wanted_dims = np.where(average_corr > CORR_THRESHOLD)[0].tolist()
wanted_dims = np.sort(wanted_dims)
dims_to_keep.append(wanted_dims)
else: # Have to delete other dimensions to match r1 > r2 > r3 ....
nb_dims_to_remove = old_rl - prev_new_r[l] + 1
unwanted_dims = np.argpartition(average_corr, nb_dims_to_remove)[:nb_dims_to_remove]
wanted_dims = list(set(range(old_rl)) - set(unwanted_dims))
wanted_dims = np.sort(wanted_dims)
dims_to_keep.append(wanted_dims)
new_rl = len(wanted_dims)
prev_new_r.append(new_rl)
return dims_to_keep
def r_select(y_bin, y_ord, y_categ, y_cont, zl1_ys, z2_z1s, w_s):
''' Automatic choice of dimension of all layer components
y_bin (numobs x nb_bin nd-array): The binary/count data
y_ord (numobs x nb_ord nd-array): The ordinal data
y_categ (numobs x nb_categ nd-array): The categorical data
yc (numobs x nb_categ nd-array): The continuous data
zl1_ys (ndarray): The latent variable of the first layer
z2_z1s (list of ndarray): z^{l+1} | z^{l}, s, Theta for all (l,s)
w_s (list): The path probabilities for all s in [1,S]
--------------------------------------------------------------------------
returns (dict): The dimensions kept for all the layers of the network
'''
rl1_select = rl1_selection(y_bin, y_ord, y_categ, y_cont, zl1_ys, w_s)
other_r_select = other_r_selection(rl1_select, z2_z1s)
return [rl1_select] + other_r_select
def k_select(w_s, k, new_L, clustering_layer, mode_auto):
''' Automatic choice of the number of components by layer
w_s (list): The path probabilities for all s in [1,S]
k (dict): The number of component on each layer
new_L (int): The selected total number of layers.
clustering_layer (int): The index of the clustering layer
--------------------------------------------------------------------------
returns (dict): The components kept for all the layers of the network
'''
L = len(k)
n_clusters = k[clustering_layer]
# If the clustering layer (cl) is deleted, define the last existing layer as cl
last_layer_idx = new_L - 1
if last_layer_idx < clustering_layer:
clustering_layer = last_layer_idx
components_to_keep = []
w = w_s.reshape(*k, order = 'C')
print(w)
for l in range(new_L):
PROBA_THRESHOLD = 1 / (k[l] * 4)
other_layers_indices = tuple(set(range(L)) - set([l]))
components_proba = w.sum(other_layers_indices)
if (l == clustering_layer) & (mode_auto == False):
biggest_lik_comp = np.sort(components_proba.argsort()[::-1][:n_clusters])
components_to_keep.append(biggest_lik_comp)
else:
comp_kept = np.where(components_proba > PROBA_THRESHOLD)[0]
comp_kept = np.sort(comp_kept)
components_to_keep.append(comp_kept)
return components_to_keep