-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgroup_split.py
334 lines (274 loc) · 11.1 KB
/
group_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
import numpy as np
import time
from numpy.random import default_rng
from numba import jit
from collections import namedtuple
from tools.functions import index_to_str
RANDOM_SEED = 23  # Fixed seed so problem generation and the initial shuffle are reproducible.
Solution = namedtuple("Solution", "cost index")  # cost: objective value; index: boolean mask (True = validation group).
def create_initial_solution(sample_counts, p):
    """
    Builds a starting solution: shuffles the groups randomly and greedily
    assigns them to the validation set until the requested sample
    proportion is exceeded.
    @param sample_counts: The problem array.
    @param p: The validation set proportion.
    @return: A solution array (boolean mask, True = validation group).
    """
    rng = default_rng(seed=RANDOM_SEED)
    n_groups = sample_counts.shape[0]
    selection = np.zeros(n_groups, dtype=bool)
    shuffled = rng.permutation(n_groups)
    target = sample_counts[:, 0].sum() * p
    accumulated = 0
    for group in shuffled:
        accumulated += sample_counts[group, 0]
        selection[group] = True
        if accumulated > target:
            break
    return selection
@jit(nopython=True)
def calculate_cost(sample_counts, idx, p):
    """
    Calculates the cost of a given solution.
    @param sample_counts: The problem array (column 0 = samples per group,
        columns 1.. = per-class counts).
    @param idx: The solution array to evaluate (True = validation group).
    @param p: The train/validation split proportion.
    @return: The cost value (lower is a better-balanced split).
    """
    # Column 0 holds the total sample count per group.
    train_count = sample_counts[~idx, 0].sum()
    valid_count = sample_counts[idx, 0].sum()
    total_count = train_count + valid_count
    # Squared deviation of the validation size from the target proportion.
    cost = (valid_count - total_count * p) ** 2
    for i in range(1, sample_counts.shape[1]):
        # r: overall prevalence of class i. Penalize the validation set's
        # class-i count deviating from its proportional share.
        r = sample_counts[:, i].sum() / total_count
        cost += (sample_counts[idx, i].sum() - r * sample_counts[idx, 0].sum()) ** 2
    return cost / 2.0
def calculate_cost_grad(sample_counts, idx, p):
    """
    Computes the gradient of the cost with respect to each count column.
    @param sample_counts: The problem array.
    @param idx: The solution array to evaluate.
    @param p: The train/validation split proportion.
    @return: The cost gradient vector (one entry per column).
    """
    num_cols = sample_counts.shape[1]
    grad = np.zeros(num_cols)
    total = sample_counts[:, 0].sum()
    in_valid = sample_counts[idx, 0].sum()
    # Component 0: shortfall of the validation set versus the target size.
    grad[0] = total * p - in_valid
    for col in range(1, num_cols):
        # share: overall prevalence of this class across all groups.
        share = sample_counts[:, col].sum() / total
        grad[col] = share * in_valid - sample_counts[idx, col].sum()
    return grad
def cosine_similarity(sample_counts, idx, cost_grad):
    """
    Computes, for every group, the cosine similarity between its count
    vector and the cost gradient vector.
    @param sample_counts: The problem array.
    @param idx: The solution vector.
    @param cost_grad: The cost gradient vector.
    @return: The cosine similarity vector (one value per group).
    """
    directed = np.copy(sample_counts)
    # Validation rows point the opposite way: flipping them OUT of the
    # validation set moves the solution in the negated direction.
    directed[idx] = -directed[idx]
    dots = directed @ cost_grad
    norms = np.linalg.norm(directed, axis=1) * np.linalg.norm(cost_grad)
    return dots / norms
def euclidean_similarity(sample_counts, idx, cost_grad):
    """
    Computes the Euclidean distance between each group's count vector and
    the cost gradient vector; validation rows are negated first, mirroring
    cosine_similarity.
    @param sample_counts: The problem array.
    @param idx: The solution vector.
    @param cost_grad: The cost gradient vector.
    @return: A vector of per-group distances.
    """
    adjusted = np.copy(sample_counts)
    adjusted[idx] = -adjusted[idx]  # reverse direction of validation rows
    return np.linalg.norm(adjusted - cost_grad, axis=1)
def generate_cosine_move(sample_counts, idx, p, expanded_set, intensify):
    """
    Generates a new move using the cosine similarity to the cost gradient.
    @param sample_counts: The problem array.
    @param idx: The solution vector.
    @param p: The validation set proportion.
    @param expanded_set: The set of expanded solutions.
    @param intensify: Intensification / diversification flag.
    @return: The new solution vector, or None when every flip was expanded.
    """
    grad = calculate_cost_grad(sample_counts, idx, p)
    scores = cosine_similarity(sample_counts, idx, grad)
    order = np.argsort(scores)
    if intensify:
        # Best-aligned flip first when intensifying; worst first otherwise.
        order = np.flip(order)
    for flip_ix in order:
        candidate = np.copy(idx)
        candidate[flip_ix] = not candidate[flip_ix]
        if index_to_str(candidate) not in expanded_set:
            return candidate
    return None
def generate_euclidean_move(sample_counts, idx, p, expanded_set, get_min):
    """
    Generates a new move using the Euclidean distance to the cost gradient.
    @param sample_counts: The problem array.
    @param idx: The solution vector.
    @param p: The validation set proportion.
    @param expanded_set: The set of expanded solutions.
    @param get_min: When True, try the closest group first; farthest otherwise.
    @return: The new solution vector, or None when every flip was expanded.
    """
    grad = calculate_cost_grad(sample_counts, idx, p)
    distances = euclidean_similarity(sample_counts, idx, grad)
    order = np.argsort(distances)
    if not get_min:
        order = np.flip(order)
    for flip_ix in order:
        candidate = np.copy(idx)
        candidate[flip_ix] = not candidate[flip_ix]
        if index_to_str(candidate) not in expanded_set:
            return candidate
    return None
def generate_moves(idx, expanded_set):
    """
    Yields every single-flip neighbour of idx that is not yet expanded.
    @param idx: The solution vector.
    @param expanded_set: The set of expanded solutions.
    """
    for bit in range(idx.shape[0]):
        neighbour = np.copy(idx)
        neighbour[bit] = not neighbour[bit]
        if index_to_str(neighbour) not in expanded_set:
            yield neighbour
def generate_counts(num_groups, num_classes,
                    min_group_size, max_group_size,
                    max_group_percent):
    """
    Generates a problem matrix from the given parameters.
    @param num_groups: The number of data groups.
    @param num_classes: The number of classes.
    @param min_group_size: The minimum group size.
    @param max_group_size: The maximum group size.
    @param max_group_percent: The maximum class percent.
    @return: The problem matrix (column 0 = group size, 1.. = class counts).
    """
    rng = default_rng(seed=RANDOM_SEED)
    sample_cnt = np.zeros((num_groups, num_classes), dtype=int)
    # Column 0: total sample count per group.
    sample_cnt[:, 0] = rng.integers(low=min_group_size, high=max_group_size, size=num_groups)
    # BUGFIX: iterate from group 0 — the original range(1, num_groups)
    # skipped the first group, leaving all of its class columns at zero.
    for i in range(num_groups):
        for j in range(1, num_classes):
            # Remaining budget for class j: at most max_group_percent of the
            # group total, minus what earlier class columns already used
            # (column j itself is still zero, so the slice sum is columns 1..j-1).
            cap = int(max_group_percent * sample_cnt[i, 0] - sample_cnt[i, 1:j + 1].sum())
            # rng.integers needs an int bound and high > low; an exhausted
            # budget simply means zero samples of this class.
            sample_cnt[i, j] = rng.integers(low=0, high=cap) if cap > 0 else 0
    return sample_cnt
class BaseSolver(object):
    """Shared solver state: problem matrix, split proportion, incumbent."""
    def __init__(self, problem, candidate, p):
        # Score the starting candidate and record it as the current best.
        initial_cost = calculate_cost(problem, candidate, p)
        self.problem = problem
        self.p = p
        self.incumbent = Solution(initial_cost, candidate)
class SearchSolver(BaseSolver):
    # Local search that expands EVERY unvisited single-flip neighbour each
    # iteration and follows the best one (or the worst, when diversifying).
    def __init__(self, problem, candidate, p):
        super(SearchSolver, self).__init__(problem, candidate, p)
    def solve(self, min_cost, max_empty_iterations,
              max_intensity_iterations, verbose=True):
        """
        Uses the search solver to calculate the best split.
        @param min_cost: Minimum cost criterion.
        @param max_empty_iterations: Maximum number of non-improving iterations.
        @param max_intensity_iterations: Maximum number of intensity iterations.
        @param verbose: Verbose flag.
        @return: The incumbent solution.
        """
        terminated = False
        intensify = True
        expanded_set = set()  # string keys of solutions already visited
        solution = self.incumbent
        n = 0            # iterations since the incumbent last improved
        n_intensity = 0  # iterations since the last diversification reset
        while not terminated:
            move_list = generate_moves(solution.index, expanded_set)
            cost_list = [Solution(calculate_cost(self.problem, move, self.p), move)
                         for move in move_list]
            # NOTE: uses the PREVIOUS iteration's intensify flag — when it is
            # False, reverse=True puts the worst move first to escape a basin.
            cost_list.sort(key=lambda t: t[0], reverse=not intensify)
            intensify = True
            if len(cost_list) > 0:
                solution = cost_list[0]
                expanded_set.add(index_to_str(solution.index))
                if solution.cost < self.incumbent.cost:
                    self.incumbent = solution
                    n = 0
                    n_intensity = 0
                    intensify = True
                    if verbose:
                        print(self.incumbent.cost)
                else:
                    # Diversify?
                    if n_intensity > max_intensity_iterations:
                        intensify = False
                        n_intensity = 0
            else:
                # Every single-flip neighbour has already been expanded.
                terminated = True
            n += 1
            n_intensity += 1
            if n > max_empty_iterations or self.incumbent.cost < min_cost:
                terminated = True
        return self.incumbent
class GradientSolver(BaseSolver):
    # Gradient-guided local search: each iteration flips the single group
    # whose count vector is most aligned (cosine similarity) with the cost
    # gradient, instead of scoring every neighbour like SearchSolver.
    def __init__(self, problem, candidate, p):
        super(GradientSolver, self).__init__(problem, candidate, p)
    def solve(self, min_cost, max_empty_iterations,
              max_intensity_iterations, verbose=True):
        """
        This function uses the gradient solver to calculate the best split.
        @param min_cost: Minimum cost criterion.
        @param max_empty_iterations: Maximum number of non-improving iterations.
        @param max_intensity_iterations: Maximum number of intensity iterations.
        @param verbose: Verbose flag.
        @return: The incumbent solution.
        """
        terminated = False
        intensify = True
        expanded_set = set()  # string keys of solutions already visited
        solution = self.incumbent
        n = 0            # iterations since the incumbent last improved
        n_intensity = 0  # iterations since the last diversification reset
        while not terminated:
            # When intensify is False (set by the previous iteration), the
            # generator proposes the LEAST-aligned flip to diversify.
            move = generate_cosine_move(self.problem, solution.index, self.p,
                                        expanded_set, intensify)
            intensify = True
            if move is not None:
                solution = Solution(calculate_cost(self.problem, move, self.p), move)
                expanded_set.add(index_to_str(solution.index))
                if solution.cost < self.incumbent.cost:
                    self.incumbent = solution
                    n = 0
                    n_intensity = 0
                    if verbose:
                        print(self.incumbent.cost)
                else:
                    if n_intensity > max_intensity_iterations:
                        intensify = False
                        n_intensity = 0
            else:
                # Every single-flip neighbour has already been expanded.
                terminated = True
            n += 1
            n_intensity += 1
            if n > max_empty_iterations or self.incumbent.cost < min_cost:
                terminated = True
        return self.incumbent
def print_solution(problem, solution, p):
    """
    Prints a solution summary: cost, target vs. achieved validation
    proportion, and overall vs. validation per-class ratios.
    @param problem: The problem array.
    @param solution: The solution to report (needs .cost and .index).
    @param p: The target validation proportion.
    """
    mask = solution.index
    n_valid = problem[mask, 0].sum()
    n_train = problem[~mask, 0].sum()
    n_total = n_valid + n_train
    print(solution.cost)
    print(p, n_valid / n_total)
    for cls in range(1, problem.shape[1]):
        overall_ratio = problem[:, cls].sum() / n_total
        print(overall_ratio, problem[mask, cls].sum() / problem[mask, 0].sum())
def main():
    """Benchmarks the search and gradient solvers on a simulated problem."""
    # Simulation parameters.
    num_groups = 250        # Number of groups to simulate
    num_classes = 2         # Number of classes
    max_group_size = 10000  # Maximum group size
    max_group_perc = 0.4    # Maximum proportion for each class
    p = 0.3                 # Validation split proportion
    # Termination criteria shared by both solvers.
    max_empty_iterations = 100
    max_intensity_iterations = 10
    min_cost = 10000
    sample_cnt = generate_counts(num_groups, num_classes,
                                 min_group_size=10,
                                 max_group_size=max_group_size,
                                 max_group_percent=max_group_perc)
    solution_arr = create_initial_solution(sample_cnt, p)
    s_solver = SearchSolver(sample_cnt, solution_arr, p)
    g_solver = GradientSolver(sample_cnt, solution_arr, p)

    def _time_and_report(solver):
        # Solve, then print elapsed seconds and the split summary.
        t0 = time.time()
        best = solver.solve(min_cost, max_empty_iterations,
                            max_intensity_iterations, verbose=False)
        print(time.time() - t0)
        print_solution(sample_cnt, best, p)

    _time_and_report(s_solver)
    print()
    _time_and_report(g_solver)


if __name__ == "__main__":
    main()