# Copyright (c) 2019 Robert Bosch GmbH
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# ******************************************************************
# util.py
# Utilities for the MetaBO framework.
# ******************************************************************

import numpy as np
import pandas as pd
import scipy as sp
from scipy.stats import norm


def get_meta_data(path):
    # Collect the sampled data (configurations and NDCG observations) for each base task,
    # where a task is identified by its (Workload, dataset) combination.
    df = pd.read_csv(path, sep=',', low_memory=False)
    # Sort so that the rows of each task form one contiguous block and reset the index
    # so that positional access via .loc matches the sorted order.
    df = df.sort_values(by=['Workload', 'dataset']).reset_index(drop=True)
    group = df.groupby(['Workload', 'dataset'])
    task_size = [int(i) for i in group.size()]
    data_by_task = {}
    start = 0
    for task in range(len(task_size)):
        task_workload = df.loc[start]['Workload']
        task_datasize = df.loc[start]['dataset']
        train_y = []
        configurations = []
        for j in range(start, start + task_size[task]):
            train_y.append(float(df.loc[j]['NDCG']))
            conf = []
            conf.append(float(df.loc[j]['ld']))
            conf.append(float(df.loc[j]['lr']))
            conf.append(float(df.loc[j]['bs']))
            configurations.append(conf)
        train_y = np.array(train_y)
        data_by_task[task] = {
            'workload': task_workload,
            'datasize': task_datasize,
            'configurations': configurations,
            'y': train_y,
        }
        start = start + task_size[task]
    return data_by_task
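
# Illustrative usage (not part of the original module; the file name is a placeholder):
# the CSV is expected to provide the columns 'Workload', 'dataset', 'NDCG', 'ld', 'lr'
# and 'bs' used above.
#
#   data_by_task = get_meta_data('meta_data.csv')
#   print(len(data_by_task))                  # number of base tasks
#   print(data_by_task[0]['y'].shape)         # NDCG observations of the first task
#   print(data_by_task[0]['configurations'])  # list of [ld, lr, bs] configurations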


def copula_transform(values: np.ndarray) -> np.ndarray:
    """Copula transformation from "A Quantile-based Approach for Hyperparameter Transfer Learning"
    by Salinas, Shen and Perrone, ICML 2020."""
    quants = (sp.stats.rankdata(values.flatten()) - 1) / (len(values) - 1)
    cutoff = 1 / (4 * np.power(len(values), 0.25) * np.sqrt(np.pi * np.log(len(values))))
    quants = np.clip(quants, a_min=cutoff, a_max=1 - cutoff)
    # Inverse Gaussian CDF
    rval = np.array([sp.stats.norm.ppf(q) for q in quants]).reshape((-1, 1))
    return rval
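
# Illustrative usage (not part of the original module; variable names are for
# illustration only): the copula transform maps the observed objective values of
# one base task to approximately standard-normal values, e.g. before fitting a
# surrogate model on data returned by get_meta_data.
#
#   y_raw = data_by_task[0]['y']         # as returned by get_meta_data
#   y_gauss = copula_transform(y_raw)    # shape (len(y_raw), 1)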


def create_uniform_grid(domain, N_samples_dim):
    D = domain.shape[0]
    x_grid = []
    for i in range(D):
        x_grid.append(np.linspace(domain[i, 0], domain[i, 1], N_samples_dim))
    X_mesh = np.meshgrid(*x_grid)
    X = np.vstack(X_mesh).reshape((D, -1)).T
    return X, X_mesh
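
# Illustrative usage (not part of the original module): for a D-dimensional box
# domain, create_uniform_grid returns the flattened grid points X with shape
# (N_samples_dim ** D, D) together with the underlying meshgrid.
#
#   domain = np.array([[0.0, 1.0], [-5.0, 5.0]])
#   X, X_mesh = create_uniform_grid(domain, N_samples_dim=3)
#   # X has shape (9, 2); each row is one grid point inside the domain.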


def create_random_x(D, samples):
    return np.random.rand(samples, D)


def scale_from_unit_square_to_domain(X, domain):
    # X contains elements in the unit square; stretch and translate them to lie in the domain.
    return X * domain.ptp(axis=1) + domain[:, 0]


def scale_from_domain_to_unit_square(X, domain):
    # X contains elements in the domain; translate and stretch them to lie in the unit square.
    return (X - domain[:, 0]) / domain.ptp(axis=1)
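
# Illustrative usage (not part of the original module): the two scaling functions
# are inverses of each other for any box-shaped domain.
#
#   domain = np.array([[0.0, 10.0], [-1.0, 1.0]])
#   x_unit = create_random_x(D=2, samples=5)
#   x_dom = scale_from_unit_square_to_domain(x_unit, domain)
#   x_back = scale_from_domain_to_unit_square(x_dom, domain)
#   assert np.allclose(x_unit, x_back)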


def get_cube_around(X, diam, domain):
    assert X.ndim == 1
    assert domain.ndim == 2
    cube = np.zeros(domain.shape)
    cube[:, 0] = np.max((X - 0.5 * diam, domain[:, 0]), axis=0)
    cube[:, 1] = np.min((X + 0.5 * diam, domain[:, 1]), axis=0)
    return cube
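
# Illustrative usage (not part of the original module): get_cube_around clips a
# hypercube of edge length diam, centered at X, to the domain boundaries.
#
#   domain = np.array([[0.0, 1.0], [0.0, 1.0]])
#   cube = get_cube_around(np.array([0.1, 0.5]), diam=0.4, domain=domain)
#   # cube is approximately [[0.0, 0.3], [0.3, 0.7]]; the first dimension is clipped at 0.0.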


if __name__ == '__main__':
    # D = 15
    # N_MS = 300000
    # x = create_random_x(D, 32768)
    # N_MS_per_dim = int(np.floor(N_MS ** (1 / D)))
    # domain = np.stack([np.zeros((15,)), np.ones((15,))], axis=1)
    # x, x_mesh = create_uniform_grid(domain, 2)
    #
    # res = scale_from_domain_to_unit_square(np.array([1, 2]), domain)
    #
    # print()
    print(get_meta_data('test_metabo_0702.csv'))