Skip to content

Commit 5ca8026

Browse files
authored
Merge pull request #20 from lcmd-epfl/regression-new-features
Add new features to regression tools - sparse regression - random state, read kernel, return prediction arguments
2 parents 11b3191 + 99e9e5f commit 5ca8026

File tree

5 files changed

+116
-41
lines changed

5 files changed

+116
-41
lines changed

qstack/math/fps.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import numpy as np
2+
3+
def do_fps(x, d=0):
4+
# Code from Giulio Imbalzano
5+
n = len(x)
6+
if d==0:
7+
d = n
8+
iy = np.zeros(d,int)
9+
measure = np.zeros(d-1,float)
10+
iy[0] = 0
11+
# Faster evaluation of Euclidean distance
12+
n2 = np.sum(x*x, axis=1)
13+
dl = n2 + n2[iy[0]] - 2.0*np.dot(x,x[iy[0]])
14+
for i in range(1,d):
15+
iy[i], measure[i-1] = np.argmax(dl), np.amax(dl)
16+
nd = n2 + n2[iy[i]] - 2.0*np.dot(x,x[iy[i]])
17+
dl = np.minimum(dl,nd)
18+
return iy, measure

qstack/regression/final_error.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,34 @@
55
from qstack.regression.kernel_utils import get_kernel, defaults
66
from qstack.tools import correct_num_threads
77

8-
def final_error(X, y, sigma=defaults.sigma, eta=defaults.eta, akernel=defaults.kernel, test_size=defaults.test_size, save_alpha=None):
8+
def final_error(X, y, read_kernel=False, sigma=defaults.sigma, eta=defaults.eta, akernel=defaults.kernel,
9+
test_size=defaults.test_size,
10+
random_state=defaults.random_state,
11+
return_pred=False, save_alpha=None):
912
"""
1013
1114
.. todo::
1215
Write the docstring
1316
"""
14-
kernel = get_kernel(akernel)
15-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
16-
K_all = kernel(X_train, X_train, 1.0/sigma)
17-
Ks_all = kernel(X_test, X_train, 1.0/sigma)
17+
if read_kernel is False:
18+
kernel = get_kernel(akernel)
19+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
20+
K_all = kernel(X_train, X_train, 1.0/sigma)
21+
Ks_all = kernel(X_test, X_train, 1.0/sigma)
22+
else:
23+
idx_train, idx_test, y_train, y_test = train_test_split(np.arange(len(y)), y, test_size=test_size, random_state=random_state)
24+
K_all = X[np.ix_(idx_train,idx_train)]
25+
Ks_all = X[np.ix_(idx_test, idx_train)]
1826
K_all[np.diag_indices_from(K_all)] += eta
1927
alpha = scipy.linalg.solve(K_all, y_train, assume_a='pos')
2028
y_kf_predict = np.dot(Ks_all, alpha)
2129
aes = np.abs(y_test-y_kf_predict)
22-
if save_alpha: np.save(save_alpha, alpha)
23-
return aes
30+
if save_alpha:
31+
np.save(save_alpha, alpha)
32+
if return_pred:
33+
return aes, y_kf_predict
34+
else:
35+
return aes
2436

2537
def main():
2638
import sys
@@ -34,12 +46,13 @@ def main():
3446
parser.add_argument('--kernel', type=str, dest='kernel', default=defaults.kernel, help='kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')')
3547
parser.add_argument('--save-alpha', type=str, dest='save_alpha', default=None, help='file to write the regression coefficients to (default None)')
3648
parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the numper of threads')
49+
parser.add_argument('--random_state', type=int, dest='random_state', default=defaults.random_state, help='random state for test / train splitting')
3750
args = parser.parse_args()
3851
print(vars(args))
3952
if(args.ll): correct_num_threads()
4053
X = np.load(args.repr)
4154
y = np.loadtxt(args.prop)
42-
aes = final_error(X, y, sigma=args.sigma, eta=args.eta, akernel=args.kernel, test_size=args.test_size, save_alpha=args.save_alpha)
55+
aes = final_error(X, y, sigma=args.sigma, eta=args.eta, akernel=args.kernel, test_size=args.test_size, save_alpha=args.save_alpha, random_state=random_state)
4356
np.savetxt(sys.stdout, aes, fmt='%e')
4457

4558
if __name__ == "__main__":

qstack/regression/hyperparameters.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,38 @@
66
from sklearn.model_selection import train_test_split, KFold
77
from qstack.regression.kernel_utils import get_kernel, defaults, ParseKwargs
88
from qstack.tools import correct_num_threads
9+
from qstack.math.fps import do_fps
910

1011
def hyperparameters(X, y,
1112
sigma=defaults.sigmaarr, eta=defaults.etaarr, gkernel=defaults.gkernel, gdict=defaults.gdict,
1213
akernel=defaults.kernel, test_size=defaults.test_size, splits=defaults.splits,
13-
printlevel=0, adaptive=False, read_kernel=False):
14+
printlevel=0, adaptive=False, read_kernel=False, sparse=None):
1415
"""
1516
1617
.. todo::
1718
Write the docstring
1819
"""
1920

20-
def k_fold_opt(K_all):
21+
def k_fold_opt(K_all, eta):
2122
kfold = KFold(n_splits=splits, shuffle=False)
2223
all_maes = []
2324
for train_idx, test_idx in kfold.split(X_train):
2425
y_kf_train, y_kf_test = y_train[train_idx], y_train[test_idx]
25-
K = K_all [np.ix_(train_idx,train_idx)]
26-
Ks = K_all [np.ix_(test_idx,train_idx)]
26+
27+
if not sparse:
28+
K_solve = np.copy(K_all [np.ix_(train_idx,train_idx)])
29+
K_solve[np.diag_indices_from(K_solve)] += eta
30+
y_solve = y_kf_train
31+
Ks = K_all [np.ix_(test_idx,train_idx)]
32+
else:
33+
K_NM = K_all [np.ix_(train_idx,sparse_idx)]
34+
K_solve = K_NM.T @ K_NM
35+
K_solve[np.diag_indices_from(K_solve)] += eta
36+
y_solve = K_NM.T @ y_kf_train
37+
Ks = K_all [np.ix_(test_idx,sparse_idx)]
38+
2739
try:
28-
alpha = scipy.linalg.solve(K, y_kf_train, assume_a='pos', overwrite_a=True)
40+
alpha = scipy.linalg.solve(K_solve, y_solve, assume_a='pos', overwrite_a=True)
2941
except scipy.linalg.LinAlgError:
3042
print('singular matrix')
3143
all_maes.append(np.nan)
@@ -43,9 +55,7 @@ def hyper_loop(sigma, eta):
4355
K_all = X_train
4456

4557
for e in eta:
46-
K_all[np.diag_indices_from(K_all)] += e
47-
mean, std = k_fold_opt(K_all)
48-
K_all[np.diag_indices_from(K_all)] -= e
58+
mean, std = k_fold_opt(K_all, e)
4959
if printlevel>0 :
5060
sys.stderr.flush()
5161
print(s, e, mean, std, flush=True)
@@ -63,6 +73,11 @@ def hyper_loop(sigma, eta):
6373
X_train = X[np.ix_(idx_train,idx_train)]
6474
sigma = [np.nan]
6575

76+
if sparse:
77+
if read_kernel:
78+
raise RuntimeError('Cannot do FPS with kernels')
79+
sparse_idx = do_fps(X_train)[0][:sparse]
80+
6681
work_sigma = sigma
6782
errors = []
6883
direction = None
@@ -111,14 +126,16 @@ def main():
111126
parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the numper of threads')
112127
parser.add_argument('--ada', action='store_true', dest='adaptive', default=False, help='if adapt sigma')
113128
parser.add_argument('--readkernel', action='store_true', dest='readk', default=False, help='if X is kernel')
129+
parser.add_argument('--sparse', type=int, dest='sparse', default=None, help='regression basis size for sparse learning')
114130
args = parser.parse_args()
115131
if(args.readk): args.sigma = [np.nan]
116132
print(vars(args))
117133
if(args.ll): correct_num_threads()
118134

119135
X = np.load(args.repr)
120136
y = np.loadtxt(args.prop)
121-
errors = hyperparameters(X, y, read_kernel=args.readk, sigma=args.sigma, eta=args.eta, akernel=args.akernel, test_size=args.test_size, splits=args.splits, printlevel=args.printlevel, adaptive=args.adaptive)
137+
errors = hyperparameters(X, y, read_kernel=args.readk, sigma=args.sigma, eta=args.eta, akernel=args.akernel, sparse=args.sparse,
138+
test_size=args.test_size, splits=args.splits, printlevel=args.printlevel, adaptive=args.adaptive)
122139

123140
print()
124141
print('error stdev eta sigma')

qstack/regression/kernel_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ def __call__(self, parser, namespace, values, option_string=None):
3030
train_size=[0.125, 0.25, 0.5, 0.75, 1.0],
3131
etaarr=list(numpy.logspace(-10, 0, 5)),
3232
sigmaarr=list(numpy.logspace(0,6, 13)),
33-
sigmaarr_mult=list(numpy.logspace(0,2, 5))
33+
sigmaarr_mult=list(numpy.logspace(0,2, 5)),
34+
random_state=0,
3435
)
3536

3637

qstack/regression/regression.py

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,36 @@
55
from sklearn.model_selection import train_test_split
66
from qstack.regression.kernel_utils import get_kernel, defaults, ParseKwargs
77
from qstack.tools import correct_num_threads
8+
from qstack.math.fps import do_fps
89

9-
def regression(X, y, read_kernel=False, sigma=defaults.sigma, eta=defaults.eta, akernel=defaults.kernel, gkernel=defaults.gkernel, gdict=defaults.gdict, test_size=defaults.test_size, train_size=defaults.train_size, n_rep=defaults.n_rep, debug=False):
10+
11+
def regression(X, y, read_kernel=False, sigma=defaults.sigma, eta=defaults.eta,
12+
akernel=defaults.kernel, gkernel=defaults.gkernel, gdict=defaults.gdict,
13+
test_size=defaults.test_size, train_size=defaults.train_size, n_rep=defaults.n_rep,
14+
random_state=defaults.random_state,
15+
sparse=None, debug=False):
1016
"""
11-
17+
1218
.. todo::
1319
Write the docstring
1420
"""
1521
if read_kernel is False:
1622
kernel = get_kernel(akernel, [gkernel, gdict])
17-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
23+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
1824
K_all = kernel(X_train, X_train, 1.0/sigma)
1925
Ks_all = kernel(X_test, X_train, 1.0/sigma)
2026
else:
21-
idx_train, idx_test, y_train, y_test = train_test_split(np.arange(len(y)), y, test_size=test_size, random_state=0)
27+
idx_train, idx_test, y_train, y_test = train_test_split(np.arange(len(y)), y, test_size=test_size, random_state=random_state)
2228
K_all = X[np.ix_(idx_train,idx_train)]
2329
Ks_all = X[np.ix_(idx_test, idx_train)]
2430

25-
K_all[np.diag_indices_from(K_all)] += eta
2631
all_indices_train = np.arange(len(y_train))
32+
if not sparse:
33+
K_all[np.diag_indices_from(K_all)] += eta
34+
else:
35+
if read_kernel:
36+
raise RuntimeError('Cannot do FPS with kernels')
37+
sparse_idx = do_fps(X_train)[0][:sparse]
2738

2839
if debug:
2940
np.random.seed(666)
@@ -35,40 +46,55 @@ def regression(X, y, read_kernel=False, sigma=defaults.sigma, eta=defaults.eta,
3546
for rep in range(n_rep):
3647
train_idx = np.random.choice(all_indices_train, size = size_train, replace=False)
3748
y_kf_train = y_train[train_idx]
38-
K = K_all [np.ix_(train_idx,train_idx)]
39-
Ks = Ks_all[:,train_idx]
40-
alpha = scipy.linalg.solve(K, y_kf_train, assume_a='pos')
49+
50+
if not sparse:
51+
K_solve = K_all [np.ix_(train_idx,train_idx)]
52+
y_solve = y_kf_train
53+
Ks = Ks_all[:,train_idx]
54+
else:
55+
K_NM = K_all [np.ix_(train_idx,sparse_idx)]
56+
K_solve = K_NM.T @ K_NM
57+
K_solve[np.diag_indices_from(K_solve)] += eta
58+
y_solve = K_NM.T @ y_kf_train
59+
Ks = Ks_all[:,sparse_idx]
60+
61+
alpha = scipy.linalg.solve(K_solve, y_solve, assume_a='pos')
4162
y_kf_predict = np.dot(Ks, alpha)
4263
maes.append(np.mean(np.abs(y_test-y_kf_predict)))
64+
4365
maes_all.append((size_train, np.mean(maes), np.std(maes)))
4466
return maes_all
4567

68+
4669
def main():
4770
import argparse
4871
parser = argparse.ArgumentParser(description='This program computes the learning curve.')
49-
parser.add_argument('--x', type=str, dest='repr', required=True, help='path to the representations file')
50-
parser.add_argument('--y', type=str, dest='prop', required=True, help='path to the properties file')
51-
parser.add_argument('--test', type=float, dest='test_size', default=defaults.test_size, help='test set fraction (default='+str(defaults.test_size)+')')
52-
parser.add_argument('--eta', type=float, dest='eta', default=defaults.eta, help='eta hyperparameter (default='+str(defaults.eta)+')')
53-
parser.add_argument('--sigma', type=float, dest='sigma', default=defaults.sigma, help='sigma hyperparameter (default='+str(defaults.sigma)+')')
54-
parser.add_argument('--akernel', type=str, dest='akernel', default=defaults.kernel, help='local kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')')
55-
parser.add_argument('--gkernel', type=str, dest='gkernel', default=defaults.gkernel, help='global kernel type (avg for average kernel, rem for REMatch kernel) (default '+str(defaults.gkernel)+')')
56-
parser.add_argument('--gdict', nargs='*', action=ParseKwargs, dest='gdict', default=defaults.gdict, help='dictionary like input string to initialize global kernel parameters')
57-
parser.add_argument('--splits', type=int, dest='splits', default=defaults.n_rep, help='number of splits (default='+str(defaults.n_rep)+')')
58-
parser.add_argument('--train', type=float, dest='train_size', default=defaults.train_size, nargs='+', help='training set fractions')
59-
parser.add_argument('--debug', action='store_true', dest='debug', default=False, help='enable debug')
60-
parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the numper of threads')
61-
parser.add_argument('--readkernel', action='store_true', dest='readk', default=False, help='if X is kernel')
72+
parser.add_argument('--x', type=str, dest='repr', required=True, help='path to the representations file')
73+
parser.add_argument('--y', type=str, dest='prop', required=True, help='path to the properties file')
74+
parser.add_argument('--test', type=float, dest='test_size', default=defaults.test_size, help='test set fraction (default='+str(defaults.test_size)+')')
75+
parser.add_argument('--eta', type=float, dest='eta', default=defaults.eta, help='eta hyperparameter (default='+str(defaults.eta)+')')
76+
parser.add_argument('--sigma', type=float, dest='sigma', default=defaults.sigma, help='sigma hyperparameter (default='+str(defaults.sigma)+')')
77+
parser.add_argument('--akernel', type=str, dest='akernel', default=defaults.kernel, help='local kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')')
78+
parser.add_argument('--gkernel', type=str, dest='gkernel', default=defaults.gkernel, help='global kernel type (avg for average kernel, rem for REMatch kernel) (default '+str(defaults.gkernel)+')')
79+
parser.add_argument('--gdict', nargs='*', action=ParseKwargs, dest='gdict', default=defaults.gdict, help='dictionary like input string to initialize global kernel parameters')
80+
parser.add_argument('--splits', type=int, dest='splits', default=defaults.n_rep, help='number of splits (default='+str(defaults.n_rep)+')')
81+
parser.add_argument('--train', type=float, dest='train_size', default=defaults.train_size, nargs='+', help='training set fractions')
82+
parser.add_argument('--debug', action='store_true', dest='debug', default=False, help='enable debug')
83+
parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the numper of threads')
84+
parser.add_argument('--readkernel', action='store_true', dest='readk', default=False, help='if X is kernel')
85+
parser.add_argument('--sparse', type=int, dest='sparse', default=None, help='regression basis size for sparse learning')
86+
parser.add_argument('--random_state', type=int, dest='random_state', default=defaults.random_state, help='random state for test / train splitting')
6287
args = parser.parse_args()
6388
print(vars(args))
6489
if(args.ll): correct_num_threads()
6590
X = np.load(args.repr)
6691
y = np.loadtxt(args.prop)
6792
maes_all = regression(X, y, read_kernel=args.readk, sigma=args.sigma, eta=args.eta, akernel=args.akernel,
68-
test_size=args.test_size, train_size=args.train_size, n_rep=args.splits, debug=args.debug)
93+
test_size=args.test_size, train_size=args.train_size, n_rep=args.splits, sparse=args.sparse,
94+
debug=args.debug)
6995
for size_train, meanerr, stderr in maes_all:
7096
print("%d\t%e\t%e" % (size_train, meanerr, stderr))
7197

98+
7299
if __name__ == "__main__":
73100
main()
74-

0 commit comments

Comments
 (0)