cross validated results #53

Merged (21 commits) on Jun 6, 2024
113 changes: 113 additions & 0 deletions qstack/regression/cross_validate_results.py
@@ -0,0 +1,113 @@
#!/usr/bin/env python3

import sys
import numpy as np
import scipy
from sklearn.model_selection import train_test_split, KFold
from qstack.regression.kernel_utils import get_kernel, defaults, ParseKwargs
from qstack.regression.hyperparameters import hyperparameters
from qstack.regression.regression import regression
from qstack.tools import correct_num_threads
import qstack.spahm.rho.utils as utils



def cv_results(X, y,
               sigmaarr=defaults.sigmaarr, etaarr=defaults.etaarr, gkernel=defaults.gkernel,
               gdict=defaults.gdict, akernel=defaults.kernel, test_size=defaults.test_size,
               train_size=defaults.train_size, splits=defaults.splits, printlevel=0,
               adaptive=False, read_kernel=False, n_rep=defaults.n_rep, save=False,
               preffix='unknown', save_pred=False, progress=False, sparse=None,
               debug=None):

    hyper_runs = []
    lc_runs = []
    seeds = np.arange(n_rep)
    if save_pred: predictions_n = []
    if progress:
        import tqdm
        seeds = tqdm.tqdm(seeds)
    for seed, n in zip(seeds, range(n_rep)):
        error = hyperparameters(X, y, read_kernel=False, sigma=sigmaarr, eta=etaarr,
                                akernel=akernel, test_size=test_size, splits=splits,
                                printlevel=printlevel, adaptive=adaptive, random_state=seed,
                                sparse=sparse)
        mae, stdev, eta, sigma = zip(*error)
        maes_all = regression(X, y, read_kernel=False, sigma=sigma[-1], eta=eta[-1],
                              akernel=akernel, test_size=test_size, train_size=train_size,
                              n_rep=1, debug=debug, save_pred=save_pred,  # what about debug?
                              sparse=sparse, random_state=seed)
        if save_pred:
            res, pred = maes_all[1]
            maes_all = maes_all[0]
            predictions_n.append((res, pred))
        ind = np.argsort(error[:,3])
        error = error[ind]
        ind = np.argsort(error[:,2])
        error = error[ind]
        hyper_runs.append(error)
        lc_runs.append(maes_all)
    lc_runs = np.array(lc_runs)
    hyper_runs = np.array(hyper_runs, dtype=object)
    lc = list(zip(lc_runs[:,:,0].mean(axis=0), lc_runs[:,:,1].mean(axis=0), lc_runs[:,:,1].std(axis=0), lc_runs[:,:,3].mean(axis=0)))
    lc = np.array(lc)
    if save == True:
        np.save(f"{preffix}_{n_rep}-hyper-runs.npy", hyper_runs)
        np.save(f"{preffix}_{n_rep}-lc-runs.npy", lc_runs)
    if save_pred == True:
        np_pred = np.array(predictions_n)
        ##### Cannot take means: the test set varies from run to run!
        ##### pred_mean = np.concatenate([np_pred.mean(axis=0),np_pred.std(axis=0)[1].reshape((1,-1))], axis=0)
        pred_mean = np.concatenate([*np_pred.reshape((n_rep, 2, -1))], axis=0)
        np.savetxt(f"{preffix}_{n_rep}-predictions.txt", pred_mean.T)
    return lc


def main():
    import argparse
    parser = argparse.ArgumentParser(description='This program runs a full cross-validation of the learning curves (hyperparameter search included).')
    parser.add_argument('--x', type=str, dest='repr', required=True, help='path to the representations file')
    parser.add_argument('--y', type=str, dest='prop', required=True, help='path to the properties file')
    parser.add_argument('--test', type=float, dest='test_size', default=defaults.test_size, help='test set fraction (default='+str(defaults.test_size)+')')
    parser.add_argument('--train', type=float, dest='train_size', default=defaults.train_size, nargs='+', help='training set fractions')
    parser.add_argument('--akernel', type=str, dest='akernel', default=defaults.kernel, help='local kernel type (G for Gaussian, L for Laplacian, myL for Laplacian for open-shell systems) (default '+defaults.kernel+')')
    parser.add_argument('--gkernel', type=str, dest='gkernel', default=defaults.gkernel, help='global kernel type (avg for average kernel, rem for REMatch kernel) (default '+str(defaults.gkernel)+')')
    parser.add_argument('--gdict', nargs='*', action=ParseKwargs, dest='gdict', default=defaults.gdict, help='dictionary-like input string to initialize global kernel parameters')
    parser.add_argument('--splits', type=int, dest='splits', default=defaults.splits, help='k in k-fold cross validation (default='+str(defaults.splits)+')')
    parser.add_argument('--n', type=int, dest='n_rep', default=defaults.n_rep, help='number of cross-validation repetitions (default='+str(defaults.n_rep)+')')
    parser.add_argument('--print', type=int, dest='printlevel', default=0, help='printlevel')
    parser.add_argument('--eta', type=float, dest='eta', default=defaults.etaarr, nargs='+', help='eta array')
    parser.add_argument('--sigma', type=float, dest='sigma', default=defaults.sigmaarr, nargs='+', help='sigma array')
    parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the number of threads')
    parser.add_argument('--save', action='store_true', dest='save_all', default=False, help='if saving intermediate results in .npy files')
    parser.add_argument('--ada', action='store_true', dest='adaptive', default=False, help='if adapt sigma')
    parser.add_argument('--save-pred', action='store_true', dest='save_pred', default=False, help='if save test-set predictions')
    parser.add_argument('--readkernel', action='store_true', dest='readk', default=False, help='if X is a kernel')
    parser.add_argument('--sparse', type=int, dest='sparse', default=None, help='regression basis size for sparse learning')
    parser.add_argument('--name', type=str, dest='nameout', required=True, help='the name of the output file')
    parser.add_argument('--select', type=str, dest='f_select', required=False, help='a txt file containing the indices of the selected representations')
    parser.add_argument('--debug', action='store_true', dest='debug', default=False, help='enable debug')
    args = parser.parse_args()
    if(args.readk): args.sigma = [np.nan]
    if(args.ll): correct_num_threads()

    # Remove the extension so intermediate results are saved with proper filenames
    args.nameout = '-'.join(args.nameout.split('.')[:-1])

    X = np.load(args.repr)
    y = np.loadtxt(args.prop)
    if args.f_select is not None:
        selected = np.loadtxt(args.f_select, dtype=int)
        X = X[selected]
        y = y[selected]
        args.nameout = args.nameout+'_'+args.f_select.split('.')[-2]
    print(vars(args))
    final = cv_results(X, y, sigmaarr=args.sigma, etaarr=args.eta, akernel=args.akernel,
                       test_size=args.test_size, splits=args.splits, printlevel=args.printlevel,
                       adaptive=args.adaptive, train_size=args.train_size, n_rep=args.n_rep,
                       preffix=args.nameout, save=args.save_all, save_pred=args.save_pred,
                       sparse=args.sparse, progress=True, debug=args.debug)
    print(final)
    np.savetxt(args.nameout+'.txt', final)


if __name__ == '__main__': main()
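
A minimal usage sketch for the new cv_results() entry point, assuming qstack with this branch is installed (the module path follows the file added above). X and y are random stand-ins for real representations and properties, and the keyword values are illustrative only:

import numpy as np
from qstack.regression.cross_validate_results import cv_results

rng = np.random.default_rng(0)
X = rng.random((200, 16))    # 200 samples with a 16-dimensional representation each
y = rng.random(200)          # target property

# Averaged learning curve over n_rep shuffled train/test splits;
# columns are (train size, mean MAE, std of MAE over runs, mean R2).
lc = cv_results(X, y,
                sigmaarr=[1.0, 10.0], etaarr=[1e-6, 1e-3],
                test_size=0.2, train_size=[0.25, 0.5, 1.0],
                n_rep=3, save=False, progress=False)
print(lc)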
17 changes: 13 additions & 4 deletions qstack/regression/hyperparameters.py
@@ -11,7 +11,7 @@
def hyperparameters(X, y,
                    sigma=defaults.sigmaarr, eta=defaults.etaarr, gkernel=defaults.gkernel, gdict=defaults.gdict,
                    akernel=defaults.kernel, test_size=defaults.test_size, splits=defaults.splits,
-                   printlevel=0, adaptive=False, read_kernel=False, sparse=None):
+                   printlevel=0, adaptive=False, read_kernel=False, sparse=None, random_state=0):
    """

    .. todo::
@@ -67,9 +67,9 @@ def hyper_loop(sigma, eta):
    gwrap = [gkernel, gdict]
    kernel = get_kernel(akernel, gwrap)
    if read_kernel is False:
-       X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)
+       X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    else:
-       idx_train, idx_test, y_train, y_test = train_test_split(np.arange(len(y)), y, test_size=test_size, random_state=0)
+       idx_train, idx_test, y_train, y_test = train_test_split(np.arange(len(y)), y, test_size=test_size, random_state=random_state)
        X_train = X[np.ix_(idx_train,idx_train)]
        sigma = [np.nan]

@@ -127,16 +127,25 @@ def main():
    parser.add_argument('--ada', action='store_true', dest='adaptive', default=False, help='if adapt sigma')
    parser.add_argument('--readkernel', action='store_true', dest='readk', default=False, help='if X is kernel')
    parser.add_argument('--sparse', type=int, dest='sparse', default=None, help='regression basis size for sparse learning')
+   parser.add_argument('--name', type=str, dest='nameout', required=False, default=None, help='the name of the output file')
+   parser.add_argument('--select', type=str, dest='f_select', required=False, help='a txt file containing the indices of the selected representations')
    args = parser.parse_args()
    if(args.readk): args.sigma = [np.nan]
    print(vars(args))
    if(args.ll): correct_num_threads()

    X = np.load(args.repr)
    y = np.loadtxt(args.prop)
+   if args.f_select != None:
+       selected = np.loadtxt(args.f_select, dtype=int)
+       X = X[selected]
+       y = y[selected]

    errors = hyperparameters(X, y, read_kernel=args.readk, sigma=args.sigma, eta=args.eta, akernel=args.akernel, sparse=args.sparse,
                             test_size=args.test_size, splits=args.splits, printlevel=args.printlevel, adaptive=args.adaptive)

+   errors = np.array(errors)
+   if args.nameout is not None:
+       np.savetxt(args.nameout, errors, header="error stdev eta sigma")
    print()
    print('error stdev eta sigma')
    for error in errors:
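
The only functional change to hyperparameters() is that the train/test split is now seeded by the caller instead of being fixed to random_state=0, which is what lets cv_results() average over genuinely different splits. A standalone sketch of that behaviour using only scikit-learn (shapes and seeds are arbitrary):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(10, 3)
y = np.arange(10)

for seed in range(3):
    # Same call as in hyperparameters() when read_kernel is False
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    print(seed, y_test)   # a different test subset for each seed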
3 changes: 3 additions & 0 deletions qstack/regression/kernel_utils.py
@@ -267,6 +267,9 @@ def get_local_kernel(arg):
        return my_laplacian_kernel
    elif arg in ['myLfast', 'myG']:
        return my_kernel_c(arg)
+   elif arg=='cosine':
+       from sklearn.metrics.pairwise import cosine_similarity
+       return lambda x,y,s: cosine_similarity(x, y)
    else:
        raise Exception(f'{arg} kernel is not implemented') # TODO

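
The new 'cosine' branch returns a callable with the same (x, y, sigma) signature as the other local kernels but ignores sigma, since cosine similarity has no length-scale parameter. A standalone sketch with arbitrary arrays:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

cosine_kernel = lambda x, y, s: cosine_similarity(x, y)

A = np.random.rand(4, 8)    # 4 samples, 8 features each
B = np.random.rand(5, 8)

K = cosine_kernel(A, B, None)   # the sigma argument is accepted but unused
print(K.shape)                  # (4, 5), entries in [-1, 1]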
25 changes: 18 additions & 7 deletions qstack/regression/regression.py
@@ -3,16 +3,16 @@
import numpy as np
import scipy
from sklearn.model_selection import train_test_split
+from sklearn.metrics import r2_score
from qstack.regression.kernel_utils import get_kernel, defaults, ParseKwargs
from qstack.tools import correct_num_threads
from qstack.mathutils.fps import do_fps


def regression(X, y, read_kernel=False, sigma=defaults.sigma, eta=defaults.eta,
               akernel=defaults.kernel, gkernel=defaults.gkernel, gdict=defaults.gdict,
               test_size=defaults.test_size, train_size=defaults.train_size, n_rep=defaults.n_rep,
               random_state=defaults.random_state,
-              sparse=None, debug=False):
+              sparse=None, debug=False, save_pred=False):
"""

.. todo::
Expand All @@ -37,12 +37,14 @@ def regression(X, y, read_kernel=False, sigma=defaults.sigma, eta=defaults.eta,
        sparse_idx = do_fps(X_train)[0][:sparse]

    if debug:
+       # Ensures reproducibility of the sample selection for each train_size over repetitions (n_rep)
        np.random.seed(666)

    maes_all = []
    for size in train_size:
        size_train = int(np.floor(len(y_train)*size)) if size <= 1.0 else size
        maes = []
+       r2_scores = []
        for rep in range(n_rep):
            train_idx = np.random.choice(all_indices_train, size = size_train, replace=False)
            y_kf_train = y_train[train_idx]
@@ -61,9 +63,9 @@
            alpha = scipy.linalg.solve(K_solve, y_solve, assume_a='pos')
            y_kf_predict = np.dot(Ks, alpha)
            maes.append(np.mean(np.abs(y_test-y_kf_predict)))
-
-       maes_all.append((size_train, np.mean(maes), np.std(maes)))
-   return maes_all
+           r2_scores.append(r2_score(y_test, y_kf_predict))
+       maes_all.append((size_train, np.mean(maes), np.std(maes), np.mean(r2_scores)))
+   return maes_all if not save_pred else (maes_all, (y_test, y_kf_predict))


def main():
@@ -83,17 +85,26 @@ def main():
    parser.add_argument('--ll', action='store_true', dest='ll', default=False, help='if correct for the number of threads')
    parser.add_argument('--readkernel', action='store_true', dest='readk', default=False, help='if X is a kernel')
    parser.add_argument('--sparse', type=int, dest='sparse', default=None, help='regression basis size for sparse learning')
-   parser.add_argument('--random_state', type=int, dest='random_state', default=defaults.random_state, help='random state for test / train splitting')
+   parser.add_argument('--random_state', type=int, dest='random_state', default=defaults.random_state, help='seed for the numpy.random.RandomState for the test / train split generator')
+   parser.add_argument('--select', type=str, dest='f_select', required=False, help='a txt file containing the indices of the selected representations')
+   parser.add_argument('--name', type=str, dest='nameout', required=False, default=None, help='the name of the output file containing the LC data (.txt)')
    args = parser.parse_args()
    print(vars(args))
    if(args.ll): correct_num_threads()
    X = np.load(args.repr)
    y = np.loadtxt(args.prop)
+   if args.f_select != None:
+       selected = np.loadtxt(args.f_select, dtype=int)
+       X = X[selected]
+       y = y[selected]
    maes_all = regression(X, y, read_kernel=args.readk, sigma=args.sigma, eta=args.eta, akernel=args.akernel,
                          test_size=args.test_size, train_size=args.train_size, n_rep=args.splits, sparse=args.sparse,
-                         debug=args.debug)
+                         debug=args.debug, random_state=args.random_state)
    for size_train, meanerr, stderr in maes_all:
        print("%d\t%e\t%e" % (size_train, meanerr, stderr))
+   maes_all = np.array(maes_all)
+   if args.nameout is not None:
+       np.savetxt(args.nameout, maes_all, header="size_train, meanerr, stderr")


if __name__ == "__main__":
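
Two behavioural changes in regression(): each repetition now also records an R^2 score next to the MAE, so the learning-curve tuples gain a fourth column, and with save_pred=True the function returns (maes_all, (y_test, y_kf_predict)) instead of maes_all alone (the predictions of the last repetition; cv_results calls it with n_rep=1, so that is the only repetition). A standalone sketch with synthetic numbers:

import numpy as np
from sklearn.metrics import r2_score

y_test = np.array([1.0, 2.0, 3.0, 4.0])
y_kf_predict = np.array([1.1, 1.9, 3.2, 3.8])

mae = np.mean(np.abs(y_test - y_kf_predict))
r2 = r2_score(y_test, y_kf_predict)

# One learning-curve entry after the change: (train size, mean MAE, std MAE, mean R2)
maes_all = [(100, mae, 0.0, r2)]

save_pred = True
result = maes_all if not save_pred else (maes_all, (y_test, y_kf_predict))
print(result)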