diff --git a/dft_comparison/dft_comparison_with_GPR.py b/dft_comparison/dft_comparison_with_GPR.py
index de17fb9..c0e46fc 100644
--- a/dft_comparison/dft_comparison_with_GPR.py
+++ b/dft_comparison/dft_comparison_with_GPR.py
@@ -16,15 +16,15 @@ from kernels import Tanimoto
-def main(path, path_to_dft_dataset, task, representation, theory_level):
+def main(path, path_to_dft_dataset, representation, theory_level):
     """
     :param path: str specifying path to photoswitches.csv file.
     :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
-    :param task: str specifying the task. e_iso_pi only supported task for the TD-DFT comparison.
     :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
     :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0']
     """
+    task = 'e_iso_pi'  # e_iso_pi only task supported for TD-DFT comparison
     data_loader = TaskDataLoader(task, path)
     smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset)
@@ -145,8 +145,6 @@ def objective_closure():
                         help='Path to the photoswitches.csv file.')
     parser.add_argument('-pd', '--path_to_dft_dataset', type=str, default='../dataset/dft_comparison.csv',
                         help='str giving path to dft_comparison.csv file')
-    parser.add_argument('-t', '--task', type=str, default='e_iso_pi',
-                        help='str specifying the task. e_iso_pi only task supported for the TD-DFT comparison.')
     parser.add_argument('-r', '--representation', type=str, default='fragprints',
                         help='str specifying the molecular representation. '
                              'One of [fingerprints, fragments, fragprints].')
@@ -155,4 +153,4 @@ def objective_closure():
     args = parser.parse_args()
-    main(args.path, args.path_to_dft_dataset, args.task, args.representation, args.theory_level)
+    main(args.path, args.path_to_dft_dataset, args.representation, args.theory_level)
diff --git a/dft_comparison/dft_comparison_with_multioutput_GPR.py b/dft_comparison/dft_comparison_with_multioutput_GPR.py
new file mode 100644
index 0000000..1170c5c
--- /dev/null
+++ b/dft_comparison/dft_comparison_with_multioutput_GPR.py
@@ -0,0 +1,204 @@
+# Author: Ryan-Rhys Griffiths
+"""
+Property prediction comparison against DFT error. 99 molecules with DFT-computed values at the CAM-B3LYP level of
+theory and 114 molecules with DFT-computed values at the PBE0 level of theory.
+"""
+
+import argparse
+
+import gpflow
+from gpflow.ci_utils import ci_niter
+from gpflow.mean_functions import Constant
+from gpflow.utilities import print_summary
+import numpy as np
+from sklearn.metrics import mean_squared_error
+
+from data_utils import TaskDataLoader, featurise_mols
+from kernels import Tanimoto
+
+
+def main(path, path_to_dft_dataset, representation, theory_level):
+    """
+    :param path: str specifying path to photoswitches.csv file.
+    :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
+    :param representation: str specifying the molecular representation. 
One of ['fingerprints, 'fragments', 'fragprints'] + :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0'] + """ + + task = 'e_iso_pi' # e_iso_pi only task supported for TD-DFT comparison + data_loader = TaskDataLoader(task, path) + smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset) + + X = featurise_mols(smiles_list, representation) + + # Keep only non-duplicate entries because we're not considering effects of solvent + + non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) + X = X[non_duplicate_indices, :] + experimental_vals = experimental_vals[non_duplicate_indices] + non_dup_pbe0 = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) + non_dup_cam = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]]) + pbe0_vals = pbe0_vals[non_dup_pbe0] + cam_vals = cam_vals[non_dup_cam] + + # molecules with dft values to be split into train/test + if theory_level == 'CAM-B3LYP': + X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0) + y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals))) + # DFT values for the CAM-B3LYP level of theory + dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals))) + # molecules with no dft vals must go into the training set. + X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0) + y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals))) + else: + X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0) + y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals))) + # DFT values for the PBE0 level of theory + dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals))) + # molecules with no dft vals must go into the training set. + X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0) + y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals))) + + # Load in the other property values for multitask learning. e_iso_pi is a always the task in this instance. + + data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) + data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) + data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) + + smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data() + smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() + smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() + + y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) + y_e_iso_n = y_e_iso_n.reshape(-1, 1) + y_z_iso_n = y_z_iso_n.reshape(-1, 1) + + X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) + X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) + X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) + + output_dim = 4 # Number of outputs + rank = 1 # Rank of W + feature_dim = len(X_no_dft[0, :]) + + tanimoto_active_dims = [i for i in range(feature_dim)] # active dims for Tanimoto base kernel. 
+ + mae_list = [] + dft_mae_list = [] + + # We define the Gaussian Process optimisation objective + + m = None + + def objective_closure(): + return -m.log_marginal_likelihood() + + print('\nBeginning training loop...') + + for i in range(len(y_with_dft)): + + X_train = np.delete(X_with_dft, i, axis=0) + y_train = np.delete(y_with_dft, i) + X_test = X_with_dft[i].reshape(1, -1) + y_test = y_with_dft[i] + dft_test = dft_vals[i] + + X_train = np.concatenate((X_train, X_no_dft)) + y_train = np.concatenate((y_train, y_no_dft)) + y_train = y_train.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + X_train = X_train.astype(np.float64) + X_test = X_test.astype(np.float64) + + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1), + np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), + np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), + np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1) + X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1) + + # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))), + np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), + np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), + np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) + + y_test = np.hstack((y_test, np.zeros_like(y_test))) + + # Base kernel + k = Tanimoto(active_dims=tanimoto_active_dims) + #set_trainable(k.variance, False) + + # Coregion kernel + coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim]) + + # Create product kernel + kern = k * coreg + + # This likelihood switches between Gaussian noise with different variances for each f_i: + lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), + gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()]) + + # now build the GP model as normal + m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, likelihood=lik) + + # fit the covariance function parameters + maxiter = ci_niter(1000) + gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B",) + print_summary(m) + + # Output Standardised RMSE and RMSE on Train Set + + y_pred_train, _ = m.predict_f(X_train) + train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) + train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) + print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) + print("Train RMSE: {:.3f}".format(train_rmse)) + + # mean and variance GP prediction + + y_pred, y_var = m.predict_f(X_test) + + # Output MAE for this trial + + mae = abs(y_test[:, 0] - y_pred) + + print("MAE: {}".format(mae)) + + # Store values in order to compute the mean and standard error of the statistics across trials + + mae_list.append(mae) + + # DFT prediction scores on the same trial + + dft_mae = abs(y_test[:, 0] - dft_test) + + dft_mae_list.append(dft_mae) + + mae_list = np.array(mae_list) + dft_mae_list = np.array(dft_mae_list) + + print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list)))) + print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, 
np.mean(dft_mae_list), np.std(dft_mae_list)/np.sqrt(len(dft_mae_list)))) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + + parser.add_argument('-p', '--path', type=str, default='../dataset/photoswitches.csv', + help='Path to the photoswitches.csv file.') + parser.add_argument('-pd', '--path_to_dft_dataset', type=str, default='../dataset/dft_comparison.csv', + help='str giving path to dft_comparison.csv file') + parser.add_argument('-r', '--representation', type=str, default='fragprints', + help='str specifying the molecular representation. ' + 'One of [fingerprints, fragments, fragprints].') + parser.add_argument('-th', '--theory_level', type=str, default='PBE0', + help='level of theory to compare against - CAM-B3LYP or PBE0 [CAM-B3LYP, PBE0]') + + args = parser.parse_args() + + main(args.path, args.path_to_dft_dataset, args.representation, args.theory_level) diff --git a/dft_comparison/dft_k_fold_comparison_with_GPR.py b/dft_comparison/dft_k_fold_comparison_with_GPR.py index 8ec54cc..c21a5e8 100644 --- a/dft_comparison/dft_k_fold_comparison_with_GPR.py +++ b/dft_comparison/dft_k_fold_comparison_with_GPR.py @@ -73,7 +73,7 @@ def objective_closure(): for i in range(5): - X_train, X_test, y_train, y_test = train_test_split(X_with_dft, y_with_dft, test_size=0.6, random_state=i) + X_train, X_test, y_train, y_test = train_test_split(X_with_dft, y_with_dft, test_size=0.5, random_state=i) X_dud, _, _, dft_test = train_test_split(X_with_dft, dft_vals, test_size=0.6, random_state=i) X_train = np.concatenate((X_train, X_no_dft)) diff --git a/human_comparison/human_performance_comparison.py b/human_comparison/human_performance_comparison.py index 9535d57..59cc3e2 100644 --- a/human_comparison/human_performance_comparison.py +++ b/human_comparison/human_performance_comparison.py @@ -1,7 +1,7 @@ # Copyright Ryan-Rhys Griffiths and Aditya Raymond Thawani 2020 # Author: Ryan-Rhys Griffiths """ -Script for comparing against human performance on a set of 5 molecules. +Script for comparing against human performance on a set of 5 molecules with Tanimoto GP. """ import argparse @@ -16,13 +16,13 @@ from kernels import Tanimoto -def main(path, task, representation): +def main(path, representation): """ :param path: str specifying path to dataset. - :param task: str specifying the task. Always e_iso_pi in the case of the human performance comparison :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] """ + task = 'e_iso_pi' # Always e_iso_pi for human performance comparison data_loader = TaskDataLoader(task, path) smiles_list, y = data_loader.load_property_data() X = featurise_mols(smiles_list, representation) @@ -112,12 +112,10 @@ def objective_closure(): parser.add_argument('-p', '--path', type=str, default='../dataset/photoswitches.csv', help='Path to the photoswitches.csv file.') - parser.add_argument('-t', '--task', type=str, default='e_iso_pi', - help='str specifying the task. Always e_iso_pi in the case of the human performance comparison') parser.add_argument('-r', '--representation', type=str, default='fragprints', help='str specifying the molecular representation. 
' 'One of [fingerprints, fragments, fragprints].') args = parser.parse_args() - main(args.path, args.task, args.representation) + main(args.path, args.representation) diff --git a/human_comparison/human_performance_comparison_MOGP.py b/human_comparison/human_performance_comparison_MOGP.py new file mode 100644 index 0000000..2a094a0 --- /dev/null +++ b/human_comparison/human_performance_comparison_MOGP.py @@ -0,0 +1,167 @@ +# Copyright Ryan-Rhys Griffiths and Aditya Raymond Thawani 2020 +# Author: Ryan-Rhys Griffiths +""" +Script for comparing against human performance on a set of 5 molecules with Tanimoto MOGP. +""" + +import argparse + +import gpflow +from gpflow.ci_utils import ci_niter +from gpflow.mean_functions import Constant +from gpflow.utilities import print_summary +import numpy as np +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error + +from data_utils import transform_data, TaskDataLoader, featurise_mols +from kernels import Tanimoto + + +def main(path, representation): + """ + :param path: str specifying path to dataset. + :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] + """ + + task = 'e_iso_pi' # task always e_iso_pi with human performance comparison + data_loader = TaskDataLoader(task, path) + smiles_list, y = data_loader.load_property_data() + X = featurise_mols(smiles_list, representation) + + # 5 test molecules + + test_smiles = ['BrC1=CC=C(/N=N/C2=CC=CC=C2)C=C1', + 'O=[N+]([O-])C1=CC=C(/N=N/C2=CC=CC=C2)C=C1', + 'CC(C=C1)=CC=C1/N=N/C2=CC=C(N(C)C)C=C2', + 'BrC1=CC([N+]([O-])=O)=CC([N+]([O-])=O)=C1/N=N/C2=CC([H])=C(C=C2[H])N(CC)CC', + 'ClC%11=CC([N+]([O-])=O)=CC(C#N)=C%11/N=N/C%12=CC([H])=C(C=C%12OC)N(CC)CC'] + + # and their indices in the loaded data + test_smiles_indices = [116, 131, 168, 221, 229] + + X_train = np.delete(X, np.array(test_smiles_indices), axis=0) + y_train = np.delete(y, np.array(test_smiles_indices)) + X_test = X[[116, 131, 168, 221, 229]] + + # experimental wavelength values in EtOH. Main csv file has 400nm instead of 407nm because measurement was + # under a different solvent + y_test = y[[116, 131, 168, 221, 229]] + y_test[2] = 407. + + y_train = y_train.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + # # We standardise the outputs but leave the inputs unchanged + # + # _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test) + + X_train = X_train.astype(np.float64) + X_test = X_test.astype(np.float64) + + data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) + data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) + data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) + + smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data() + smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() + smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() + + y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) + y_e_iso_n = y_e_iso_n.reshape(-1, 1) + y_z_iso_n = y_z_iso_n.reshape(-1, 1) + + X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) + X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) + X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) + + output_dim = 4 # Number of outputs + rank = 1 # Rank of W + feature_dim = len(X_train[0, :]) + + tanimoto_active_dims = [i for i in range(feature_dim)] # active dims for Tanimoto base kernel. 
+ + # We define the Gaussian Process Regression Model using the Tanimoto kernel + + m = None + + def objective_closure(): + return -m.log_marginal_likelihood() + + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1), + np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), + np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), + np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1) + X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1) + + # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))), + np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), + np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), + np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) + + y_test = np.hstack((y_test, np.zeros_like(y_test))) + + # Base kernel + k = Tanimoto(active_dims=tanimoto_active_dims) + # set_trainable(k.variance, False) + + # Coregion kernel + coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim]) + + # Create product kernel + kern = k * coreg + + # This likelihood switches between Gaussian noise with different variances for each f_i: + lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), + gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()]) + + # now build the GP model as normal + m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, + likelihood=lik) + + # fit the covariance function parameters + maxiter = ci_niter(1000) + gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), + method="L-BFGS-B", ) + print_summary(m) + + # mean and variance GP prediction + + y_pred, y_var = m.predict_f(X_test) + + # Output Standardised RMSE and RMSE on Train Set + + y_pred_train, _ = m.predict_f(X_train) + train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) + train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) + print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) + print("Train RMSE: {:.3f}".format(train_rmse)) + + r2 = r2_score(y_test[:, 0], y_pred) + rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred)) + mae = mean_absolute_error(y_test[:, 0], y_pred) + per_molecule = np.diag(abs(y_pred - y_test[:, 0])) + + print("\n Averaged test statistics are") + print("\nR^2: {:.3f}".format(r2)) + print("RMSE: {:.3f}".format(rmse)) + print("MAE: {:.3f}".format(mae)) + print("\nAbsolute error per molecule is {} ".format(per_molecule)) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + + parser.add_argument('-p', '--path', type=str, default='../dataset/photoswitches.csv', + help='Path to the photoswitches.csv file.') + parser.add_argument('-r', '--representation', type=str, default='fragprints', + help='str specifying the molecular representation. 
' + 'One of [fingerprints, fragments, fragprints].') + + args = parser.parse_args() + + main(args.path, args.representation) \ No newline at end of file diff --git a/hyperparameter_tuning/rf_tuning.py b/hyperparameter_tuning/rf_tuning.py index d70f1b3..c60a7bd 100644 --- a/hyperparameter_tuning/rf_tuning.py +++ b/hyperparameter_tuning/rf_tuning.py @@ -42,6 +42,8 @@ def main(path, task, representation, use_pca): estim = HyperoptEstimator(regressor=random_forest_regression('my_RF')) estim.fit(X_train, y_train, valid_size=0.1, n_folds=5, cv_shuffle=True) print(estim.best_model()) + with open(f'saved_hypers/RF/tuning_for_{task}', 'w') as f: + print(estim.best_model(), file=f) if __name__ == '__main__': @@ -50,7 +52,7 @@ def main(path, task, representation, use_pca): parser.add_argument('-p', '--path', type=str, default='../dataset/photoswitches.csv', help='Path to the photoswitches.csv file.') - parser.add_argument('-t', '--task', type=str, default='z_iso_n', + parser.add_argument('-t', '--task', type=str, default='e_iso_n', help='str specifying the task. One of [e_iso_pi, z_iso_pi, e_iso_n, z_iso_n].') parser.add_argument('-r', '--representation', type=str, default='fragprints', help='str specifying the molecular representation. ' diff --git a/property_prediction/data_utils.py b/property_prediction/data_utils.py index 925adb1..53c1aba 100644 --- a/property_prediction/data_utils.py +++ b/property_prediction/data_utils.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd -from rdkit.Chem import AllChem, Descriptors, MolFromSmiles +from rdkit.Chem import AllChem, Descriptors, MolFromSmiles, MolToSmiles from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler @@ -166,6 +166,7 @@ def featurise_mols(smiles_list, representation, bond_radius=3, nBits=2048): # descList[115:] contains fragment-based features only # (https://www.rdkit.org/docs/source/rdkit.Chem.Fragments.html) + # NB latest version of RDKit has 7 more features that change indexing. fragments = {d[0]: d[1] for d in Descriptors.descList[115:]} X = np.zeros((len(smiles_list), len(fragments))) @@ -181,7 +182,11 @@ def featurise_mols(smiles_list, representation, bond_radius=3, nBits=2048): # fragprints + # convert to mol and back to smiles in order to make non-isomeric. + rdkit_mols = [MolFromSmiles(smiles) for smiles in smiles_list] + rdkit_smiles = [MolToSmiles(mol, isomericSmiles=False) for mol in rdkit_mols] + rdkit_mols = [MolFromSmiles(smiles) for smiles in rdkit_smiles] X = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=2048) for mol in rdkit_mols] X = np.asarray(X) diff --git a/property_prediction/filter_new_candidates.py b/property_prediction/filter_new_candidates.py new file mode 100644 index 0000000..c184e4f --- /dev/null +++ b/property_prediction/filter_new_candidates.py @@ -0,0 +1,66 @@ +# Copyright Ryan-Rhys Griffiths and Aditya Raymond Thawani 2020 +# Author: Ryan-Rhys Griffiths +""" +Script for filtering purchasable candidates. Criterion for new candidates is that the E isomer pi-pi* +value be between 450-600nm. The separation between the E isomer n-pi* and Z isomer n-pi* is not less than 15nm. +The separation between E isomer pi-pi* and Z isomer pi-pi* is greater than 40nm. +""" + +import numpy as np + +if __name__ == '__main__': + + # Load property predictions for the purchasable molecules for a given model. 
+ + model = 'multioutput_gp' # ['gp', 'rf', 'ensemble', 'multioutput_gp'] + + data_ep_mgp = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_e_iso_pi.txt') + data_en_mgp = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_e_iso_n.txt') + data_zn_mgp = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_z_iso_n.txt') + data_zp_mgp = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_z_iso_pi.txt') + + model = 'gp' # ['gp', 'rf', 'ensemble', 'multioutput_gp'] + + data_ep_gp = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_e_iso_pi.txt') + data_en_gp = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_e_iso_n.txt') + data_zn_gp = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_z_iso_n.txt') + data_zp_gp = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_z_iso_pi.txt') + + model = 'rf' # ['gp', 'rf', 'ensemble', 'multioutput_gp'] + + data_ep_rf = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_e_iso_pi.txt') + data_en_rf = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_e_iso_n.txt') + data_zn_rf = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_z_iso_n.txt') + data_zp_rf = np.loadtxt(f'../property_prediction/predictions/purchasable_{model}_task_z_iso_pi.txt') + + data_ep = (data_ep_gp + data_ep_rf + data_ep_mgp)/3.0 + data_en = (data_en_gp + data_en_rf + data_en_mgp)/3.0 + data_zp = (data_zp_gp + data_zp_rf + data_zp_mgp)/3.0 + data_zn = (data_zn_gp + data_zn_rf + data_zn_mgp)/3.0 + + # Apply the first criterion + + model = 'triple_ensemble' + + data_filtered_crit_one = np.where(data_ep > 450) + data_filtered_crit_two = np.abs(data_en - data_zn) + data_filtered_crit_two = np.where(data_filtered_crit_two < 15) + + # Collect the indices of molecules that satisfy criteria 1 and 2 + + satisfy_two = np.intersect1d(data_filtered_crit_one, data_filtered_crit_two) + + data_filtered_crit_three = np.abs(data_ep - data_zp) + data_filtered_crit_three = np.where(data_filtered_crit_three > 40) + + # Collect the indices of molecules that satisfy criteria 1, 2 and 3. + + satisfy_three = np.intersect1d(satisfy_two, data_filtered_crit_three) + + with open(f'predictions/predicted_smiles/{model}.txt', 'w') as file_to_write: + with open('../dataset/purchasable_switch.csv', 'r') as file_to_read: + i = -1 + for line in file_to_read: + if i in satisfy_three: + file_to_write.write(line) + i += 1 diff --git a/property_prediction/inference_on_new_candidates.py b/property_prediction/inference_on_new_candidates.py index 348a32c..c0397c1 100644 --- a/property_prediction/inference_on_new_candidates.py +++ b/property_prediction/inference_on_new_candidates.py @@ -1,14 +1,18 @@ # Copyright Ryan-Rhys Griffiths and Aditya Raymond Thawani 2020 # Author: Ryan-Rhys Griffiths """ -Script for performing inference on new candidates. +Script for performing inference on new candidates. Criterion for new candidates is that the E isomer pi-pi* +value be between 450-600nm. The separation between the E isomer n-pi* and Z isomer n-pi* is not less than 15nm. +The separation between E isomer pi-pi* and Z isomer pi-pi* is greater than 40nm. 
""" import gpflow from gpflow.mean_functions import Constant from gpflow.utilities import print_summary import numpy as np +import pandas as pd from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import mean_squared_error from sklearn.preprocessing import StandardScaler from data_utils import TaskDataLoader, featurise_mols @@ -20,10 +24,14 @@ # New candidates to predict wavelength values for -candidate_list = ['O=C(OC)C(C=C1)=CC=C1C2=C[N-][N+]3=C(C=CN32)/N=N/C4=CC=CC=C4', - 'O=C(OC)C(C=C1)=CC=C1C2=CN3[N+]([N-]2)=CC=C3/N=N/C4=CC=CC=C4'] +# candidate_list = ['O=C(OC)C(C=C1)=CC=C1C2=C[N-][N+]3=C(C=CN32)/N=N/C4=CC=CC=C4', +# 'O=C(OC)C(C=C1)=CC=C1C2=CN3[N+]([N-]2)=CC=C3/N=N/C4=CC=CC=C4'] + +df = pd.read_csv('../dataset/purchasable_switch.csv') +candidate_list = df['SMILES'].to_list() if __name__ == '__main__': + data_loader = TaskDataLoader(task, path) smiles_list, y_train = data_loader.load_property_data() X_train = featurise_mols(smiles_list, representation) @@ -38,7 +46,7 @@ def objective_closure(): return -m.log_marginal_likelihood() - # We standardise the outputs but leave the inputs unchanged + # We standardise the outputs but leave the inputs unchanged. Equivalent to transform data used in other scripts. y_train = y_train.reshape(-1, 1) y_scaler = StandardScaler() @@ -64,6 +72,11 @@ def objective_closure(): y_pred = y_scaler.inverse_transform(y_pred) y_var = y_scaler.inverse_transform(y_var) + y_pred_train, _ = m.predict_f(X_train) + train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) + train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) + print("Train RMSE: {:.3f}".format(train_rmse)) + print(f'GP {representation} prediction is ') print(y_pred) print(f'GP {representation} variance is') @@ -83,3 +96,11 @@ def objective_closure(): print(f'RF {representation} prediction is ') print(y_pred_rf) + + # Ensembled prediction + + y_pred_av = (y_pred_rf.reshape(-1, 1) + y_pred.reshape(-1, 1)) / 2.0 + + np.savetxt(f'predictions/purchasable_gp_task_{task}.txt', y_pred) + np.savetxt(f'predictions/purchasable_rf_task_{task}.txt', y_pred_rf) + np.savetxt(f'predictions/purchasable_ensemble_task_{task}.txt', y_pred_av) diff --git a/property_prediction/inference_on_new_candidates_multioutput.py b/property_prediction/inference_on_new_candidates_multioutput.py new file mode 100644 index 0000000..9f22e8b --- /dev/null +++ b/property_prediction/inference_on_new_candidates_multioutput.py @@ -0,0 +1,175 @@ +# Copyright Ryan-Rhys Griffiths and Aditya Raymond Thawani 2020 +# Author: Ryan-Rhys Griffiths +""" +Script for performing inference on new candidates. Criterion for new candidates is that the E isomer pi-pi* +value be between 450-600nm. The separation between the E isomer n-pi* and Z isomer n-pi* is not less than 15nm. +The separation between E isomer pi-pi* and Z isomer pi-pi* is greater than 40nm. 
+""" + +import gpflow +from gpflow.ci_utils import ci_niter +from gpflow.mean_functions import Constant +from gpflow.utilities import print_summary +import numpy as np +import pandas as pd +from sklearn.metrics import mean_squared_error + +from data_utils import TaskDataLoader, featurise_mols +from kernels import Tanimoto + +representation = 'fragprints' +task = 'e_iso_pi' +path = '../dataset/photoswitches.csv' +df = pd.read_csv('../dataset/purchasable_switch.csv') +candidate_list = df['SMILES'].to_list() + +if __name__ == '__main__': + + X_test = featurise_mols(candidate_list, representation) + + data_loader_e_iso_pi = TaskDataLoader('e_iso_pi', path) + data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) + data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) + data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) + + smiles_list_e_iso_pi, y_e_iso_pi = data_loader_e_iso_pi.load_property_data() + smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data() + smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() + smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() + + y_e_iso_pi = y_e_iso_pi.reshape(-1, 1) + y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) + y_e_iso_n = y_e_iso_n.reshape(-1, 1) + y_z_iso_n = y_z_iso_n.reshape(-1, 1) + + X_e_iso_pi = featurise_mols(smiles_list_e_iso_pi, representation) + X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) + X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) + X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) + + output_dim = 4 # Number of outputs + rank = 1 # Rank of W + feature_dim = len(X_e_iso_pi[0, :]) + + tanimoto_active_dims = [i for i in range(feature_dim)] # active dims for Tanimoto base kernel. + + if task == 'e_iso_pi': + X_train = X_e_iso_pi + y_train = y_e_iso_pi + elif task == 'z_iso_pi': + X_train = X_z_iso_pi + y_train = y_z_iso_pi + elif task == 'e_iso_n': + X_train = X_e_iso_n + y_train = y_e_iso_n + else: + X_train = X_z_iso_n + y_train = y_z_iso_n + + if task == 'e_iso_pi': + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1), + np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), + np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), + np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1) + X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1) + + # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))), + np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), + np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), + np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) + + elif task == 'z_iso_pi': + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), + np.append(X_train, np.ones((len(X_train), 1)), axis=1), + np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), + np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.ones((len(X_test), 1)), axis=1) + X_train = np.append(X_train, np.ones((len(X_train), 1)), axis=1) + + # Augment the Y data with zeroes, ones, twos and threes that specify a 
likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), + np.hstack((y_train, np.ones_like(y_train))), + np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), + np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) + + elif task == 'e_iso_n': + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), + np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), + np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1), + np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.ones((len(X_test), 1)) * 2, axis=1) + X_train = np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1) + + # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), + np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), + np.hstack((y_train, np.ones_like(y_train) * 2)), + np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) + + else: + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), + np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), + np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), + np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.ones((len(X_test), 1)) * 3, axis=1) + X_train = np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1) + + + # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), + np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), + np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), + np.hstack((y_train, np.ones_like(y_train) * 3)))) + + # Fit GP + + # Base kernel + k = Tanimoto(active_dims=tanimoto_active_dims) + # set_trainable(k.variance, False) + + # Coregion kernel + coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim]) + + # Create product kernel + kern = k * coreg + + # This likelihood switches between Gaussian noise with different variances for each f_i: + lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), + gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()]) + + # now build the GP model as normal + m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, + likelihood=lik) + + # fit the covariance function parameters + maxiter = ci_niter(1000) + gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), + method="L-BFGS-B", ) + print_summary(m) + + # mean and variance GP prediction + + y_pred, y_var = m.predict_f(X_test) + + # Output Standardised RMSE and RMSE on Train Set + + y_pred_train, _ = m.predict_f(X_train) + train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) + print("Train RMSE: {:.3f}".format(train_rmse)) + + print(f'GP {representation} prediction is ') + print(y_pred) + print(f'GP {representation} variance is') + print(y_var) + + np.savetxt(f'predictions/purchasable_multioutput_gp_task_{task}.txt', y_pred) diff --git a/property_prediction/kernels.py 
b/property_prediction/kernels.py index 17d3df0..f370424 100644 --- a/property_prediction/kernels.py +++ b/property_prediction/kernels.py @@ -12,8 +12,16 @@ class Tanimoto(gpflow.kernels.Kernel): - def __init__(self): - super().__init__() + def __init__(self, **kwargs): + """ + :param kwargs: accepts `name` and `active_dims`, which is a list or + slice of indices which controls which columns of X are used (by + default, all columns are used). + """ + for kwarg in kwargs: + if kwarg not in {"name", "active_dims"}: + raise TypeError("Unknown keyword argument:", kwarg) + super().__init__(**kwargs) self.variance = gpflow.Parameter(1.0, transform=positive()) def K(self, X, X2=None): diff --git a/property_prediction/plot_box.py b/property_prediction/plot_box.py new file mode 100644 index 0000000..3d89b01 --- /dev/null +++ b/property_prediction/plot_box.py @@ -0,0 +1,67 @@ +# Copyright Ryan-Rhys Griffiths and Aditya Raymond Thawani 2021 +# Author: Ryan-Rhys Griffiths +""" +Script for plotting marginal box plots. +""" + +import plotly.graph_objects as go + +if __name__ == '__main__': + + fig = go.Figure() + + # Defining x axis + x = ['$\mathrm{\LARGE{E-Isomer}} \: \LARGE{\pi - \pi^*}$', '$\mathrm{\LARGE{E-Isomer}} \: \LARGE{\pi - \pi^*}$', + '$\mathrm{\LARGE{E-Isomer}} \: \LARGE{\pi - \pi^*}$', '$\mathrm{\LARGE{E-Isomer}} \: \LARGE{\pi - \pi^*}$', + '$\mathrm{\LARGE{E-Isomer}} \: \LARGE{n - \pi^*}$', '$\mathrm{\LARGE{E-Isomer}} \: \LARGE{n - \pi^*}$', + '$\mathrm{\LARGE{E-Isomer}} \: \LARGE{n - \pi^*}$', '$\mathrm{\LARGE{E-Isomer}} \: \LARGE{n - \pi^*}$', + '$\mathrm{\LARGE{Z-Isomer}} \: \LARGE{\pi - \pi^*}$', '$\mathrm{\LARGE{Z-Isomer}} \: \LARGE{\pi - \pi^*}$', + '$\mathrm{\LARGE{Z-Isomer}} \: \LARGE{\pi - \pi^*}$', '$\mathrm{\LARGE{Z-Isomer}} \: \LARGE{\pi - \pi^*}$', + '$\mathrm{\LARGE{Z-Isomer}} \: \LARGE{n - \pi^*}$', '$\mathrm{\LARGE{Z-Isomer}} \: \LARGE{n - \pi^*}$', + '$\mathrm{\LARGE{Z-Isomer}} \: \LARGE{n - \pi^*}$', '$\mathrm{\LARGE{Z-Isomer}} \: \LARGE{n - \pi^*}$'] + + fig.add_trace(go.Box( + + # defining y axis in corresponding + # to x-axis + y=[16.4, 17.3, 17.2, 17.4, 8.5, 8.6, 8.9, 9.4, 12.2, 11.5, 11.9, 12.3, 9.0, 8.2, 8.5, 8.9], + x=x, + name='Fragments', + marker_color='paleturquoise', + boxpoints='outliers' + )) + + fig.add_trace(go.Box( + y=[15.5, 15.2, 14.4, 17.9, 7.3, 8.4, 8.5, 10.1, 10.1, 9.8, 9.6, 10.0, 6.6, 6.9, 6.9, 7.2], + x=x, + name='Morgan', + marker_color='darksalmon', + boxpoints='outliers' + )) + + fig.add_trace(go.Box( + y=[13.9, 13.3, 13.1, 18.1, 7.7, 8.2, 8.3, 8.6, 10.0, 9.8, 8.8, 10.4, 6.8, 7.1, 7.1, 7.0], + x=x, + name='Fragprints', + marker_color='sienna', + boxpoints='outliers' + )) + + fig.update_layout( + + # group together boxes of the different + # traces for each value of x + yaxis_title="MAE (nm)", + font=dict( + family="roman", + size=40), + boxmode='group', + legend=dict(font=dict(family="roman", size=50, color="black")), + boxgap=0.2, + boxgroupgap=0.1, + yaxis=dict(tickfont=dict(size=30)), + xaxis=dict(tickfont=dict(size=50) + ) + ) + fig.update_xaxes(title_font_family="Arial") + fig.show() diff --git a/property_prediction/predict_with_GPR.py b/property_prediction/predict_with_GPR.py index 3ab608b..393916c 100644 --- a/property_prediction/predict_with_GPR.py +++ b/property_prediction/predict_with_GPR.py @@ -82,7 +82,7 @@ def objective_closure(): # Optimise the kernel variance and noise level by the marginal likelihood opt = gpflow.optimizers.Scipy() - opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100)) + 
opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=10000)) print_summary(m) # mean and variance GP prediction @@ -197,7 +197,7 @@ def objective_closure(): help='Path to the photoswitches.csv file.') parser.add_argument('-t', '--task', type=str, default='e_iso_pi', help='str specifying the task. One of [e_iso_pi, z_iso_pi, e_iso_n, z_iso_n].') - parser.add_argument('-r', '--representation', type=str, default='fragprints', + parser.add_argument('-r', '--representation', type=str, default='fragments', help='str specifying the molecular representation. ' 'One of [fingerprints, fragments, fragprints].') parser.add_argument('-pca', '--use_pca', type=bool, default=False, diff --git a/property_prediction/predict_with_ensemble.py b/property_prediction/predict_with_ensemble.py new file mode 100644 index 0000000..eebf920 --- /dev/null +++ b/property_prediction/predict_with_ensemble.py @@ -0,0 +1,170 @@ +# Copyright Ryan-Rhys Griffiths and Aditya Raymond Thawani 2020 +# Author: Ryan-Rhys Griffiths +""" +Script for training an ensemble model to predict properties in the photoswitch dataset using GP+RF ensemble. +""" + +import argparse + +import gpflow +from gpflow.mean_functions import Constant +from gpflow.utilities import print_summary +from matplotlib import pyplot as plt +import numpy as np +from sklearn.ensemble import RandomForestRegressor +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error + +from data_utils import transform_data, TaskDataLoader, featurise_mols +from kernels import Tanimoto + + +def main(path, task, representation, use_pca, n_trials, test_set_size, use_rmse_conf): + """ + :param path: str specifying path to dataset. + :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] + :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] + :param use_pca: bool. If True apply PCA to perform Principal Components Regression. + :param n_trials: int specifying number of random train/test splits to use + :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set + :param use_rmse_conf: bool specifying whether to compute the rmse confidence-error curves or the mae confidence- + error curves. True is the option for rmse. 
+ """ + + data_loader = TaskDataLoader(task, path) + smiles_list, y = data_loader.load_property_data() + X = featurise_mols(smiles_list, representation) + + # If True we perform Principal Components Regression + + if use_pca: + n_components = 100 + else: + n_components = None + + # We define the Gaussian Process Regression Model using the Tanimoto kernel + + m = None + + def objective_closure(): + return -m.log_marginal_likelihood() + + r2_list = [] + rmse_list = [] + mae_list = [] + + # We pre-allocate arrays for plotting confidence-error curves + + _, _, _, y_test = train_test_split(X, y, test_size=test_set_size) # To get test set size + n_test = len(y_test) + + rmse_confidence_list = np.zeros((n_trials, n_test)) + mae_confidence_list = np.zeros((n_trials, n_test)) + + print('\nBeginning training loop...') + + for i in range(0, n_trials): + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=i) + + y_train = y_train.reshape(-1, 1) + y_test = y_test.reshape(-1, 1) + + # We standardise the outputs but leave the inputs unchanged + + _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test, n_components=n_components, use_pca=use_pca) + + X_train = X_train.astype(np.float64) + X_test = X_test.astype(np.float64) + + k = Tanimoto() + m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1) + + # e_iso_pi best params: + # {'learner': RandomForestRegressor(max_features=0.9348473830061558, n_estimators=381, + # n_jobs=1, random_state=2, verbose=False)} + # e_iso_n best params: + # {'learner': RandomForestRegressor(bootstrap=False, max_features=0.09944870853556087, + # min_samples_leaf=3, n_estimators=1295, n_jobs=1, + # random_state=0, verbose=False)} + # z_iso_pi best params: + # {'learner': RandomForestRegressor(max_depth=4, max_features=0.33072121415416944, + # n_estimators=2755, n_jobs=1, random_state=2, + # verbose=False)} + # z_iso_n best params: + # {'learner': RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1, + # random_state=3, verbose=False)} + + regr_rf = RandomForestRegressor(max_features=None, n_estimators=892, n_jobs=1, + random_state=3, verbose=False) + regr_rf.fit(X_train, y_train) + + # Optimise the kernel variance and noise level by the marginal likelihood + + opt = gpflow.optimizers.Scipy() + opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100)) + print_summary(m) + + # mean and variance GP prediction and RF prediction + + y_pred, y_var = m.predict_f(X_test) + y_pred_rf = regr_rf.predict(X_test) + y_pred_av = (y_pred + y_pred_rf.reshape(-1, 1)) / 2.0 + y_pred = y_scaler.inverse_transform(y_pred_av) + y_test = y_scaler.inverse_transform(y_test) + + # Output Standardised RMSE and RMSE on Train Set + + y_pred_train, _ = m.predict_f(X_train) + y_pred_train_rf = regr_rf.predict(X_train) + y_pred_train = (y_pred_train + y_pred_train_rf.reshape(-1, 1)) / 2.0 + train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) + train_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(y_train), y_scaler.inverse_transform(y_pred_train))) + print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan)) + print("Train RMSE: {:.3f}".format(train_rmse)) + + score = r2_score(y_test, y_pred) + rmse = np.sqrt(mean_squared_error(y_test, y_pred)) + mae = mean_absolute_error(y_test, y_pred) + + print("\nR^2: {:.3f}".format(score)) + print("RMSE: {:.3f}".format(rmse)) + print("MAE: {:.3f}".format(mae)) + + 
r2_list.append(score) + rmse_list.append(rmse) + mae_list.append(mae) + + r2_list = np.array(r2_list) + rmse_list = np.array(rmse_list) + mae_list = np.array(mae_list) + + print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list)))) + print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list)))) + print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list)))) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + + parser.add_argument('-p', '--path', type=str, default='../dataset/photoswitches.csv', + help='Path to the photoswitches.csv file.') + parser.add_argument('-t', '--task', type=str, default='z_iso_n', + help='str specifying the task. One of [e_iso_pi, z_iso_pi, e_iso_n, z_iso_n].') + parser.add_argument('-r', '--representation', type=str, default='fragprints', + help='str specifying the molecular representation. ' + 'One of [fingerprints, fragments, fragprints].') + parser.add_argument('-pca', '--use_pca', type=bool, default=False, + help='If True apply PCA to perform Principal Components Regression.') + parser.add_argument('-n', '--n_trials', type=int, default=20, + help='int specifying number of random train/test splits to use') + parser.add_argument('-ts', '--test_set_size', type=float, default=0.2, + help='float in range [0, 1] specifying fraction of dataset to use as test set') + parser.add_argument('-rms', '--use_rmse_conf', type=bool, default=True, + help='bool specifying whether to compute the rmse confidence-error curves or the mae ' + 'confidence-error curves. True is the option for rmse.') + + args = parser.parse_args() + + main(args.path, args.task, args.representation, args.use_pca, args.n_trials, args.test_set_size, args.use_rmse_conf) diff --git a/property_prediction/predict_with_multioutput_GPR.py b/property_prediction/predict_with_multioutput_GPR.py new file mode 100644 index 0000000..88ea4da --- /dev/null +++ b/property_prediction/predict_with_multioutput_GPR.py @@ -0,0 +1,245 @@ +# Copyright Ryan-Rhys Griffiths and Aditya Raymond Thawani 2020 +# Author: Ryan-Rhys Griffiths +""" +Script for training a model to predict properties in the photoswitch dataset using multioutput +Gaussian Process Regression with the intrinsic model of coregionalisation (ICM) (Bonilla and Williams, 2008). +""" + +import argparse + +import gpflow +from gpflow.ci_utils import ci_niter +from gpflow.mean_functions import Constant +from gpflow.utilities import print_summary, set_trainable +from matplotlib import pyplot as plt +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error + +from data_utils import TaskDataLoader, featurise_mols +from kernels import Tanimoto + + +def main(path, task, representation, use_pca, n_trials, test_set_size): + """ + Train a multioutput GP simultaneously on all tasks of the photoswitch dataset. + + :param path: str specifying path to dataset. + :param task: str specifying the task. One of ['e_iso_pi', 'z_iso_pi', 'e_iso_n', 'z_iso_n'] + :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints'] + :param use_pca: bool. If True apply PCA to perform Principal Components Regression. 
+ :param n_trials: int specifying number of random train/test splits to use + :param test_set_size: float in range [0, 1] specifying fraction of dataset to use as test set + """ + + # If True we perform Principal Components Regression + + if use_pca: + n_components = 100 + else: + n_components = None + + data_loader_e_iso_pi = TaskDataLoader('e_iso_pi', path) + data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path) + data_loader_e_iso_n = TaskDataLoader('e_iso_n', path) + data_loader_z_iso_n = TaskDataLoader('z_iso_n', path) + + smiles_list_e_iso_pi, y_e_iso_pi = data_loader_e_iso_pi.load_property_data() + smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data() + smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data() + smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data() + + y_e_iso_pi = y_e_iso_pi.reshape(-1, 1) + y_z_iso_pi = y_z_iso_pi.reshape(-1, 1) + y_e_iso_n = y_e_iso_n.reshape(-1, 1) + y_z_iso_n = y_z_iso_n.reshape(-1, 1) + + X_e_iso_pi = featurise_mols(smiles_list_e_iso_pi, representation) + X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation) + X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation) + X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation) + + output_dim = 4 # Number of outputs + rank = 1 # Rank of W + feature_dim = len(X_e_iso_pi[0, :]) + + tanimoto_active_dims = [i for i in range(feature_dim)] # active dims for Tanimoto base kernel. + + r2_list = [] + rmse_list = [] + mae_list = [] + + print('\nBeginning training loop...') + + for i in range(0, n_trials): + + if task == 'e_iso_pi': + X_task = X_e_iso_pi + y_task = y_e_iso_pi + elif task == 'z_iso_pi': + X_task = X_z_iso_pi + y_task = y_z_iso_pi + elif task == 'e_iso_n': + X_task = X_e_iso_n + y_task = y_e_iso_n + else: + X_task = X_z_iso_n + y_task = y_z_iso_n + + X_train, X_test, y_train, y_test = train_test_split(X_task, y_task, test_size=test_set_size, random_state=i) + + X_train = X_train.astype(np.float64) + X_test = X_test.astype(np.float64) + + if task == 'e_iso_pi': + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1), + np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), + np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), + np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1) + X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1) + + # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))), + np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), + np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), + np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) + + y_test = np.hstack((y_test, np.zeros_like(y_test))) + + elif task == 'z_iso_pi': + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), + np.append(X_train, np.ones((len(X_train), 1)), axis=1), + np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), + np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.ones((len(X_test), 1)), axis=1) + X_train = np.append(X_train, np.ones((len(X_train), 1)), axis=1) + + # Augment the 
Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), + np.hstack((y_train, np.ones_like(y_train))), + np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), + np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) + + y_test = np.hstack((y_test, np.ones_like(y_test))) + + elif task == 'e_iso_n': + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), + np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), + np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1), + np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.ones((len(X_test), 1)) * 2, axis=1) + X_train = np.append(X_train, np.ones((len(X_train), 1)) * 2, axis=1) + + # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), + np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), + np.hstack((y_train, np.ones_like(y_train) * 2)), + np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3)))) + + y_test = np.hstack((y_test, np.ones_like(y_test) * 2)) + + else: + # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension + X_augmented = np.vstack((np.append(X_e_iso_pi, np.zeros((len(X_e_iso_pi), 1)), axis=1), + np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1), + np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1), + np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1))) + + X_test = np.append(X_test, np.ones((len(X_test), 1)) * 3, axis=1) + X_train = np.append(X_train, np.ones((len(X_train), 1)) * 3, axis=1) + + # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods + Y_augmented = np.vstack((np.hstack((y_e_iso_pi, np.zeros_like(y_e_iso_pi))), + np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))), + np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)), + np.hstack((y_train, np.ones_like(y_train) * 3)))) + + y_test = np.hstack((y_test, np.ones_like(y_test) * 3)) + + # Base kernel + k = Tanimoto(active_dims=tanimoto_active_dims) + #set_trainable(k.variance, False) + + # Coregion kernel + coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim]) + + # Create product kernel + kern = k * coreg + + # This likelihood switches between Gaussian noise with different variances for each f_i: + lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(), + gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()]) + + # now build the GP model as normal + m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, likelihood=lik) + + # fit the covariance function parameters + maxiter = ci_niter(1000) + gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B",) + print_summary(m) + + # mean and variance GP prediction + + y_pred, y_var = m.predict_f(X_test) + + # Output Standardised RMSE and RMSE on Train Set + + y_pred_train, _ = m.predict_f(X_train) + train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train)) + train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train)) + print("\nStandardised Train 
RMSE: {:.3f}".format(train_rmse_stan)) + print("Train RMSE: {:.3f}".format(train_rmse)) + + score = r2_score(y_test[:, 0], y_pred) + rmse = np.sqrt(mean_squared_error(y_test[:, 0], y_pred)) + mae = mean_absolute_error(y_test[:, 0], y_pred) + + print("\nR^2: {:.3f}".format(score)) + print("RMSE: {:.3f}".format(rmse)) + print("MAE: {:.3f}".format(mae)) + + r2_list.append(score) + rmse_list.append(rmse) + mae_list.append(mae) + + B = coreg.output_covariance().numpy() + print("B =", B) + _ = plt.imshow(B) + + r2_list = np.array(r2_list) + rmse_list = np.array(rmse_list) + mae_list = np.array(mae_list) + + print("\nmean R^2: {:.4f} +- {:.4f}".format(np.mean(r2_list), np.std(r2_list)/np.sqrt(len(r2_list)))) + print("mean RMSE: {:.4f} +- {:.4f}".format(np.mean(rmse_list), np.std(rmse_list)/np.sqrt(len(rmse_list)))) + print("mean MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list)))) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + + parser.add_argument('-p', '--path', type=str, default='../dataset/photoswitches.csv', + help='Path to the photoswitches.csv file.') + parser.add_argument('-t', '--task', type=str, default='z_iso_n', + help='str specifying the task. One of [e_iso_pi, z_iso_pi, e_iso_n, z_iso_n].') + parser.add_argument('-r', '--representation', type=str, default='fragprints', + help='str specifying the molecular representation. ' + 'One of [fingerprints, fragments, fragprints].') + parser.add_argument('-pca', '--use_pca', type=bool, default=False, + help='If True apply PCA to perform Principal Components Regression.') + parser.add_argument('-n', '--n_trials', type=int, default=20, + help='int specifying number of random train/test splits to use') + parser.add_argument('-ts', '--test_set_size', type=float, default=0.2, + help='float in range [0, 1] specifying fraction of dataset to use as test set') + + args = parser.parse_args() + + main(args.path, args.task, args.representation, args.use_pca, args.n_trials, args.test_set_size) diff --git a/property_prediction/predict_with_sklearn_gpr.py b/property_prediction/predict_with_sklearn_gpr.py index 3653794..021c59f 100644 --- a/property_prediction/predict_with_sklearn_gpr.py +++ b/property_prediction/predict_with_sklearn_gpr.py @@ -61,7 +61,15 @@ def main(path, task, representation, use_pca, n_trials, test_set_size): # mean GP prediction + X_test = np.tile(X_test, (10000, 1)) + + import time + start = time.time() + y_pred = gpr.predict(X_test, return_std=False) + + end = time.time() + print(f'time elapsed is {end - start}') y_pred = y_scaler.inverse_transform(y_pred) y_test = y_scaler.inverse_transform(y_test)