Merge pull request #39 from Ryan-Rhys/ryan/dev

Ryan/dev
Ryan-Rhys · Aug 28, 2021 · 8af7e60 · 8af7e60
2 parents 2296f1e + 95010f9
commit 8af7e60
Show file tree

Hide file tree

Showing 16 changed files with 1,156 additions and 22 deletions.
diff --git a/dft_comparison/dft_comparison_with_GPR.py b/dft_comparison/dft_comparison_with_GPR.py
@@ -16,15 +16,15 @@
 from kernels import Tanimoto
 
 
-def main(path, path_to_dft_dataset, task, representation, theory_level):
+def main(path, path_to_dft_dataset, representation, theory_level):
     """
     :param path: str specifying path to photoswitches.csv file.
     :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
-    :param task: str specifying the task. e_iso_pi only supported task for the TD-DFT comparison.
     :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
     :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0']
     """
 
+    task = 'e_iso_pi'  # e_iso_pi only task supported for TD-DFT comparison
     data_loader = TaskDataLoader(task, path)
     smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset)
 
@@ -145,8 +145,6 @@ def objective_closure():
                         help='Path to the photoswitches.csv file.')
     parser.add_argument('-pd', '--path_to_dft_dataset', type=str, default='../dataset/dft_comparison.csv',
                         help='str giving path to dft_comparison.csv file')
-    parser.add_argument('-t', '--task', type=str, default='e_iso_pi',
-                        help='str specifying the task. e_iso_pi only task supported for the TD-DFT comparison.')
     parser.add_argument('-r', '--representation', type=str, default='fragprints',
                         help='str specifying the molecular representation. '
                              'One of [fingerprints, fragments, fragprints].')
@@ -155,4 +153,4 @@ def objective_closure():
 
     args = parser.parse_args()
 
-    main(args.path, args.path_to_dft_dataset, args.task, args.representation, args.theory_level)
+    main(args.path, args.path_to_dft_dataset, args.representation, args.theory_level)
diff --git a/dft_comparison/dft_comparison_with_multioutput_GPR.py b/dft_comparison/dft_comparison_with_multioutput_GPR.py
@@ -0,0 +1,204 @@
+# Author: Ryan-Rhys Griffiths
+"""
+Property prediction comparison against DFT error. 99 molecules with DFT-computed values at the CAM-B3LYP level of
+theory and 114 molecules with DFT-computed values at the PBE0 level of theory.
+"""
+
+import argparse
+
+import gpflow
+from gpflow.ci_utils import ci_niter
+from gpflow.mean_functions import Constant
+from gpflow.utilities import print_summary
+import numpy as np
+from sklearn.metrics import mean_squared_error
+
+from data_utils import TaskDataLoader, featurise_mols
+from kernels import Tanimoto
+
+
+def main(path, path_to_dft_dataset, representation, theory_level):
+    """
+    :param path: str specifying path to photoswitches.csv file.
+    :param path_to_dft_dataset: str specifying path to dft_comparison.csv file.
+    :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
+    :param theory_level: str giving the level of theory to compare against - CAM-B3LYP or PBE0 ['CAM-B3LYP', 'PBE0']
+    """
+
+    task = 'e_iso_pi'  # e_iso_pi only task supported for TD-DFT comparison
+    data_loader = TaskDataLoader(task, path)
+    smiles_list, _, pbe0_vals, cam_vals, experimental_vals = data_loader.load_dft_comparison_data(path_to_dft_dataset)
+
+    X = featurise_mols(smiles_list, representation)
+
+    # Keep only non-duplicate entries because we're not considering effects of solvent
+
+    non_duplicate_indices = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
+    X = X[non_duplicate_indices, :]
+    experimental_vals = experimental_vals[non_duplicate_indices]
+    non_dup_pbe0 = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
+    non_dup_cam = np.array([i for i, smiles in enumerate(smiles_list) if smiles not in smiles_list[:i]])
+    pbe0_vals = pbe0_vals[non_dup_pbe0]
+    cam_vals = cam_vals[non_dup_cam]
+
+    # molecules with dft values to be split into train/test
+    if theory_level == 'CAM-B3LYP':
+        X_with_dft = np.delete(X, np.argwhere(np.isnan(cam_vals)), axis=0)
+        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(cam_vals)))
+        # DFT values for the CAM-B3LYP level of theory
+        dft_vals = np.delete(cam_vals, np.argwhere(np.isnan(cam_vals)))
+        # molecules with no dft vals must go into the training set.
+        X_no_dft = np.delete(X, np.argwhere(~np.isnan(cam_vals)), axis=0)
+        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(cam_vals)))
+    else:
+        X_with_dft = np.delete(X, np.argwhere(np.isnan(pbe0_vals)), axis=0)
+        y_with_dft = np.delete(experimental_vals, np.argwhere(np.isnan(pbe0_vals)))
+        # DFT values for the PBE0 level of theory
+        dft_vals = np.delete(pbe0_vals, np.argwhere(np.isnan(pbe0_vals)))
+        # molecules with no dft vals must go into the training set.
+        X_no_dft = np.delete(X, np.argwhere(~np.isnan(pbe0_vals)), axis=0)
+        y_no_dft = np.delete(experimental_vals, np.argwhere(~np.isnan(pbe0_vals)))
+
+    # Load in the other property values for multitask learning. e_iso_pi is a always the task in this instance.
+
+    data_loader_z_iso_pi = TaskDataLoader('z_iso_pi', path)
+    data_loader_e_iso_n = TaskDataLoader('e_iso_n', path)
+    data_loader_z_iso_n = TaskDataLoader('z_iso_n', path)
+
+    smiles_list_z_iso_pi, y_z_iso_pi = data_loader_z_iso_pi.load_property_data()
+    smiles_list_e_iso_n, y_e_iso_n = data_loader_e_iso_n.load_property_data()
+    smiles_list_z_iso_n, y_z_iso_n = data_loader_z_iso_n.load_property_data()
+
+    y_z_iso_pi = y_z_iso_pi.reshape(-1, 1)
+    y_e_iso_n = y_e_iso_n.reshape(-1, 1)
+    y_z_iso_n = y_z_iso_n.reshape(-1, 1)
+
+    X_z_iso_pi = featurise_mols(smiles_list_z_iso_pi, representation)
+    X_e_iso_n = featurise_mols(smiles_list_e_iso_n, representation)
+    X_z_iso_n = featurise_mols(smiles_list_z_iso_n, representation)
+
+    output_dim = 4  # Number of outputs
+    rank = 1  # Rank of W
+    feature_dim = len(X_no_dft[0, :])
+
+    tanimoto_active_dims = [i for i in range(feature_dim)]  # active dims for Tanimoto base kernel.
+
+    mae_list = []
+    dft_mae_list = []
+
+    # We define the Gaussian Process optimisation objective
+
+    m = None
+
+    def objective_closure():
+        return -m.log_marginal_likelihood()
+
+    print('\nBeginning training loop...')
+
+    for i in range(len(y_with_dft)):
+
+        X_train = np.delete(X_with_dft, i, axis=0)
+        y_train = np.delete(y_with_dft, i)
+        X_test = X_with_dft[i].reshape(1, -1)
+        y_test = y_with_dft[i]
+        dft_test = dft_vals[i]
+
+        X_train = np.concatenate((X_train, X_no_dft))
+        y_train = np.concatenate((y_train, y_no_dft))
+        y_train = y_train.reshape(-1, 1)
+        y_test = y_test.reshape(-1, 1)
+
+        X_train = X_train.astype(np.float64)
+        X_test = X_test.astype(np.float64)
+
+        # Augment the input with zeroes, ones, twos, threes to indicate the required output dimension
+        X_augmented = np.vstack((np.append(X_train, np.zeros((len(X_train), 1)), axis=1),
+                                 np.append(X_z_iso_pi, np.ones((len(X_z_iso_pi), 1)), axis=1),
+                                 np.append(X_e_iso_n, np.ones((len(X_e_iso_n), 1)) * 2, axis=1),
+                                 np.append(X_z_iso_n, np.ones((len(X_z_iso_n), 1)) * 3, axis=1)))
+
+        X_test = np.append(X_test, np.zeros((len(X_test), 1)), axis=1)
+        X_train = np.append(X_train, np.zeros((len(X_train), 1)), axis=1)
+
+        # Augment the Y data with zeroes, ones, twos and threes that specify a likelihood from the list of likelihoods
+        Y_augmented = np.vstack((np.hstack((y_train, np.zeros_like(y_train))),
+                                 np.hstack((y_z_iso_pi, np.ones_like(y_z_iso_pi))),
+                                 np.hstack((y_e_iso_n, np.ones_like(y_e_iso_n) * 2)),
+                                 np.hstack((y_z_iso_n, np.ones_like(y_z_iso_n) * 3))))
+
+        y_test = np.hstack((y_test, np.zeros_like(y_test)))
+
+        # Base kernel
+        k = Tanimoto(active_dims=tanimoto_active_dims)
+        #set_trainable(k.variance, False)
+
+        # Coregion kernel
+        coreg = gpflow.kernels.Coregion(output_dim=output_dim, rank=rank, active_dims=[feature_dim])
+
+        # Create product kernel
+        kern = k * coreg
+
+        # This likelihood switches between Gaussian noise with different variances for each f_i:
+        lik = gpflow.likelihoods.SwitchedLikelihood([gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian(),
+                                                     gpflow.likelihoods.Gaussian(), gpflow.likelihoods.Gaussian()])
+
+        # now build the GP model as normal
+        m = gpflow.models.VGP((X_augmented, Y_augmented), mean_function=Constant(np.mean(y_train[:, 0])), kernel=kern, likelihood=lik)
+
+        # fit the covariance function parameters
+        maxiter = ci_niter(1000)
+        gpflow.optimizers.Scipy().minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=maxiter), method="L-BFGS-B",)
+        print_summary(m)
+
+        # Output Standardised RMSE and RMSE on Train Set
+
+        y_pred_train, _ = m.predict_f(X_train)
+        train_rmse_stan = np.sqrt(mean_squared_error(y_train, y_pred_train))
+        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
+        print("\nStandardised Train RMSE: {:.3f}".format(train_rmse_stan))
+        print("Train RMSE: {:.3f}".format(train_rmse))
+
+        # mean and variance GP prediction
+
+        y_pred, y_var = m.predict_f(X_test)
+
+        # Output MAE for this trial
+
+        mae = abs(y_test[:, 0] - y_pred)
+
+        print("MAE: {}".format(mae))
+
+        # Store values in order to compute the mean and standard error of the statistics across trials
+
+        mae_list.append(mae)
+
+        # DFT prediction scores on the same trial
+
+        dft_mae = abs(y_test[:, 0] - dft_test)
+
+        dft_mae_list.append(dft_mae)
+
+    mae_list = np.array(mae_list)
+    dft_mae_list = np.array(dft_mae_list)
+
+    print("\nmean GP-Tanimoto MAE: {:.4f} +- {:.4f}\n".format(np.mean(mae_list), np.std(mae_list)/np.sqrt(len(mae_list))))
+    print("mean {} MAE: {:.4f} +- {:.4f}\n".format(theory_level, np.mean(dft_mae_list), np.std(dft_mae_list)/np.sqrt(len(dft_mae_list))))
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-p', '--path', type=str, default='../dataset/photoswitches.csv',
+                        help='Path to the photoswitches.csv file.')
+    parser.add_argument('-pd', '--path_to_dft_dataset', type=str, default='../dataset/dft_comparison.csv',
+                        help='str giving path to dft_comparison.csv file')
+    parser.add_argument('-r', '--representation', type=str, default='fragprints',
+                        help='str specifying the molecular representation. '
+                             'One of [fingerprints, fragments, fragprints].')
+    parser.add_argument('-th', '--theory_level', type=str, default='PBE0',
+                        help='level of theory to compare against - CAM-B3LYP or PBE0 [CAM-B3LYP, PBE0]')
+
+    args = parser.parse_args()
+
+    main(args.path, args.path_to_dft_dataset, args.representation, args.theory_level)
diff --git a/dft_comparison/dft_k_fold_comparison_with_GPR.py b/dft_comparison/dft_k_fold_comparison_with_GPR.py
@@ -73,7 +73,7 @@ def objective_closure():
 
     for i in range(5):
 
-        X_train, X_test, y_train, y_test = train_test_split(X_with_dft, y_with_dft, test_size=0.6, random_state=i)
+        X_train, X_test, y_train, y_test = train_test_split(X_with_dft, y_with_dft, test_size=0.5, random_state=i)
         X_dud, _, _, dft_test = train_test_split(X_with_dft, dft_vals, test_size=0.6, random_state=i)
 
         X_train = np.concatenate((X_train, X_no_dft))

diff --git a/human_comparison/human_performance_comparison.py b/human_comparison/human_performance_comparison.py
@@ -1,7 +1,7 @@
 # Copyright Ryan-Rhys Griffiths and Aditya Raymond Thawani 2020
 # Author: Ryan-Rhys Griffiths
 """
-Script for comparing against human performance on a set of 5 molecules.
+Script for comparing against human performance on a set of 5 molecules with Tanimoto GP.
 """
 
 import argparse
@@ -16,13 +16,13 @@
 from kernels import Tanimoto
 
 
-def main(path, task, representation):
+def main(path, representation):
     """
     :param path: str specifying path to dataset.
-    :param task: str specifying the task. Always e_iso_pi in the case of the human performance comparison
     :param representation: str specifying the molecular representation. One of ['fingerprints, 'fragments', 'fragprints']
     """
 
+    task = 'e_iso_pi'  # Always e_iso_pi for human performance comparison
     data_loader = TaskDataLoader(task, path)
     smiles_list, y = data_loader.load_property_data()
     X = featurise_mols(smiles_list, representation)
@@ -112,12 +112,10 @@ def objective_closure():
 
     parser.add_argument('-p', '--path', type=str, default='../dataset/photoswitches.csv',
                         help='Path to the photoswitches.csv file.')
-    parser.add_argument('-t', '--task', type=str, default='e_iso_pi',
-                        help='str specifying the task. Always e_iso_pi in the case of the human performance comparison')
     parser.add_argument('-r', '--representation', type=str, default='fragprints',
                         help='str specifying the molecular representation. '
                              'One of [fingerprints, fragments, fragprints].')
 
     args = parser.parse_args()
 
-    main(args.path, args.task, args.representation)
+    main(args.path, args.representation)