Renamed photoswitch csv file.

leojklarner · Nov 2, 2023 · 9504a61 · 9504a61
1 parent 86b09cd
commit 9504a61
Show file tree

Hide file tree

Showing 6 changed files with 63 additions and 11 deletions.
diff --git a/benchmarks/run_benchmark.py b/benchmarks/run_benchmark.py
@@ -34,7 +34,7 @@
     "Lipophilicity": "Lipophilicity",
 }
 dataset_paths = {
-    "Photoswitch": "../data/property_prediction/photoswitches.csv",
+    "Photoswitch": "../data/property_prediction/Photoswitch.csv",
     "ESOL": "../data/property_prediction/ESOL.csv",
     "FreeSolv": "../data/property_prediction/FreeSolv.csv",
     "Lipophilicity": "../data/property_prediction/Lipophilicity.csv",
@@ -55,7 +55,7 @@ def main(
         n_trials: Number of random train/test splits for the datasets. Default is 20
         test_set_size: Size of the test set for evaluation. Default is 0.2
         dataset_name: Benchmark dataset to use. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity']
-        dataset_path: Benchmark dataset path. One of ['../data/property_prediction/photoswitches.csv',
+        dataset_path: Benchmark dataset path. One of ['../data/property_prediction/Photoswitch.csv',
                                                        ../data/property_prediction/ESOL.csv',
                                                        '../data/property_prediction/FreeSolv.csv',
                                                        '../data/property_prediction/Lipophilicity.csv']
@@ -283,7 +283,7 @@ def main(
         "--path",
         type=str,
         default="../data/property_prediction/Lipophilicity.csv",
-        help="Path to the dataset file. One of [../data/property_prediction/photoswitches.csv, "
+        help="Path to the dataset file. One of [../data/property_prediction/Photoswitch.csv, "
         "../data/property_prediction/ESOL.csv, "
         "../data/property_prediction/FreeSolv.csv, "
         "../data/property_prediction/Lipophilicity.csv]",

diff --git a/data/property_prediction/photoswitches.csv → data/property_prediction/Photoswitch.csv b/data/property_prediction/photoswitches.csv → data/property_prediction/Photoswitch.csv
diff --git a/gauche/dataloader/mol_prop.py b/gauche/dataloader/mol_prop.py
@@ -8,7 +8,7 @@
 import pandas as pd
 
 from gauche.dataloader import DataLoader
-from rdkit.Chem import MolFromSmiles
+from rdkit.Chem import MolFromSmiles, MolToSmiles
 
 
 class DataLoaderMP(DataLoader):
@@ -77,9 +77,9 @@ def featurize(
         """
 
         valid_representations = [
-            "fingerprints",
+            "ecfp_fingerprints",
             "fragments",
-            "fragprints",
+            "ecfp_fragprints",
             "graphs",
             "bag_of_smiles",
             "bag_of_selfies",
@@ -98,12 +98,14 @@ def featurize(
 
             self.features = fragments(self.features)
 
-        elif representation == "fragprints":
+        elif representation == "ecfp_fragprints":
             from gauche.representations.fingerprints import ecfp_fingerprints, fragments
 
             self.features = np.concatenate(
                 (
-                    fingerprints(self.features, bond_radius=bond_radius, nBits=nBits),
+                    ecfp_fingerprints(
+                        self.features, bond_radius=bond_radius, nBits=nBits
+                    ),
                     fragments(self.features),
                 ),
                 axis=1,
@@ -183,6 +185,12 @@ def load_benchmark(self, benchmark, path):
                 df[benchmarks[benchmark]["labels"]].dropna().to_numpy().reshape(-1, 1)
             )
 
+            # make SMILES canonical
+            self.features = [
+                MolToSmiles(MolFromSmiles(smiles), isomericSmiles=False)
+                for smiles in self.features
+            ]
+
         else:
             raise ValueError(
                 f"The specified benchmark choice ({benchmark}) is not a valid option. "

diff --git a/notebooks/pretrained_kernel.py b/notebooks/pretrained_kernel.py
@@ -136,7 +136,7 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs):
 
 # Load the Photoswitch dataset
 loader = DataLoaderMP()
-loader.load_benchmark("Photoswitch", "../data/property_prediction/photoswitches.csv")
+loader.load_benchmark("Photoswitch", "../data/property_prediction/Photoswitch.csv")
 
 # We use the fragprints representations (a concatenation of Morgan fingerprints and RDKit fragment features)
 y = loader.labels

diff --git a/tests/test_dataloaders/test_molprop_loader.py b/tests/test_dataloaders/test_molprop_loader.py
@@ -0,0 +1,44 @@
+"""
+Pytest-based unit tests for the molecular property 
+prediction data loader.
+"""
+
+import pytest
+
+import os
+import itertools
+from gauche.dataloader import DataLoaderMP
+
+
+@pytest.mark.parametrize(
+    "dataset, representation",
+    [
+        (d, f)
+        for d, f in itertools.product(
+            ["Photoswitch", "ESOL", "FreeSolv", "Lipophilicity"],
+            [
+                "ecfp_fingerprints",
+                "fragments",
+                "ecfp_fragprints",
+                "molecular_graphs",
+                "bag_of_smiles",
+                "bag_of_selfies",
+                "mqn",
+            ],
+        )
+    ],
+)
+def test_benchmark_loading(dataset, representation):
+    """
+    Test if all benchmarks can be loaded with all representations.
+    """
+
+    dataset_root = os.path.abspath(
+        os.path.join("..", "..", "data", "property_prediction")
+    )
+
+    dataloader = DataLoaderMP()
+    dataloader.load_benchmark(
+        dataset, path=os.path.join(dataset_root, dataset + ".csv")
+    )
+    dataloader.featurize(representation)
diff --git a/tests/test_gp.py b/tests/test_gp.py
@@ -49,7 +49,7 @@ class TestReproducibility(unittest.TestCase):
     def setUp(self):
         torch.manual_seed(42); np.random.seed(42)
         loader = DataLoaderMP()
-        loader.load_benchmark("Photoswitch", "../data/property_prediction/photoswitches.csv")
+        loader.load_benchmark("Photoswitch", "../data/property_prediction/Photoswitch.csv")
 
         loader.featurize('fragprints')
         X = loader.features
@@ -120,7 +120,7 @@ def forward(self, x):
 class TestGraphKernel(unittest.TestCase):
     def setUp(self):
         loader = DataLoaderMP()
-        loader.load_benchmark("Photoswitch", "../data/property_prediction/photoswitches.csv")
+        loader.load_benchmark("Photoswitch", "../data/property_prediction/Photoswitch.csv")
         bond_types = {1.0: 'S', 1.5: 'A', 2.0: 'D', 3.0: 'O'}
 
         def to_graph(mol):