Skip to content

Commit

Permalink
Renamed photoswitch csv file.
Browse files Browse the repository at this point in the history
  • Loading branch information
leojklarner committed Nov 2, 2023
1 parent 86b09cd commit 9504a61
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 11 deletions.
6 changes: 3 additions & 3 deletions benchmarks/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
"Lipophilicity": "Lipophilicity",
}
dataset_paths = {
"Photoswitch": "../data/property_prediction/photoswitches.csv",
"Photoswitch": "../data/property_prediction/Photoswitch.csv",
"ESOL": "../data/property_prediction/ESOL.csv",
"FreeSolv": "../data/property_prediction/FreeSolv.csv",
"Lipophilicity": "../data/property_prediction/Lipophilicity.csv",
Expand All @@ -55,7 +55,7 @@ def main(
n_trials: Number of random train/test splits for the datasets. Default is 20
test_set_size: Size of the test set for evaluation. Default is 0.2
dataset_name: Benchmark dataset to use. One of ['Photoswitch', 'ESOL', 'FreeSolv', 'Lipophilicity']
dataset_path: Benchmark dataset path. One of ['../data/property_prediction/photoswitches.csv',
dataset_path: Benchmark dataset path. One of ['../data/property_prediction/Photoswitch.csv',
'../data/property_prediction/ESOL.csv',
'../data/property_prediction/FreeSolv.csv',
'../data/property_prediction/Lipophilicity.csv']
Expand Down Expand Up @@ -283,7 +283,7 @@ def main(
"--path",
type=str,
default="../data/property_prediction/Lipophilicity.csv",
help="Path to the dataset file. One of [../data/property_prediction/photoswitches.csv, "
help="Path to the dataset file. One of [../data/property_prediction/Photoswitch.csv, "
"../data/property_prediction/ESOL.csv, "
"../data/property_prediction/FreeSolv.csv, "
"../data/property_prediction/Lipophilicity.csv]",
Expand Down
File renamed without changes.
18 changes: 13 additions & 5 deletions gauche/dataloader/mol_prop.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd

from gauche.dataloader import DataLoader
from rdkit.Chem import MolFromSmiles
from rdkit.Chem import MolFromSmiles, MolToSmiles


class DataLoaderMP(DataLoader):
Expand Down Expand Up @@ -77,9 +77,9 @@ def featurize(
"""

valid_representations = [
"fingerprints",
"ecfp_fingerprints",
"fragments",
"fragprints",
"ecfp_fragprints",
"graphs",
"bag_of_smiles",
"bag_of_selfies",
Expand All @@ -98,12 +98,14 @@ def featurize(

self.features = fragments(self.features)

elif representation == "fragprints":
elif representation == "ecfp_fragprints":
from gauche.representations.fingerprints import ecfp_fingerprints, fragments

self.features = np.concatenate(
(
fingerprints(self.features, bond_radius=bond_radius, nBits=nBits),
ecfp_fingerprints(
self.features, bond_radius=bond_radius, nBits=nBits
),
fragments(self.features),
),
axis=1,
Expand Down Expand Up @@ -183,6 +185,12 @@ def load_benchmark(self, benchmark, path):
df[benchmarks[benchmark]["labels"]].dropna().to_numpy().reshape(-1, 1)
)

# make SMILES canonical
self.features = [
MolToSmiles(MolFromSmiles(smiles), isomericSmiles=False)
for smiles in self.features
]

else:
raise ValueError(
f"The specified benchmark choice ({benchmark}) is not a valid option. "
Expand Down
2 changes: 1 addition & 1 deletion notebooks/pretrained_kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs):

# Load the Photoswitch dataset
loader = DataLoaderMP()
loader.load_benchmark("Photoswitch", "../data/property_prediction/photoswitches.csv")
loader.load_benchmark("Photoswitch", "../data/property_prediction/Photoswitch.csv")

# We use the fragprints representation (a concatenation of Morgan fingerprints and RDKit fragment features)
y = loader.labels
Expand Down
44 changes: 44 additions & 0 deletions tests/test_dataloaders/test_molprop_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Pytest-based unit tests for the molecular property
prediction data loader.
"""

import pytest

import os
import itertools
from gauche.dataloader import DataLoaderMP


@pytest.mark.parametrize(
    "dataset, representation",
    list(
        itertools.product(
            ["Photoswitch", "ESOL", "FreeSolv", "Lipophilicity"],
            [
                "ecfp_fingerprints",
                "fragments",
                "ecfp_fragprints",
                # The loader's valid_representations list names the graph
                # featurisation "graphs", so the test uses that key.
                "graphs",
                "bag_of_smiles",
                "bag_of_selfies",
                "mqn",
            ],
        )
    ),
)
def test_benchmark_loading(dataset, representation):
    """
    Test that every benchmark can be loaded with every representation.

    Args:
        dataset: Benchmark name passed to ``DataLoaderMP.load_benchmark``;
            it is also the stem of the CSV file that is loaded from the
            ``data/property_prediction`` directory.
        representation: Representation name passed to
            ``DataLoaderMP.featurize``.
    """

    # Anchor the data directory to this file rather than to the current
    # working directory, so the test finds the CSV files no matter where
    # pytest is invoked from (repository root, tests/, or this folder).
    dataset_root = os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            "..",
            "..",
            "data",
            "property_prediction",
        )
    )

    dataloader = DataLoaderMP()
    dataloader.load_benchmark(
        dataset, path=os.path.join(dataset_root, dataset + ".csv")
    )
    dataloader.featurize(representation)
4 changes: 2 additions & 2 deletions tests/test_gp.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class TestReproducibility(unittest.TestCase):
def setUp(self):
torch.manual_seed(42); np.random.seed(42)
loader = DataLoaderMP()
loader.load_benchmark("Photoswitch", "../data/property_prediction/photoswitches.csv")
loader.load_benchmark("Photoswitch", "../data/property_prediction/Photoswitch.csv")

loader.featurize('fragprints')
X = loader.features
Expand Down Expand Up @@ -120,7 +120,7 @@ def forward(self, x):
class TestGraphKernel(unittest.TestCase):
def setUp(self):
loader = DataLoaderMP()
loader.load_benchmark("Photoswitch", "../data/property_prediction/photoswitches.csv")
loader.load_benchmark("Photoswitch", "../data/property_prediction/Photoswitch.csv")
bond_types = {1.0: 'S', 1.5: 'A', 2.0: 'D', 3.0: 'O'}

def to_graph(mol):
Expand Down

0 comments on commit 9504a61

Please sign in to comment.