Skip to content

Commit 33ed1b4

Browse files
authored
Merge pull request #11 from CDDLeiden/refactor/clarify_scaffold_names
Rename scaffold class names to be more descriptive
2 parents 4d37019 + 23d2129 commit 33ed1b4

File tree

7 files changed

+33
-45
lines changed

7 files changed

+33
-45
lines changed

qsprpred/data/chem/clustering.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from rdkit import Chem, DataStructs
77
from rdkit.SimDivFilters import rdSimDivPickers
88

9-
from .scaffolds import Murcko, Scaffold
9+
from .scaffolds import BemisMurckoRDKit, Scaffold
1010
from .. import MoleculeTable
1111
from ..descriptors.fingerprints import Fingerprint, MorganFP
1212
from ...logs import logger
@@ -89,7 +89,7 @@ class ScaffoldClusters(MoleculeClusters):
8989
scaffold (Scaffold): scaffold generator
9090
"""
9191

92-
def __init__(self, scaffold: Scaffold = Murcko()):
92+
def __init__(self, scaffold: Scaffold = BemisMurckoRDKit()):
9393
super().__init__()
9494
self.scaffold = scaffold
9595

qsprpred/data/chem/scaffolds.py

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -33,20 +33,14 @@ def supportsParallel(self) -> bool:
3333
return True
3434

3535

36-
class Murcko(Scaffold):
37-
"""Class for calculating Murcko scaffolds of a given molecule."""
36+
class BemisMurckoRDKit(Scaffold):
37+
"""Class for calculating Murcko scaffolds of a given molecule
38+
using the default implementation in RDKit. If you want an implementation
39+
closer to the original paper, see the `BemisMurcko` class.
3840
39-
def __call__(self, mols, props, *args, **kwargs):
40-
"""
41-
Calculate the Murcko scaffold for a molecule as implemented
42-
in RDKit.
43-
44-
Args:
45-
mol: SMILES as `str` or an instance of `Mol`
41+
"""
4642

47-
Returns:
48-
SMILES of the Murcko scaffold as `str`
49-
"""
43+
def __call__(self, mols, props, *args, **kwargs):
5044
res = []
5145
for mol in mols:
5246
mol = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol
@@ -55,7 +49,7 @@ def __call__(self, mols, props, *args, **kwargs):
5549
return pd.Series(res, index=props[self.idProp])
5650

5751
def __str__(self):
58-
return "Murcko"
52+
return "BemisMurckoRDKit"
5953

6054

6155
class BemisMurcko(Scaffold):
@@ -114,15 +108,6 @@ def findTerminalAtoms(mol):
114108
return res
115109

116110
def __call__(self, mols, props, *args, **kwargs):
117-
"""
118-
Calculate the Bemis-Murcko scaffold for a molecule.
119-
120-
Args:
121-
mol: SMILES as `str` or an instance of `Mol`
122-
123-
Returns:
124-
SMILES of the Bemis-Murcko scaffold as `str`
125-
"""
126111
res = []
127112
for mol in mols:
128113
mol = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol

qsprpred/data/chem/tests.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from ... import TargetTasks
55
from ...data import QSPRDataset
6-
from ...data.chem.scaffolds import Murcko, BemisMurcko
6+
from ...data.chem.scaffolds import BemisMurckoRDKit, BemisMurcko
77
from ...utils.testing.base import QSPRTestCase
88
from ...utils.testing.path_mixins import DataSetsPathMixIn
99

@@ -19,7 +19,7 @@ def setUp(self):
1919

2020
@parameterized.expand(
2121
[
22-
("Murcko", Murcko()),
22+
("Murcko", BemisMurckoRDKit()),
2323
("BemisMurcko", BemisMurcko()),
2424
("BemisMurckoCSK", BemisMurcko(True, True)),
2525
("BemisMurckoJustCSK", BemisMurcko(False, True)),

qsprpred/data/sampling/splits.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
RandomClusters,
1919
ScaffoldClusters,
2020
)
21-
from ...data.chem.scaffolds import Murcko, Scaffold
21+
from ...data.chem.scaffolds import BemisMurckoRDKit, Scaffold
2222
from ...data.tables.base import MoleculeDataTable, DataSetDependant
2323
from ...data.tables.qspr import QSPRDataset
2424
from ...logs import logger
@@ -476,7 +476,7 @@ class ScaffoldSplit(GBMTDataSplit):
476476
def __init__(
477477
self,
478478
dataset: QSPRDataset | None = None,
479-
scaffold: Scaffold = Murcko(),
479+
scaffold: Scaffold = BemisMurckoRDKit(),
480480
test_fraction: float = 0.1,
481481
n_folds: int = 1,
482482
custom_test_list: list | None = None,
@@ -552,10 +552,10 @@ def setSeed(self, seed: int | None):
552552
self.seed = seed
553553
if hasattr(self.clustering, "seed"):
554554
self.clustering.seed = seed
555-
555+
556556
def getSeed(self):
557557
"""Get the seed for this instance.
558-
558+
559559
Returns:
560560
int: the seed for this instance or None if no seed is set.
561561
"""

qsprpred/data/sampling/tests.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
FPSimilarityLeaderPickerClusters,
1717
FPSimilarityMaxMinClusters,
1818
)
19-
from ...data.chem.scaffolds import Murcko, BemisMurcko
19+
from ...data.chem.scaffolds import BemisMurckoRDKit, BemisMurcko
2020
from ...data.sampling.folds import FoldsFromDataSplit
2121
from ...data.sampling.splits import ManualSplit
2222
from ...utils.testing.base import QSPRTestCase
@@ -109,13 +109,13 @@ def testTemporalSplit(self, multitask):
109109

110110
@parameterized.expand(
111111
[
112-
(False, Murcko(), None),
112+
(False, BemisMurckoRDKit(), None),
113113
(
114114
False,
115115
BemisMurcko(use_csk=True),
116116
["ScaffoldSplit_000", "ScaffoldSplit_001"],
117117
),
118-
(True, Murcko(), None),
118+
(True, BemisMurckoRDKit(), None),
119119
]
120120
)
121121
def testScaffoldSplit(self, multitask, scaffold, custom_test_list):

qsprpred/data_CLI.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
from sklearn.impute import SimpleImputer
1616
from sklearn.preprocessing import StandardScaler
1717

18+
from qsprpred.data.chem.clustering import (
19+
FPSimilarityMaxMinClusters,
20+
FPSimilarityLeaderPickerClusters,
21+
)
1822
from qsprpred.data.descriptors.fingerprints import (
1923
MorganFP,
2024
RDKitMACCSFP,
@@ -24,10 +28,6 @@
2428
RDKitFP,
2529
AvalonFP,
2630
)
27-
from qsprpred.data.chem.clustering import (
28-
FPSimilarityMaxMinClusters,
29-
FPSimilarityLeaderPickerClusters
30-
)
3131
from qsprpred.data.descriptors.sets import (
3232
DrugExPhyschem,
3333
PredictorDesc,
@@ -49,7 +49,7 @@
4949
)
5050
from qsprpred.data.tables.qspr import QSPRDataset
5151
from qsprpred.tasks import TargetTasks
52-
from .data.chem.scaffolds import Murcko
52+
from .data.chem.scaffolds import BemisMurckoRDKit
5353
from .extra.gpu.models.dnn import DNNModel
5454
from .logs.utils import backup_files, enable_file_logger
5555
from .models.scikit_learn import SklearnModel
@@ -363,7 +363,7 @@ def QSPR_dataprep(args):
363363
else None,
364364
"imputer": SimpleImputer(strategy=args.imputation[prop])
365365
if prop in args.imputation
366-
else None
366+
else None,
367367
}
368368
)
369369
dataset_name = (
@@ -391,7 +391,7 @@ def QSPR_dataprep(args):
391391
if args.split == "scaffold":
392392
split = ScaffoldSplit(
393393
test_fraction=args.split_fraction,
394-
scaffold=Murcko(),
394+
scaffold=BemisMurckoRDKit(),
395395
dataset=mydataset,
396396
)
397397
elif args.split == "time":
@@ -525,12 +525,15 @@ def QSPR_dataprep(args):
525525
os.makedirs(args.output_dir)
526526

527527
# get a list of all the folders in the output directory
528-
folders = [f for f in os.listdir(args.output_dir) if os.path.isdir(f"{args.output_dir}/{f}")]
528+
folders = [
529+
f
530+
for f in os.listdir(args.output_dir)
531+
if os.path.isdir(f"{args.output_dir}/{f}")
532+
]
529533

530534
# remove folders that start with backup
531535
folders = [f for f in folders if not f.startswith("backup")]
532536

533-
534537
if not args.skip_backup:
535538
backup_msg = backup_files(
536539
args.output_dir,

tutorials/basics/data/data_splitting.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,10 +1299,10 @@
12991299
}
13001300
],
13011301
"source": [
1302-
"from qsprpred.data.chem.scaffolds import Murcko\n",
1302+
"from qsprpred.data.chem.scaffolds import BemisMurckoRDKit\n",
13031303
"from qsprpred.data import ScaffoldSplit\n",
13041304
"\n",
1305-
"split = ScaffoldSplit(n_folds=10, scaffold=Murcko())\n",
1305+
"split = ScaffoldSplit(n_folds=10, scaffold=BemisMurckoRDKit())\n",
13061306
"for fold, (X_train, X_test, y_train, y_test, train_index, test_index) in enumerate(\n",
13071307
" dataset.iterFolds(split)):\n",
13081308
" print_cv_split(fold, X_train, X_test, y_train, y_test, train_index, test_index)"
@@ -1805,7 +1805,7 @@
18051805
"source": [
18061806
"CrossValAssessor(\n",
18071807
" scoring=\"roc_auc\",\n",
1808-
" split=ScaffoldSplit(n_folds=10, scaffold=Murcko())\n",
1808+
" split=ScaffoldSplit(n_folds=10, scaffold=BemisMurckoRDKit())\n",
18091809
")(model, dataset)"
18101810
]
18111811
},

0 commit comments

Comments
 (0)