Skip to content

Commit

Permalink
Merge branch 'dev' of github.com:CDDLeiden/QSPRpred into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-sicho committed Mar 22, 2024
2 parents 0ea02ea + 33ed1b4 commit a98097f
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 45 deletions.
4 changes: 2 additions & 2 deletions qsprpred/data/chem/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from rdkit import Chem, DataStructs
from rdkit.SimDivFilters import rdSimDivPickers

from .scaffolds import Murcko, Scaffold
from .scaffolds import BemisMurckoRDKit, Scaffold
from .. import MoleculeTable
from ..descriptors.fingerprints import Fingerprint, MorganFP
from ...logs import logger
Expand Down Expand Up @@ -89,7 +89,7 @@ class ScaffoldClusters(MoleculeClusters):
scaffold (Scaffold): scaffold generator
"""

def __init__(self, scaffold: Scaffold = Murcko()):
def __init__(self, scaffold: Scaffold = BemisMurckoRDKit()):
super().__init__()
self.scaffold = scaffold

Expand Down
29 changes: 7 additions & 22 deletions qsprpred/data/chem/scaffolds.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,14 @@ def supportsParallel(self) -> bool:
return True


class Murcko(Scaffold):
"""Class for calculating Murcko scaffolds of a given molecule."""
class BemisMurckoRDKit(Scaffold):
"""Class for calculating Murcko scaffolds of a given molecule
using the default implementation in RDKit. If you want an implementation
closer to the original paper, see the `BemisMurcko` class.
def __call__(self, mols, props, *args, **kwargs):
"""
Calculate the Murcko scaffold for a molecule as implemented
in RDKit.
Args:
mol: SMILES as `str` or an instance of `Mol`
"""

Returns:
SMILES of the Murcko scaffold as `str`
"""
def __call__(self, mols, props, *args, **kwargs):
res = []
for mol in mols:
mol = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol
Expand All @@ -55,7 +49,7 @@ def __call__(self, mols, props, *args, **kwargs):
return pd.Series(res, index=props[self.idProp])

def __str__(self):
return "Murcko"
return "BemisMurckoRDKit"


class BemisMurcko(Scaffold):
Expand Down Expand Up @@ -114,15 +108,6 @@ def findTerminalAtoms(mol):
return res

def __call__(self, mols, props, *args, **kwargs):
"""
Calculate the Bemis-Murcko scaffold for a molecule.
Args:
mol: SMILES as `str` or an instance of `Mol`
Returns:
SMILES of the Bemis-Murcko scaffold as `str`
"""
res = []
for mol in mols:
mol = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol
Expand Down
4 changes: 2 additions & 2 deletions qsprpred/data/chem/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from ... import TargetTasks
from ...data import QSPRDataset
from ...data.chem.scaffolds import Murcko, BemisMurcko
from ...data.chem.scaffolds import BemisMurckoRDKit, BemisMurcko
from ...utils.testing.base import QSPRTestCase
from ...utils.testing.path_mixins import DataSetsPathMixIn

Expand All @@ -19,7 +19,7 @@ def setUp(self):

@parameterized.expand(
[
("Murcko", Murcko()),
("Murcko", BemisMurckoRDKit()),
("BemisMurcko", BemisMurcko()),
("BemisMurckoCSK", BemisMurcko(True, True)),
("BemisMurckoJustCSK", BemisMurcko(False, True)),
Expand Down
8 changes: 4 additions & 4 deletions qsprpred/data/sampling/splits.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
RandomClusters,
ScaffoldClusters,
)
from ...data.chem.scaffolds import Murcko, Scaffold
from ...data.chem.scaffolds import BemisMurckoRDKit, Scaffold
from ...data.tables.base import MoleculeDataTable, DataSetDependant
from ...data.tables.qspr import QSPRDataset
from ...logs import logger
Expand Down Expand Up @@ -476,7 +476,7 @@ class ScaffoldSplit(GBMTDataSplit):
def __init__(
self,
dataset: QSPRDataset | None = None,
scaffold: Scaffold = Murcko(),
scaffold: Scaffold = BemisMurckoRDKit(),
test_fraction: float = 0.1,
n_folds: int = 1,
custom_test_list: list | None = None,
Expand Down Expand Up @@ -552,10 +552,10 @@ def setSeed(self, seed: int | None):
self.seed = seed
if hasattr(self.clustering, "seed"):
self.clustering.seed = seed

def getSeed(self):
"""Get the seed for this instance.
Returns:
int: the seed for this instance or None if no seed is set.
"""
Expand Down
6 changes: 3 additions & 3 deletions qsprpred/data/sampling/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
FPSimilarityLeaderPickerClusters,
FPSimilarityMaxMinClusters,
)
from ...data.chem.scaffolds import Murcko, BemisMurcko
from ...data.chem.scaffolds import BemisMurckoRDKit, BemisMurcko
from ...data.sampling.folds import FoldsFromDataSplit
from ...data.sampling.splits import ManualSplit
from ...utils.testing.base import QSPRTestCase
Expand Down Expand Up @@ -109,13 +109,13 @@ def testTemporalSplit(self, multitask):

@parameterized.expand(
[
(False, Murcko(), None),
(False, BemisMurckoRDKit(), None),
(
False,
BemisMurcko(use_csk=True),
["ScaffoldSplit_000", "ScaffoldSplit_001"],
),
(True, Murcko(), None),
(True, BemisMurckoRDKit(), None),
]
)
def testScaffoldSplit(self, multitask, scaffold, custom_test_list):
Expand Down
21 changes: 12 additions & 9 deletions qsprpred/data_CLI.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from qsprpred.data.chem.clustering import (
FPSimilarityMaxMinClusters,
FPSimilarityLeaderPickerClusters,
)
from qsprpred.data.descriptors.fingerprints import (
MorganFP,
RDKitMACCSFP,
Expand All @@ -24,10 +28,6 @@
RDKitFP,
AvalonFP,
)
from qsprpred.data.chem.clustering import (
FPSimilarityMaxMinClusters,
FPSimilarityLeaderPickerClusters
)
from qsprpred.data.descriptors.sets import (
DrugExPhyschem,
PredictorDesc,
Expand All @@ -49,7 +49,7 @@
)
from qsprpred.data.tables.qspr import QSPRDataset
from qsprpred.tasks import TargetTasks
from .data.chem.scaffolds import Murcko
from .data.chem.scaffolds import BemisMurckoRDKit
from .extra.gpu.models.dnn import DNNModel
from .logs.utils import backup_files, enable_file_logger
from .models.scikit_learn import SklearnModel
Expand Down Expand Up @@ -363,7 +363,7 @@ def QSPR_dataprep(args):
else None,
"imputer": SimpleImputer(strategy=args.imputation[prop])
if prop in args.imputation
else None
else None,
}
)
dataset_name = (
Expand Down Expand Up @@ -391,7 +391,7 @@ def QSPR_dataprep(args):
if args.split == "scaffold":
split = ScaffoldSplit(
test_fraction=args.split_fraction,
scaffold=Murcko(),
scaffold=BemisMurckoRDKit(),
dataset=mydataset,
)
elif args.split == "time":
Expand Down Expand Up @@ -525,12 +525,15 @@ def QSPR_dataprep(args):
os.makedirs(args.output_dir)

# get a list of all the folders in the output directory
folders = [f for f in os.listdir(args.output_dir) if os.path.isdir(f"{args.output_dir}/{f}")]
folders = [
f
for f in os.listdir(args.output_dir)
if os.path.isdir(f"{args.output_dir}/{f}")
]

# remove folders that start with backup
folders = [f for f in folders if not f.startswith("backup")]


if not args.skip_backup:
backup_msg = backup_files(
args.output_dir,
Expand Down
6 changes: 3 additions & 3 deletions tutorials/basics/data/data_splitting.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1299,10 +1299,10 @@
}
],
"source": [
"from qsprpred.data.chem.scaffolds import Murcko\n",
"from qsprpred.data.chem.scaffolds import BemisMurckoRDKit\n",
"from qsprpred.data import ScaffoldSplit\n",
"\n",
"split = ScaffoldSplit(n_folds=10, scaffold=Murcko())\n",
"split = ScaffoldSplit(n_folds=10, scaffold=BemisMurckoRDKit())\n",
"for fold, (X_train, X_test, y_train, y_test, train_index, test_index) in enumerate(\n",
" dataset.iterFolds(split)):\n",
" print_cv_split(fold, X_train, X_test, y_train, y_test, train_index, test_index)"
Expand Down Expand Up @@ -1805,7 +1805,7 @@
"source": [
"CrossValAssessor(\n",
" scoring=\"roc_auc\",\n",
" split=ScaffoldSplit(n_folds=10, scaffold=Murcko())\n",
" split=ScaffoldSplit(n_folds=10, scaffold=BemisMurckoRDKit())\n",
")(model, dataset)"
]
},
Expand Down

0 comments on commit a98097f

Please sign in to comment.