Merge branch 'enhancement/descriptor_prefixes' into 'dev'

Minor refactoring of descriptor sets to make them more user-friendly. Closes #88. See merge request cdd/QSPRpred!179
CDDLeiden · Mar 28, 2024 · e466e54 · e466e54
2 parents b981855 + 7a8f5be
commit e466e54
Show file tree

Hide file tree

Showing 13 changed files with 187 additions and 84 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,45 +1,39 @@
 # Change Log
 
-From v3.0.1 to v3.0.2
+From v3.0.2 to v3.1.0
 
 ## Fixes
 
-- Fixed a bug where an attached standardizer would be refit when calling
-  `QSPRModel.predictMols` with `use_applicability_domain=True`.
-- Fixed a bug with `use_applicability_domain=True` in `QSPRModel.predictMols`
-  where an error would be raised if there were invalid molecules in the input.
-- Fixed a bug where dataset type was not properly set to numeric
-  in `MlChemADWrapper.contains`
 - Fixed a bug in `QSPRDataset` where property transformations were not applied.
 - Fixed a bug where an attached standardizer would be refit when calling
   `QSPRModel.predictMols` with `use_applicability_domain=True`.
 - Fixed random seed not set in `FoldsFromDataSplit.iterFolds` for `ClusterSplit`.
-- Fixed a bug where class ratios were shuffled in the `RatioDistributionAlgorithm`.
 
 ## Changes
 
-- The module containing the sole model base class (`QSPRModel`) was renamed
-  from `models` to `model`.
-- Restrictions on `numpy` versions were removed to allow for more flexibility in
-  package installations. However, the `BorutaFilter` feature selection method does not
-  function with `numpy` versions 1.24.0 and above. Therefore, this functionality now
-  requires a downgrade to `numpy` version 1.23.0 or lower. This was reflected in the
-  documentation and `numpy` itself outputs a reasonable error message if the version is
-  incompatible.
-- Data type in `MlChemADWrapper` is now set to `float64` by default, instead
-  of `float32`.
-- Saving of models after hyperparameter optimization was improved to ensure parameters
-  are always propagated to the underlying estimator as well.
+- Renamed `PandasDataTable.transform` to `PandasDataTable.transformProperties`.
+- Moved `imputeProperties`, `dropEmptyProperties` and `hasProperty` from `MoleculeTable`
+  to `PandasDataTable`.
+- Removed `getProperties`, `addProperty` and `removeProperty`; use the `PandasDataTable`
+  methods directly instead.
+- Since the way descriptors are saved has changed, this release is incompatible with
+  data sets and models saved by previous versions. However, these can be easily
+  converted to the new format by prefixing the descriptor names in the old descriptor
+  tables with the name of their descriptor set (`<set name>_<descriptor name>`).
+  Feel free to contact us if you require assistance with this.
 
 ## New Features
 
-- The `DataFrameDescriptorSet` class was extended to allow more flexibility when joining
-  custom descriptor sets.
-- Added the `prepMols` method to `DescriptorSet` to allow separated customization of
-  molecule preparation before descriptor calculation.
-- The package can now be installed from the PyPI repository 🐍📦.
-- New argument (`refit_optimal`) was added to `HyperparameterOptimization.optimize()`
-  method to make refitting of the model with optimal parameters easier.
+- Descriptors are now saved with prefixes to indicate the descriptor sets. This reduces
+  the chance of name collisions when using multiple descriptor sets.
+- Added new methods to `MoleculeTable` and `QSPRDataset` for more fine-grained control
+  of clearing, dropping and restoring of descriptor sets calculated for the dataset.
+    - `dropDescriptorSets` will drop descriptors associated with the given descriptor
+      sets.
+    - `dropDescriptors` will drop individual descriptors by their (prefixed) names,
+      regardless of the descriptor set they belong to.
+    - All drop actions are restorable with `restoreDescriptorSets` unless the sets were
+      fully removed from the data set with `full_removal=True` in `dropDescriptorSets`.
 
 ## Removed Features
 

diff --git a/qsprpred/data/descriptors/fingerprints.py b/qsprpred/data/descriptors/fingerprints.py
@@ -72,8 +72,9 @@ def __call__(
         values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs)
         values = values[:, self.usedBits]
         values = values.astype(self.dtype)
-        df = pd.DataFrame(values, index=props[self.idProp])
-        df.columns = self.descriptors
+        df = pd.DataFrame(
+            values, index=props[self.idProp], columns=self.transformToFeatureNames()
+        )
         return df
 
 

diff --git a/qsprpred/data/descriptors/sets.py b/qsprpred/data/descriptors/sets.py
@@ -135,9 +135,17 @@ def __call__(
         Returns:
             data frame of descriptor values of shape (n_mols, n_descriptors)
         """
+        mols = self.iterMols(mols, to_list=True)
         values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs)
-        df = pd.DataFrame(values, index=props[self.idProp])
-        df.columns = self.descriptors
+        # check if descriptors have unique names
+        assert len(set(self.descriptors)) == len(
+            self.descriptors
+        ), f"Descriptor names are not unique for set '{self}': {self.descriptors}"
+        df = pd.DataFrame(
+            values,
+            index=props[self.idProp],
+            columns=self.transformToFeatureNames(),
+        )
         try:
             df = df.astype(self.dtype)
         except ValueError as exp:
@@ -153,6 +161,9 @@ def __call__(
             )
         return df
 
+    def transformToFeatureNames(self):
+        return [f"{self}_{x}" for x in self.descriptors]
+
     @abstractmethod
     def getDescriptors(
         self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs

diff --git a/qsprpred/data/descriptors/tests.py b/qsprpred/data/descriptors/tests.py
@@ -27,6 +27,49 @@ def setUp(self):
         super().setUp()
         self.setUpPaths()
 
+    @staticmethod
+    def getDescList():
+        return [MorganFP(radius=3, nBits=256), DrugExPhyschem()]
+
+    def testDropping(self):
+        """Test dropping of descriptors from data sets."""
+        dataset = self.createLargeTestDataSet("TestDropping")
+        # test dropping of all sets
+        dataset.addDescriptors(self.getDescList())
+        full_len = sum(len(x) for x in dataset.descriptorSets)
+        self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len)
+        dataset.dropDescriptorSets(dataset.descriptorSets)
+        self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0)
+        dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=True)
+        self.assertEqual(len(dataset.descriptors), 0)
+        dataset.addDescriptors(self.getDescList())
+        dataset.dropDescriptorSets([str(x) for x in self.getDescList()])
+        self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0)
+        dataset.dropDescriptorSets(
+            [str(x) for x in self.getDescList()], full_removal=True
+        )
+        self.assertEqual(len(dataset.descriptors), 0)
+        # test dropping of single set
+        dataset.addDescriptors(self.getDescList())
+        self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len)
+        dataset.dropDescriptorSets([dataset.descriptorSets[0]])
+        self.assertEqual(
+            dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1])
+        )
+        dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=True)
+        dataset.addDescriptors(self.getDescList())
+        dataset.dropDescriptorSets([str(dataset.descriptorSets[0])], full_removal=True)
+        self.assertEqual(
+            dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1])
+        )
+        # test restoring of dropped sets
+        dataset.addDescriptors(self.getDescList())
+        self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len)
+        dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=False)
+        self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0)
+        dataset.restoreDescriptorSets(dataset.descriptorSets)
+        self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len)
+
     @parameterized.expand([(None, None), (1, None), (2, None), (4, 50)])
     def testSwitching(self, n_cpu, chunk_size):
         """Test if the feature calculator can be switched to a new dataset."""
@@ -163,14 +206,16 @@ def setUp(self):
         super().setUp()
         self.setUpPaths()
 
-    @parameterized.expand([
-        (
-            f"{desc_set}_{TargetTasks.REGRESSION}",
-            desc_set,
-            [{"name": "CL", "task": TargetTasks.REGRESSION}],
-        )
-        for desc_set in DataSetsPathMixIn.getAllDescriptors()
-    ])
+    @parameterized.expand(
+        [
+            (
+                f"{desc_set}_{TargetTasks.REGRESSION}",
+                desc_set,
+                [{"name": "CL", "task": TargetTasks.REGRESSION}],
+            )
+            for desc_set in DataSetsPathMixIn.getAllDescriptors()
+        ]
+    )
     def testDescriptorsAll(self, _, desc_set, target_props):
         """Tests all available descriptor sets.
 

diff --git a/qsprpred/data/processing/tests.py b/qsprpred/data/processing/tests.py
@@ -156,7 +156,7 @@ def setUp(self):
         self.descriptors = self.dataset.featureNames
 
     def recalculateWithMultiIndex(self):
-        self.dataset.dropDescriptors(self.dataset.descriptorSets)
+        self.dataset.dropDescriptorSets(self.dataset.descriptorSets, full_removal=True)
         self.df_descriptors["ID_COL1"] = (
             self.dataset.getProperty(self.dataset.idProp)
             .apply(lambda x: x.split("_")[0])
@@ -178,13 +178,12 @@ def recalculateWithMultiIndex(self):
             ]
         )
 
-    # def testDefaultDescriptorAdd(self):
-    #     """Test adding without index columns."""
-    #     # TODO: issue 88 needs to be solved for this to work
-    #     self.dataset.nJobs = 1
-    #     df_new = self.dataset.getFeatures(concat=True).copy()
-    #     calc = DataFrameDescriptorSet(df_new, suffix="new_df_desc")
-    #     self.dataset.addDescriptors([calc])
+    def testDefaultDescriptorAdd(self):
+        """Test adding without index columns."""
+        self.dataset.nJobs = 1
+        df_new = self.dataset.getFeatures(concat=True).copy()
+        calc = DataFrameDescriptorSet(df_new, suffix="new_df_desc")
+        self.dataset.addDescriptors([calc])
 
     @parameterized.expand(
         [

diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py
@@ -107,11 +107,10 @@ def getDescriptorNames(self, active_only=True):
                 the current descriptor set. Defaults to `True`.
 
         """
-        all_descs = self.df.columns[~self.df.columns.isin(self.indexCols)].tolist()
         if active_only:
-            return [x for x in all_descs if x in self.calculator.descriptors]
+            return self.calculator.transformToFeatureNames()
         else:
-            return all_descs
+            return self.df.columns[~self.df.columns.isin(self.indexCols)].tolist()
 
     def fillMissing(self, fill_value, names):
         """Fill missing values in the descriptor table.
@@ -138,8 +137,20 @@ def keepDescriptors(self, descriptors: list[str]) -> list[str]:
         """
         all_descs = self.getDescriptorNames(active_only=False)
         to_keep = set(all_descs) & set(descriptors)
+        prefix = str(self.calculator) + "_"
         self.calculator.descriptors = [
-            x for x in self.calculator.descriptors if x in to_keep
+            x.replace(prefix, "", 1)  # remove prefix
+            for x in self.calculator.transformToFeatureNames()
+            if x in to_keep
+        ]
+        return self.getDescriptorNames()
+
+    def restoreDescriptors(self) -> list[str]:
+        """Restore all descriptors to active in this set."""
+        all_descs = self.getDescriptorNames(active_only=False)
+        prefix = str(self.calculator) + "_"
+        self.calculator.descriptors = [
+            x.replace(prefix, "", 1) for x in all_descs  # remove prefix
         ]
         return self.getDescriptorNames()
 
@@ -646,17 +657,37 @@ def generateDescriptorDataSetName(self, ds_set: str | DescriptorSet):
         """Generate a descriptor set name from a descriptor set."""
         return f"Descriptors_{self.name}_{ds_set}"
 
-    def dropDescriptors(
+    def dropDescriptors(self, descriptors: list[str]):
+        """Drop descriptors by name. Performs a simple feature selection by removing
+        the given descriptor names from the data set.
+
+        Args:
+            descriptors (list[str]): List of descriptor names to drop.
+        """
+        for ds in self.descriptors:
+            calc = ds.calculator
+            ds_names = calc.transformToFeatureNames()
+            to_keep = [x for x in ds_names if x not in descriptors]
+            ds.keepDescriptors(to_keep)
+
+    def dropDescriptorSets(
         self,
-        descriptors: list[DescriptorSet] | list[str],
+        descriptors: list[DescriptorSet | str],
+        full_removal: bool = False,
     ):
         """
-        Drop descriptors from the data frame
-        that were calculated with a specific calculator.
+        Drop descriptors from the given sets from the data frame.
 
         Args:
-            descriptors (list): list of `DescriptorSet` objects or prefixes of
-                descriptors to drop.
+            descriptors (list[DescriptorSet | str]):
+                List of `DescriptorSet` objects or their names. Name of a descriptor
+                set corresponds to the result returned by its `__str__` method.
+            full_removal (bool):
+                Whether to remove the descriptor data (will perform full removal).
+                By default, a soft removal is performed by just rendering the
+                descriptors inactive. A full removal will remove the descriptorSet from the
+                dataset, including the saved files. It is not possible to restore a
+                descriptorSet after a full removal.
         """
         # sanity check
         assert (
@@ -667,18 +698,39 @@ def dropDescriptors(
                 "No descriptors specified to drop. All descriptors will be retained."
             )
             return
-        # convert descriptors to descriptor set names
-        descriptors = [self.generateDescriptorDataSetName(x) for x in descriptors]
-        # drop the descriptors
+        if not isinstance(descriptors[0], str):
+            descriptors = [str(x) for x in descriptors]
+        # remove the descriptors
         to_remove = []
-        for idx, ds in enumerate(self.descriptors):
-            if ds.name in descriptors:
-                logger.info(f"Removing descriptor set: {ds.name}")
-                to_remove.append(idx)
+        to_drop = []
+        for name in descriptors:
+            for idx, ds in enumerate(self.descriptors):
+                calc = ds.calculator
+                if name == str(calc):
+                    to_drop.extend(ds.getDescriptorNames())
+                    if full_removal:
+                        to_remove.append(idx)
+        self.dropDescriptors(to_drop)
         for idx in reversed(to_remove):
             self.descriptors[idx].clearFiles()
             self.descriptors.pop(idx)
 
+    def restoreDescriptorSets(self, descriptors: list[DescriptorSet | str]):
+        """Restore descriptors that were previously removed.
+
+        Args:
+            descriptors (list[DescriptorSet | str]):
+                List of `DescriptorSet` objects or their names. Name of a descriptor
+                set corresponds to the result returned by its `__str__` method.
+        """
+        if not isinstance(descriptors[0], str):
+            descriptors = [str(x) for x in descriptors]
+        for name in descriptors:
+            for ds in self.descriptors:
+                calc = ds.calculator
+                if name == str(calc):
+                    ds.restoreDescriptors()
+
     def dropEmptySmiles(self):
         """Drop rows with empty SMILES from the data set."""
         self.df.dropna(subset=[self.smilesCol], inplace=True)
@@ -739,7 +791,7 @@ def addDescriptors(
                 Additional keyword arguments to pass to each descriptor set.
         """
         if recalculate and self.hasDescriptors():
-            self.dropDescriptors(descriptors)
+            self.dropDescriptorSets(descriptors, full_removal=True)
         to_calculate = []
         for desc_set, exists in zip(descriptors, self.hasDescriptors(descriptors)):
             if exists:
@@ -781,24 +833,16 @@ def addDescriptors(
             df_descriptors.loc[self.df.index, self.indexCols] = self.df[self.indexCols]
             self.attachDescriptors(calculator, df_descriptors, [self.idProp])
 
-    def getDescriptors(self):
+    def getDescriptors(self, active_only=False):
         """Get the calculated descriptors as a pandas data frame.
 
         Returns:
             pd.DataFrame: Data frame containing only descriptors.
         """
-        # join_cols = set()
-        # for descriptors in self.descriptors:
-        #     join_cols.update(set(descriptors.indexCols))
-        # join_cols = list(join_cols)
-        # ret = self.df[join_cols].copy()
-        # ret.reset_index(drop=True, inplace=True)
         ret = pd.DataFrame(index=pd.Index(self.df.index.values, name=self.idProp))
         for descriptors in self.descriptors:
-            df_descriptors = descriptors.getDescriptors()
+            df_descriptors = descriptors.getDescriptors(active_only=active_only)
             ret = ret.join(df_descriptors, how="left")
-        # ret.set_index(self.df.index, inplace=True)
-        # ret.drop(columns=join_cols, inplace=True)
         return ret
 
     def getDescriptorNames(self):

diff --git a/qsprpred/data/tables/qspr.py b/qsprpred/data/tables/qspr.py
@@ -519,6 +519,14 @@ def addDescriptors(
         super().addDescriptors(descriptors, recalculate, *args, **kwargs)
         self.featurize(update_splits=featurize)
 
+    def dropDescriptors(self, descriptors: list[str]):
+        super().dropDescriptors(descriptors)
+        self.featurize(update_splits=True)
+
+    def restoreDescriptorSets(self, descriptors: list[DescriptorSet | str]):
+        super().restoreDescriptorSets(descriptors)
+        self.featurize(update_splits=True)
+
     def featurize(self, update_splits=True):
         self.featureNames = self.getFeatureNames()
         if update_splits: