diff --git a/CHANGELOG.md b/CHANGELOG.md
index 59cb15da..46d4a97d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,45 +1,39 @@
 # Change Log
-From v3.0.1 to v3.0.2
+From v3.0.2 to v3.1.0
 
 ## Fixes
 
-- Fixed a bug where an attached standardizer would be refit when calling
-  `QSPRModel.predictMols` with `use_applicability_domain=True`.
-- Fixed a bug with `use_applicability_domain=True` in `QSPRModel.predictMols`
-  where an error would be raised if there were invalid molecules in the input.
-- Fixed a bug where dataset type was not properly set to numeric
-  in `MlChemADWrapper.contains`
 - Fixed a bug in `QSPRDataset` where property transformations were not applied.
 - Fixed a bug where an attached standardizer would be refit when calling
   `QSPRModel.predictMols` with `use_applicability_domain=True`.
 - Fixed random seed not set in `FoldsFromDataSplit.iterFolds` for `ClusterSplit`.
-- Fixed a bug where class ratios were shuffled in the `RatioDistributionAlgorithm`.
 
 ## Changes
 
-- The module containing the sole model base class (`QSPRModel`) was renamed
-  from `models` to `model`.
-- Restrictions on `numpy` versions were removed to allow for more flexibility in
-  package installations. However, the `BorutaFilter` feature selection method does not
-  function with `numpy` versions 1.24.0 and above. Therefore, this functionality now
-  requires a downgrade to `numpy` version 1.23.0 or lower. This was reflected in the
-  documentation and `numpy` itself outputs a reasonable error message if the version is
-  incompatible.
-- Data type in `MlChemADWrapper` is now set to `float64` by default, instead
-  of `float32`.
-- Saving of models after hyperparameter optimization was improved to ensure parameters
-  are always propagated to the underlying estimator as well.
+- Renamed `PandasDataTable.transform` to `PandasDataTable.transformProperties`.
+- Moved `imputeProperties`, `dropEmptyProperties` and `hasProperty` from `MoleculeTable`
+  to `PandasDataTable`.
+- Removed `getProperties`, `addProperty` and `removeProperty` from `MoleculeTable`; use
+  the `PandasDataTable` methods directly instead.
+- Since the way descriptors are saved has changed, this release is incompatible with
+  data sets and models created with previous versions. However, these can easily be
+  converted to the new format by prefixing the columns of the old descriptor tables
+  with the name of their descriptor set. Feel free to contact us if you require
+  assistance with this.
 
 ## New Features
 
-- The `DataFrameDescriptorSet` class was extended to allow more flexibility when joining
-  custom descriptor sets.
-- Added the `prepMols` method to `DescriptorSet` to allow separated customization of
-  molecule preparation before descriptor calculation.
-- The package can now be installed from the PyPI repository 🐍📦.
-- New argument (`refit_optimal`) was added to `HyperparameterOptimization.optimize()`
-  method to make refitting of the model with optimal parameters easier.
+- Descriptors are now saved with a prefix indicating their descriptor set. This reduces
+  the chance of name collisions when using multiple descriptor sets.
+- Added new methods to `MoleculeTable` and `QSPRDataset` for more fine-grained control
+  over clearing, dropping and restoring the descriptor sets calculated for a data set
+  (see the usage sketch below):
+  - `dropDescriptorSets` will drop all descriptors associated with the given descriptor
+    sets.
+  - `dropDescriptors` will drop individual descriptors by name.
+  - All drop actions are restorable with `restoreDescriptorSets` unless the descriptor
+    data is explicitly removed from the data set with the `full_removal` parameter of
+    `dropDescriptorSets`.
 
 ## Removed Features
 
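A rough usage sketch of the new dropping/restoring API (based on the tests added in qsprpred/data/descriptors/tests.py further below; it assumes `dataset` is a `QSPRDataset` that already has descriptors attached via `addDescriptors`):

    # soft removal: the descriptors are only rendered inactive, the data stays on disk
    dataset.dropDescriptorSets(dataset.descriptorSets)
    assert dataset.getFeatures(concat=True).shape[1] == 0

    # restore them again; sets can also be referenced by name, i.e. their str() value
    dataset.restoreDescriptorSets(dataset.descriptorSets)

    # full removal: also deletes the attached descriptor tables and files; not restorable
    dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=True)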
diff --git a/qsprpred/data/descriptors/fingerprints.py b/qsprpred/data/descriptors/fingerprints.py
index c355df38..87d21606 100644
--- a/qsprpred/data/descriptors/fingerprints.py
+++ b/qsprpred/data/descriptors/fingerprints.py
@@ -72,8 +72,9 @@ def __call__(
         values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs)
         values = values[:, self.usedBits]
         values = values.astype(self.dtype)
-        df = pd.DataFrame(values, index=props[self.idProp])
-        df.columns = self.descriptors
+        df = pd.DataFrame(
+            values, index=props[self.idProp], columns=self.transformToFeatureNames()
+        )
         return df
 
diff --git a/qsprpred/data/descriptors/sets.py b/qsprpred/data/descriptors/sets.py
index 3b81ba3d..a16ac1af 100644
--- a/qsprpred/data/descriptors/sets.py
+++ b/qsprpred/data/descriptors/sets.py
@@ -135,9 +135,17 @@ def __call__(
         Returns:
             data frame of descriptor values of shape (n_mols, n_descriptors)
         """
+        mols = self.iterMols(mols, to_list=True)
         values = self.getDescriptors(self.prepMols(mols), props, *args, **kwargs)
-        df = pd.DataFrame(values, index=props[self.idProp])
-        df.columns = self.descriptors
+        # check if descriptors have unique names
+        assert len(set(self.descriptors)) == len(
+            self.descriptors
+        ), f"Descriptor names are not unique for set '{self}': {self.descriptors}"
+        df = pd.DataFrame(
+            values,
+            index=props[self.idProp],
+            columns=self.transformToFeatureNames(),
+        )
         try:
             df = df.astype(self.dtype)
         except ValueError as exp:
@@ -153,6 +161,9 @@
             )
         return df
 
+    def transformToFeatureNames(self):
+        return [f"{self}_{x}" for x in self.descriptors]
+
     @abstractmethod
     def getDescriptors(
         self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
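To make the naming scheme introduced by `transformToFeatureNames` concrete, a minimal self-contained sketch (the set name and descriptor names are made up; a real `DescriptorSet` derives the prefix from its `__str__`):

    class MyDescSet:
        """Toy stand-in for a DescriptorSet with two descriptors."""
        descriptors = ["a", "b"]

        def __str__(self):
            return "MyDescSet"

        def transformToFeatureNames(self):
            # same expression as in DescriptorSet.transformToFeatureNames above
            return [f"{self}_{x}" for x in self.descriptors]

    print(MyDescSet().transformToFeatureNames())  # ['MyDescSet_a', 'MyDescSet_b']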
diff --git a/qsprpred/data/descriptors/tests.py b/qsprpred/data/descriptors/tests.py
index 88fb4f87..8dc563d3 100644
--- a/qsprpred/data/descriptors/tests.py
+++ b/qsprpred/data/descriptors/tests.py
@@ -27,6 +27,49 @@ def setUp(self):
         super().setUp()
         self.setUpPaths()
 
+    @staticmethod
+    def getDescList():
+        return [MorganFP(radius=3, nBits=256), DrugExPhyschem()]
+
+    def testDropping(self):
+        """Test dropping of descriptors from data sets."""
+        dataset = self.createLargeTestDataSet("TestDropping")
+        # test dropping of all sets
+        dataset.addDescriptors(self.getDescList())
+        full_len = sum(len(x) for x in dataset.descriptorSets)
+        self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len)
+        dataset.dropDescriptorSets(dataset.descriptorSets)
+        self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0)
+        dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=True)
+        self.assertEqual(len(dataset.descriptors), 0)
+        dataset.addDescriptors(self.getDescList())
+        dataset.dropDescriptorSets([str(x) for x in self.getDescList()])
+        self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0)
+        dataset.dropDescriptorSets(
+            [str(x) for x in self.getDescList()], full_removal=True
+        )
+        self.assertEqual(len(dataset.descriptors), 0)
+        # test dropping of single set
+        dataset.addDescriptors(self.getDescList())
+        self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len)
+        dataset.dropDescriptorSets([dataset.descriptorSets[0]])
+        self.assertEqual(
+            dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1])
+        )
+        dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=True)
+        dataset.addDescriptors(self.getDescList())
+        dataset.dropDescriptorSets([str(dataset.descriptorSets[0])], full_removal=True)
+        self.assertEqual(
+            dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1])
+        )
+        # test restoring of dropped sets
+        dataset.addDescriptors(self.getDescList())
+        self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len)
+        dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=False)
+        self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0)
+        dataset.restoreDescriptorSets(dataset.descriptorSets)
+        self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len)
+
     @parameterized.expand([(None, None), (1, None), (2, None), (4, 50)])
     def testSwitching(self, n_cpu, chunk_size):
         """Test if the feature calculator can be switched to a new dataset."""
@@ -163,14 +206,16 @@ def setUp(self):
         super().setUp()
         self.setUpPaths()
 
-    @parameterized.expand([
-        (
-            f"{desc_set}_{TargetTasks.REGRESSION}",
-            desc_set,
-            [{"name": "CL", "task": TargetTasks.REGRESSION}],
-        )
-        for desc_set in DataSetsPathMixIn.getAllDescriptors()
-    ])
+    @parameterized.expand(
+        [
+            (
+                f"{desc_set}_{TargetTasks.REGRESSION}",
+                desc_set,
+                [{"name": "CL", "task": TargetTasks.REGRESSION}],
+            )
+            for desc_set in DataSetsPathMixIn.getAllDescriptors()
+        ]
+    )
     def testDescriptorsAll(self, _, desc_set, target_props):
         """Tests all available descriptor sets.
 
diff --git a/qsprpred/data/processing/tests.py b/qsprpred/data/processing/tests.py
index 7f225ded..29c2609e 100644
--- a/qsprpred/data/processing/tests.py
+++ b/qsprpred/data/processing/tests.py
@@ -156,7 +156,7 @@ def setUp(self):
         self.descriptors = self.dataset.featureNames
 
     def recalculateWithMultiIndex(self):
-        self.dataset.dropDescriptors(self.dataset.descriptorSets)
+        self.dataset.dropDescriptorSets(self.dataset.descriptorSets, full_removal=True)
         self.df_descriptors["ID_COL1"] = (
             self.dataset.getProperty(self.dataset.idProp)
             .apply(lambda x: x.split("_")[0])
@@ -178,13 +178,12 @@ def recalculateWithMultiIndex(self):
             ]
         )
 
-    # def testDefaultDescriptorAdd(self):
-    #     """Test adding without index columns."""
-    #     # TODO: issue 88 needs to be solved for this to work
-    #     self.dataset.nJobs = 1
-    #     df_new = self.dataset.getFeatures(concat=True).copy()
-    #     calc = DataFrameDescriptorSet(df_new, suffix="new_df_desc")
-    #     self.dataset.addDescriptors([calc])
+    def testDefaultDescriptorAdd(self):
+        """Test adding without index columns."""
+        self.dataset.nJobs = 1
+        df_new = self.dataset.getFeatures(concat=True).copy()
+        calc = DataFrameDescriptorSet(df_new, suffix="new_df_desc")
+        self.dataset.addDescriptors([calc])
 
     @parameterized.expand(
         [
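The test above is re-enabled, presumably because descriptor columns now carry unique prefixes, so a `DataFrameDescriptorSet` built from already calculated features no longer collides with the original columns. A rough sketch of the pattern it exercises (the import path is an assumption; `dataset` is assumed to be a prepared `QSPRDataset`):

    from qsprpred.data.descriptors.sets import DataFrameDescriptorSet  # assumed import path

    df_new = dataset.getFeatures(concat=True).copy()  # existing features as a plain data frame
    calc = DataFrameDescriptorSet(df_new, suffix="new_df_desc")  # suffix keeps the new set's name unique
    dataset.addDescriptors([calc])  # attaches the same values again under the new prefix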
diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py
index 6a96a18c..1a7a8560 100644
--- a/qsprpred/data/tables/mol.py
+++ b/qsprpred/data/tables/mol.py
@@ -107,11 +107,10 @@ def getDescriptorNames(self, active_only=True):
                 the current descriptor set. Defaults to `True`.
         """
-        all_descs = self.df.columns[~self.df.columns.isin(self.indexCols)].tolist()
         if active_only:
-            return [x for x in all_descs if x in self.calculator.descriptors]
+            return self.calculator.transformToFeatureNames()
         else:
-            return all_descs
+            return self.df.columns[~self.df.columns.isin(self.indexCols)].tolist()
 
     def fillMissing(self, fill_value, names):
         """Fill missing values in the descriptor table.
@@ -138,8 +137,20 @@ def keepDescriptors(self, descriptors: list[str]) -> list[str]:
         """
         all_descs = self.getDescriptorNames(active_only=False)
         to_keep = set(all_descs) & set(descriptors)
+        prefix = str(self.calculator) + "_"
         self.calculator.descriptors = [
-            x for x in self.calculator.descriptors if x in to_keep
+            x.replace(prefix, "", 1)  # remove prefix
+            for x in self.calculator.transformToFeatureNames()
+            if x in to_keep
+        ]
+        return self.getDescriptorNames()
+
+    def restoreDescriptors(self) -> list[str]:
+        """Restore all descriptors to active in this set."""
+        all_descs = self.getDescriptorNames(active_only=False)
+        prefix = str(self.calculator) + "_"
+        self.calculator.descriptors = [
+            x.replace(prefix, "", 1) for x in all_descs  # remove prefix
         ]
         return self.getDescriptorNames()
 
@@ -646,17 +657,37 @@ def generateDescriptorDataSetName(self, ds_set: str | DescriptorSet):
         """Generate a descriptor set name from a descriptor set."""
         return f"Descriptors_{self.name}_{ds_set}"
 
-    def dropDescriptors(
+    def dropDescriptors(self, descriptors: list[str]):
+        """Drop descriptors by name. Performs a simple feature selection by removing
+        the given descriptor names from the data set.
+
+        Args:
+            descriptors (list[str]): List of descriptor names to drop.
+        """
+        for ds in self.descriptors:
+            calc = ds.calculator
+            ds_names = calc.transformToFeatureNames()
+            to_keep = [x for x in ds_names if x not in descriptors]
+            ds.keepDescriptors(to_keep)
+
+    def dropDescriptorSets(
         self,
-        descriptors: list[DescriptorSet] | list[str],
+        descriptors: list[DescriptorSet | str],
+        full_removal: bool = False,
     ):
         """
-        Drop descriptors from the data frame
-        that were calculated with a specific calculator.
+        Drop the given descriptor sets from the data frame.
 
         Args:
-            descriptors (list): list of `DescriptorSet` objects or prefixes of
-                descriptors to drop.
+            descriptors (list[DescriptorSet | str]):
+                List of `DescriptorSet` objects or their names. The name of a
+                descriptor set corresponds to the result returned by its `__str__`
+                method.
+            full_removal (bool):
+                Whether to also remove the underlying descriptor data (full removal).
+                By default, a soft removal is performed: the descriptors are only
+                rendered inactive and can be restored later. A full removal removes
+                the descriptor set from the data set, including its saved files,
+                and cannot be undone.
         """
         # sanity check
         assert (
@@ -667,18 +698,39 @@ def dropDescriptors(
                 "No descriptors specified to drop. All descriptors will be retained."
            )
            return
-        # convert descriptors to descriptor set names
-        descriptors = [self.generateDescriptorDataSetName(x) for x in descriptors]
-        # drop the descriptors
+        if not isinstance(descriptors[0], str):
+            descriptors = [str(x) for x in descriptors]
+        # remove the descriptors
         to_remove = []
-        for idx, ds in enumerate(self.descriptors):
-            if ds.name in descriptors:
-                logger.info(f"Removing descriptor set: {ds.name}")
-                to_remove.append(idx)
+        to_drop = []
+        for name in descriptors:
+            for idx, ds in enumerate(self.descriptors):
+                calc = ds.calculator
+                if name == str(calc):
+                    to_drop.extend(ds.getDescriptorNames())
+                    if full_removal:
+                        to_remove.append(idx)
+        self.dropDescriptors(to_drop)
         for idx in reversed(to_remove):
             self.descriptors[idx].clearFiles()
             self.descriptors.pop(idx)
 
+    def restoreDescriptorSets(self, descriptors: list[DescriptorSet | str]):
+        """Restore descriptor sets that were previously soft-dropped
+        with `dropDescriptorSets`.
+
+        Args:
+            descriptors (list[DescriptorSet | str]):
+                List of `DescriptorSet` objects or their names. The name of a
+                descriptor set corresponds to the result returned by its `__str__`
+                method.
+        """
+        if not isinstance(descriptors[0], str):
+            descriptors = [str(x) for x in descriptors]
+        for name in descriptors:
+            for ds in self.descriptors:
+                calc = ds.calculator
+                if name == str(calc):
+                    ds.restoreDescriptors()
+
     def dropEmptySmiles(self):
         """Drop rows with empty SMILES from the data set."""
         self.df.dropna(subset=[self.smilesCol], inplace=True)
@@ -739,7 +791,7 @@ def addDescriptors(
                 Additional keyword arguments to pass to each descriptor set.
         """
         if recalculate and self.hasDescriptors():
-            self.dropDescriptors(descriptors)
+            self.dropDescriptorSets(descriptors, full_removal=True)
         to_calculate = []
         for desc_set, exists in zip(descriptors, self.hasDescriptors(descriptors)):
             if exists:
@@ -781,24 +833,16 @@ def addDescriptors(
         df_descriptors.loc[self.df.index, self.indexCols] = self.df[self.indexCols]
         self.attachDescriptors(calculator, df_descriptors, [self.idProp])
 
-    def getDescriptors(self):
+    def getDescriptors(self, active_only=False):
         """Get the calculated descriptors as a pandas data frame.
 
+        Args:
+            active_only (bool): Whether to include only descriptors that are
+                currently active (i.e. not soft-dropped). Defaults to `False`.
+
         Returns:
             pd.DataFrame: Data frame containing only descriptors.
         """
-        # join_cols = set()
-        # for descriptors in self.descriptors:
-        #     join_cols.update(set(descriptors.indexCols))
-        # join_cols = list(join_cols)
-        # ret = self.df[join_cols].copy()
-        # ret.reset_index(drop=True, inplace=True)
         ret = pd.DataFrame(index=pd.Index(self.df.index.values, name=self.idProp))
         for descriptors in self.descriptors:
-            df_descriptors = descriptors.getDescriptors()
+            df_descriptors = descriptors.getDescriptors(active_only=active_only)
             ret = ret.join(df_descriptors, how="left")
-        # ret.set_index(self.df.index, inplace=True)
-        # ret.drop(columns=join_cols, inplace=True)
         return ret
 
     def getDescriptorNames(self):
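Besides set-level dropping, the reworked `dropDescriptors` above acts on individual, prefixed descriptor names (a simple feature selection). A small sketch under the same assumptions as before (`dataset` is a prepared data set with descriptors attached; the slice is only illustrative):

    # feature names now follow the "<set>_<descriptor>" pattern
    names = dataset.getDescriptorNames()

    # deactivate two individual features; like a soft set removal, this can be
    # reverted by restoring the parent set with restoreDescriptorSets
    dataset.dropDescriptors(names[:2])

On a `QSPRDataset`, the override in `qsprpred/data/tables/qspr.py` below additionally re-featurizes the data set, so the training and test feature matrices stay in sync after dropping or restoring.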
diff --git a/qsprpred/data/tables/qspr.py b/qsprpred/data/tables/qspr.py
index afa2814b..b79b212e 100644
--- a/qsprpred/data/tables/qspr.py
+++ b/qsprpred/data/tables/qspr.py
@@ -519,6 +519,14 @@ def addDescriptors(
         super().addDescriptors(descriptors, recalculate, *args, **kwargs)
         self.featurize(update_splits=featurize)
 
+    def dropDescriptors(self, descriptors: list[str]):
+        super().dropDescriptors(descriptors)
+        self.featurize(update_splits=True)
+
+    def restoreDescriptorSets(self, descriptors: list[DescriptorSet | str]):
+        super().restoreDescriptorSets(descriptors)
+        self.featurize(update_splits=True)
+
     def featurize(self, update_splits=True):
         self.featureNames = self.getFeatureNames()
         if update_splits:
diff --git a/qsprpred/extra/data/descriptors/sets.py b/qsprpred/extra/data/descriptors/sets.py
index d62f89ce..d66102b1 100644
--- a/qsprpred/extra/data/descriptors/sets.py
+++ b/qsprpred/extra/data/descriptors/sets.py
@@ -396,8 +396,9 @@ def getDescriptors(
         # create a data frame with the same order of acc_keys as in props
         df = pd.DataFrame({"acc_keys": props["acc_keys"]})
         # merge the calculated values with the data frame to attach them to the rows
-        df = df.merge(values, left_on="acc_keys", right_on="acc_keys",
-                      how="left").set_index("acc_keys")
+        df = df.merge(
+            values, left_on="acc_keys", right_on="acc_keys", how="left"
+        ).set_index("acc_keys")
         return df.values
 
     @property
diff --git a/qsprpred/extra/data/descriptors/tests.py b/qsprpred/extra/data/descriptors/tests.py
index 9b413769..d5c8c445 100644
--- a/qsprpred/extra/data/descriptors/tests.py
+++ b/qsprpred/extra/data/descriptors/tests.py
@@ -173,7 +173,7 @@ def testSerialization(self, _, msa_provider_cls: Type[BioPythonMSA]):
         self.assertTrue(len(dataset_new.featureNames) == len(self.sampleDescSet))
         self.assertTrue(all(mol_id in dataset_new.X_ind.index for mol_id in test_ids))
         self.assertTrue(all(mol_id in dataset_new.y_ind.index for mol_id in train_ids))
-        # clear files and try saving again
+        # clear the saved files and try saving again
         dataset_new.clearFiles()
         dataset_new.save()
diff --git a/qsprpred/extra/gpu/models/chemprop.py b/qsprpred/extra/gpu/models/chemprop.py
index ca093150..101547fc 100644
--- a/qsprpred/extra/gpu/models/chemprop.py
+++ b/qsprpred/extra/gpu/models/chemprop.py
@@ -605,9 +605,9 @@ def convertToMoleculeDataset(
         # find which column contains the SMILES strings
         prev_len = 0
         for calc in self.featureCalculators:
-            names = calc.descriptors
-            if "SMILES" in names:
-                smiles_column = names.index("SMILES") + prev_len
+            names = calc.transformToFeatureNames()
+            if f"{calc}_SMILES" in names:
+                smiles_column = names.index(f"{calc}_SMILES") + prev_len
                 break
             else:
                 prev_len += len(names)
diff --git a/qsprpred/extra/gpu/models/tests.py b/qsprpred/extra/gpu/models/tests.py
index 1c22ec83..6cd00f04 100644
--- a/qsprpred/extra/gpu/models/tests.py
+++ b/qsprpred/extra/gpu/models/tests.py
@@ -404,7 +404,7 @@ def testConsistency(self):
         )
 
         df_test = pd.DataFrame(assessor.monitor.foldData[0]["X_test"])
-        df_test.rename(columns={"Descriptor_SmilesDesc_SMILES": "SMILES"}, inplace=True)
+        df_test.rename(columns={"SmilesDesc_SMILES": "SMILES"}, inplace=True)
         df_test["pchembl_value_Mean"] = assessor.monitor.foldData[0]["y_test"]
         df_test.to_csv(
             f"{self.generatedModelsPath}/consistency_data_test.csv", index=False
diff --git a/qsprpred/utils/parallel.py b/qsprpred/utils/parallel.py
index 03b3ea22..531d1654 100644
--- a/qsprpred/utils/parallel.py
+++ b/qsprpred/utils/parallel.py
@@ -106,7 +106,7 @@ def parallel_jit_generator(
                     kwargs
                 ))
             except StopIteration:
-                # no more data, clear out the slice generator
+                # no more data, the slice generator is exhausted
                 done = True
             # wait for a free worker or until all remaining workers finish
             logger.debug(f"Waiting for {len(queue)} workers to finish...")
diff --git a/tutorials/advanced/data/parallelization.ipynb b/tutorials/advanced/data/parallelization.ipynb
index 1d10eb5d..d56f7703 100644
--- a/tutorials/advanced/data/parallelization.ipynb
+++ b/tutorials/advanced/data/parallelization.ipynb
@@ -138,7 +138,7 @@
     "    \"\"\"\n",
     "    if data.hasDescriptors([desc_set])[0]:\n",
     "        print(f\"Removing old descriptors: {desc_set}\")\n",
-    "        data.dropDescriptors([desc_set])\n",
+    "        data.dropDescriptorSets([desc_set], full_removal=True)\n",
     "    print(f\"Running and timing descriptor calculation: {desc_set}\")\n",
     "    watch = StopWatch()\n",
     "    data.addDescriptors([desc_set])\n",