From 8db72d77c20bbd0b0f05c89103b7c54689dd05a3 Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 14 Mar 2024 13:29:28 +0100 Subject: [PATCH 01/12] add descriptor set prefixes to descriptor names --- qsprpred/data/descriptors/fingerprints.py | 5 +++-- qsprpred/data/descriptors/sets.py | 14 ++++++++++++-- qsprpred/data/processing/tests.py | 13 ++++++------- qsprpred/data/tables/mol.py | 18 ++++++------------ qsprpred/extra/data/descriptors/sets.py | 5 +++-- 5 files changed, 30 insertions(+), 25 deletions(-) diff --git a/qsprpred/data/descriptors/fingerprints.py b/qsprpred/data/descriptors/fingerprints.py index 79724843..090a4d9c 100644 --- a/qsprpred/data/descriptors/fingerprints.py +++ b/qsprpred/data/descriptors/fingerprints.py @@ -66,8 +66,9 @@ def __call__( values = self.getDescriptors(mols, props, *args, **kwargs) values = values[:, self.usedBits] values = values.astype(self.dtype) - df = pd.DataFrame(values, index=props[self.idProp]) - df.columns = self.descriptors + df = pd.DataFrame( + values, index=props[self.idProp], columns=self.transformToFeatureNames() + ) return df diff --git a/qsprpred/data/descriptors/sets.py b/qsprpred/data/descriptors/sets.py index 076160ed..61ec8f3c 100644 --- a/qsprpred/data/descriptors/sets.py +++ b/qsprpred/data/descriptors/sets.py @@ -130,8 +130,15 @@ def __call__( """ mols = self.iterMols(mols, to_list=True) values = self.getDescriptors(mols, props, *args, **kwargs) - df = pd.DataFrame(values, index=props[self.idProp]) - df.columns = self.descriptors + # check if descriptors have unique names + assert len(set(self.descriptors)) == len( + self.descriptors + ), f"Descriptor names are not unique for set '{self}': {self.descriptors}" + df = pd.DataFrame( + values, + index=props[self.idProp], + columns=self.transformToFeatureNames(), + ) try: df = df.astype(self.dtype) except ValueError as exp: @@ -147,6 +154,9 @@ def __call__( ) return df + def transformToFeatureNames(self): + return [f"{self}_{x}" for x in self.descriptors] + 
@abstractmethod def getDescriptors( self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs diff --git a/qsprpred/data/processing/tests.py b/qsprpred/data/processing/tests.py index 0d5a3e7a..8cac8b23 100644 --- a/qsprpred/data/processing/tests.py +++ b/qsprpred/data/processing/tests.py @@ -174,13 +174,12 @@ def recalculateWithMultiIndex(self): ] ) - # def testDefaultDescriptorAdd(self): - # """Test adding without index columns.""" - # # TODO: issue 88 needs to be solved for this to work - # self.dataset.nJobs = 1 - # df_new = self.dataset.getFeatures(concat=True).copy() - # calc = DataFrameDescriptorSet(df_new, suffix="new_df_desc") - # self.dataset.addDescriptors([calc]) + def testDefaultDescriptorAdd(self): + """Test adding without index columns.""" + self.dataset.nJobs = 1 + df_new = self.dataset.getFeatures(concat=True).copy() + calc = DataFrameDescriptorSet(df_new, suffix="new_df_desc") + self.dataset.addDescriptors([calc]) @parameterized.expand( [ diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py index 6a96a18c..e2efb836 100644 --- a/qsprpred/data/tables/mol.py +++ b/qsprpred/data/tables/mol.py @@ -107,11 +107,10 @@ def getDescriptorNames(self, active_only=True): the current descriptor set. Defaults to `True`. """ - all_descs = self.df.columns[~self.df.columns.isin(self.indexCols)].tolist() if active_only: - return [x for x in all_descs if x in self.calculator.descriptors] + return self.calculator.transformToFeatureNames() else: - return all_descs + return self.df.columns[~self.df.columns.isin(self.indexCols)].tolist() def fillMissing(self, fill_value, names): """Fill missing values in the descriptor table. 
@@ -138,8 +137,11 @@ def keepDescriptors(self, descriptors: list[str]) -> list[str]: """ all_descs = self.getDescriptorNames(active_only=False) to_keep = set(all_descs) & set(descriptors) + prefix = str(self.calculator) + "_" self.calculator.descriptors = [ - x for x in self.calculator.descriptors if x in to_keep + x.replace(prefix, "", 1) # remove prefix + for x in self.calculator.transformToFeatureNames() + if x in to_keep ] return self.getDescriptorNames() @@ -787,18 +789,10 @@ def getDescriptors(self): Returns: pd.DataFrame: Data frame containing only descriptors. """ - # join_cols = set() - # for descriptors in self.descriptors: - # join_cols.update(set(descriptors.indexCols)) - # join_cols = list(join_cols) - # ret = self.df[join_cols].copy() - # ret.reset_index(drop=True, inplace=True) ret = pd.DataFrame(index=pd.Index(self.df.index.values, name=self.idProp)) for descriptors in self.descriptors: df_descriptors = descriptors.getDescriptors() ret = ret.join(df_descriptors, how="left") - # ret.set_index(self.df.index, inplace=True) - # ret.drop(columns=join_cols, inplace=True) return ret def getDescriptorNames(self): diff --git a/qsprpred/extra/data/descriptors/sets.py b/qsprpred/extra/data/descriptors/sets.py index d62f89ce..d66102b1 100644 --- a/qsprpred/extra/data/descriptors/sets.py +++ b/qsprpred/extra/data/descriptors/sets.py @@ -396,8 +396,9 @@ def getDescriptors( # create a data frame with the same order of acc_keys as in props df = pd.DataFrame({"acc_keys": props["acc_keys"]}) # merge the calculated values with the data frame to attach them to the rows - df = df.merge(values, left_on="acc_keys", right_on="acc_keys", - how="left").set_index("acc_keys") + df = df.merge( + values, left_on="acc_keys", right_on="acc_keys", how="left" + ).set_index("acc_keys") return df.values @property From 1dbb05318d01b25681f0be8a89a33c690b6b9636 Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 14 Mar 2024 15:37:39 +0100 Subject: [PATCH 02/12] fix chemprop --- 
qsprpred/extra/gpu/models/chemprop.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qsprpred/extra/gpu/models/chemprop.py b/qsprpred/extra/gpu/models/chemprop.py index 43ccbb04..f46e37ec 100644 --- a/qsprpred/extra/gpu/models/chemprop.py +++ b/qsprpred/extra/gpu/models/chemprop.py @@ -600,9 +600,9 @@ def convertToMoleculeDataset( # find which column contains the SMILES strings prev_len = 0 for calc in self.featureCalculators: - names = calc.descriptors - if "SMILES" in names: - smiles_column = names.index("SMILES") + prev_len + names = calc.transformToFeatureNames() + if f"{calc}_SMILES" in names: + smiles_column = names.index(f"{calc}_SMILES") + prev_len break else: prev_len += len(names) From cb225289b175f2ccbccddb40b6f80789dc6e492b Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 14 Mar 2024 15:45:59 +0100 Subject: [PATCH 03/12] fix test --- qsprpred/extra/gpu/models/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qsprpred/extra/gpu/models/tests.py b/qsprpred/extra/gpu/models/tests.py index 3e7b68bf..cd431c5e 100644 --- a/qsprpred/extra/gpu/models/tests.py +++ b/qsprpred/extra/gpu/models/tests.py @@ -405,7 +405,7 @@ def testConsistency(self): ) df_test = pd.DataFrame(assessor.monitor.foldData[0]["X_test"]) - df_test.rename(columns={"Descriptor_SmilesDesc_SMILES": "SMILES"}, inplace=True) + df_test.rename(columns={"SmilesDesc_SMILES": "SMILES"}, inplace=True) df_test["pchembl_value_Mean"] = assessor.monitor.foldData[0]["y_test"] df_test.to_csv( f"{self.generatedModelsPath}/consistency_data_test.csv", index=False From a10ddcfce8320829b25e5e22a5aa959c2aeee914 Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 14 Mar 2024 15:46:30 +0100 Subject: [PATCH 04/12] more comprehensive code to drop descriptor sets --- qsprpred/data/descriptors/tests.py | 51 ++++++++++++++++--- qsprpred/data/processing/tests.py | 2 +- qsprpred/data/tables/mol.py | 51 ++++++++++++++----- qsprpred/data/tables/qspr.py | 4 ++ 
tutorials/advanced/data/parallelization.ipynb | 2 +- 5 files changed, 86 insertions(+), 24 deletions(-) diff --git a/qsprpred/data/descriptors/tests.py b/qsprpred/data/descriptors/tests.py index 88fb4f87..313819f4 100644 --- a/qsprpred/data/descriptors/tests.py +++ b/qsprpred/data/descriptors/tests.py @@ -27,6 +27,39 @@ def setUp(self): super().setUp() self.setUpPaths() + @staticmethod + def getDescList(): + return [MorganFP(radius=3, nBits=256), DrugExPhyschem()] + + def testDropping(self): + """Test dropping of descriptors from data sets.""" + dataset = self.createLargeTestDataSet("TestDropping") + # test dropping of all sets + dataset.addDescriptors(self.getDescList()) + self.assertTrue(dataset.getFeatures(concat=True).shape[1] > 0) + dataset.dropDescriptorSets(dataset.descriptorSets) + self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0) + dataset.dropDescriptorSets(dataset.descriptorSets, clear=True) + self.assertEqual(len(dataset.descriptors), 0) + dataset.addDescriptors(self.getDescList()) + dataset.dropDescriptorSets([str(x) for x in self.getDescList()]) + self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0) + dataset.dropDescriptorSets([str(x) for x in self.getDescList()], clear=True) + self.assertEqual(len(dataset.descriptors), 0) + # test dropping of single set + dataset.addDescriptors(self.getDescList()) + self.assertTrue(dataset.getFeatures(concat=True).shape[1] > 0) + dataset.dropDescriptorSets([dataset.descriptorSets[0]]) + self.assertEqual( + dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1]) + ) + dataset.dropDescriptorSets(dataset.descriptorSets, clear=True) + dataset.addDescriptors(self.getDescList()) + dataset.dropDescriptorSets([str(dataset.descriptorSets[0])], clear=True) + self.assertEqual( + dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1]) + ) + @parameterized.expand([(None, None), (1, None), (2, None), (4, 50)]) def testSwitching(self, n_cpu, chunk_size): """Test if the feature 
calculator can be switched to a new dataset.""" @@ -163,14 +196,16 @@ def setUp(self): super().setUp() self.setUpPaths() - @parameterized.expand([ - ( - f"{desc_set}_{TargetTasks.REGRESSION}", - desc_set, - [{"name": "CL", "task": TargetTasks.REGRESSION}], - ) - for desc_set in DataSetsPathMixIn.getAllDescriptors() - ]) + @parameterized.expand( + [ + ( + f"{desc_set}_{TargetTasks.REGRESSION}", + desc_set, + [{"name": "CL", "task": TargetTasks.REGRESSION}], + ) + for desc_set in DataSetsPathMixIn.getAllDescriptors() + ] + ) def testDescriptorsAll(self, _, desc_set, target_props): """Tests all available descriptor sets. diff --git a/qsprpred/data/processing/tests.py b/qsprpred/data/processing/tests.py index 8cac8b23..1e34b92e 100644 --- a/qsprpred/data/processing/tests.py +++ b/qsprpred/data/processing/tests.py @@ -152,7 +152,7 @@ def setUp(self): self.descriptors = self.dataset.featureNames def recalculateWithMultiIndex(self): - self.dataset.dropDescriptors(self.dataset.descriptorSets) + self.dataset.dropDescriptorSets(self.dataset.descriptorSets) self.df_descriptors["ID_COL1"] = ( self.dataset.getProperty(self.dataset.idProp) .apply(lambda x: x.split("_")[0]) diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py index e2efb836..118b21ed 100644 --- a/qsprpred/data/tables/mol.py +++ b/qsprpred/data/tables/mol.py @@ -648,17 +648,35 @@ def generateDescriptorDataSetName(self, ds_set: str | DescriptorSet): """Generate a descriptor set name from a descriptor set.""" return f"Descriptors_{self.name}_{ds_set}" - def dropDescriptors( + def dropDescriptors(self, descriptors: list[str]): + """Drop descriptors by name. Performs a simple feature selection by removing + the given descriptor names from the data set. + + Args: + descriptors (list[str]): List of descriptor names to drop. 
+ """ + for ds in self.descriptors: + calc = ds.calculator + ds_names = calc.transformToFeatureNames() + to_keep = [x for x in ds_names if x not in descriptors] + ds.keepDescriptors(to_keep) + + def dropDescriptorSets( self, - descriptors: list[DescriptorSet] | list[str], + descriptors: list[DescriptorSet | str], + clear: bool = False, ): """ - Drop descriptors from the data frame - that were calculated with a specific calculator. + Drop descriptors from the given sets from the data frame. Args: - descriptors (list): list of `DescriptorSet` objects or prefixes of - descriptors to drop. + descriptors (list[DescriptorSet | str]): + List of `DescriptorSet` objects or their names. Name of a descriptor + set corresponds to the result returned by its `__str__` method. + clear (bool): + Whether to remove the descriptor data (will perform full removal). + By default, a soft removal is performed by just rendering the + descriptors inactive. """ # sanity check assert ( @@ -669,14 +687,19 @@ def dropDescriptors( "No descriptors specified to drop. All descriptors will be retained." ) return - # convert descriptors to descriptor set names - descriptors = [self.generateDescriptorDataSetName(x) for x in descriptors] - # drop the descriptors + if not isinstance(descriptors[0], str): + descriptors = [str(x) for x in descriptors] + # remove the descriptors to_remove = [] - for idx, ds in enumerate(self.descriptors): - if ds.name in descriptors: - logger.info(f"Removing descriptor set: {ds.name}") - to_remove.append(idx) + to_drop = [] + for name in descriptors: + for idx, ds in enumerate(self.descriptors): + calc = ds.calculator + if name == str(calc): + to_drop.extend(ds.getDescriptorNames()) + if clear: + to_remove.append(idx) + self.dropDescriptors(to_drop) for idx in reversed(to_remove): self.descriptors[idx].clearFiles() self.descriptors.pop(idx) @@ -741,7 +764,7 @@ def addDescriptors( Additional keyword arguments to pass to each descriptor set. 
""" if recalculate and self.hasDescriptors(): - self.dropDescriptors(descriptors) + self.dropDescriptorSets(descriptors) to_calculate = [] for desc_set, exists in zip(descriptors, self.hasDescriptors(descriptors)): if exists: diff --git a/qsprpred/data/tables/qspr.py b/qsprpred/data/tables/qspr.py index 39a5ffb4..75246b81 100644 --- a/qsprpred/data/tables/qspr.py +++ b/qsprpred/data/tables/qspr.py @@ -517,6 +517,10 @@ def addDescriptors( super().addDescriptors(descriptors, recalculate, *args, **kwargs) self.featurize(update_splits=featurize) + def dropDescriptors(self, descriptors: list[str]): + super().dropDescriptors(descriptors) + self.featurize(update_splits=True) + def featurize(self, update_splits=True): self.featureNames = self.getFeatureNames() if update_splits: diff --git a/tutorials/advanced/data/parallelization.ipynb b/tutorials/advanced/data/parallelization.ipynb index 29aa965e..a864aa0b 100644 --- a/tutorials/advanced/data/parallelization.ipynb +++ b/tutorials/advanced/data/parallelization.ipynb @@ -147,7 +147,7 @@ " \"\"\"\n", " if data.hasDescriptors([desc_set])[0]:\n", " print(f\"Removing old descriptors: {desc_set}\")\n", - " data.dropDescriptors([desc_set])\n", + " data.dropDescriptorSets([desc_set])\n", " print(f\"Running and timing descriptor calculation: {desc_set}\")\n", " watch = StopWatch()\n", " data.addDescriptors([desc_set])\n", From 0f5b5d56cdc32fa5fff06cdf2285c9006ff81b6e Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 14 Mar 2024 15:51:58 +0100 Subject: [PATCH 05/12] make it possible to get active descriptors only for molecule table --- qsprpred/data/tables/mol.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py index 118b21ed..ed4bcdb6 100644 --- a/qsprpred/data/tables/mol.py +++ b/qsprpred/data/tables/mol.py @@ -806,7 +806,7 @@ def addDescriptors( df_descriptors.loc[self.df.index, self.indexCols] = self.df[self.indexCols] 
self.attachDescriptors(calculator, df_descriptors, [self.idProp]) - def getDescriptors(self): + def getDescriptors(self, active_only=False): """Get the calculated descriptors as a pandas data frame. Returns: @@ -814,7 +814,7 @@ def getDescriptors(self): """ ret = pd.DataFrame(index=pd.Index(self.df.index.values, name=self.idProp)) for descriptors in self.descriptors: - df_descriptors = descriptors.getDescriptors() + df_descriptors = descriptors.getDescriptors(active_only=active_only) ret = ret.join(df_descriptors, how="left") return ret From 6219c43e6dabe9bca200c82bb64965c6ec56f049 Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 14 Mar 2024 16:07:10 +0100 Subject: [PATCH 06/12] add restore option --- qsprpred/data/descriptors/tests.py | 12 ++++++++++-- qsprpred/data/tables/mol.py | 25 +++++++++++++++++++++++++ qsprpred/data/tables/qspr.py | 4 ++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/qsprpred/data/descriptors/tests.py b/qsprpred/data/descriptors/tests.py index 313819f4..68376997 100644 --- a/qsprpred/data/descriptors/tests.py +++ b/qsprpred/data/descriptors/tests.py @@ -36,7 +36,8 @@ def testDropping(self): dataset = self.createLargeTestDataSet("TestDropping") # test dropping of all sets dataset.addDescriptors(self.getDescList()) - self.assertTrue(dataset.getFeatures(concat=True).shape[1] > 0) + full_len = sum(len(x) for x in dataset.descriptorSets) + self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len) dataset.dropDescriptorSets(dataset.descriptorSets) self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0) dataset.dropDescriptorSets(dataset.descriptorSets, clear=True) @@ -48,7 +49,7 @@ def testDropping(self): self.assertEqual(len(dataset.descriptors), 0) # test dropping of single set dataset.addDescriptors(self.getDescList()) - self.assertTrue(dataset.getFeatures(concat=True).shape[1] > 0) + self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len) 
dataset.dropDescriptorSets([dataset.descriptorSets[0]]) self.assertEqual( dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1]) @@ -59,6 +60,13 @@ def testDropping(self): self.assertEqual( dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1]) ) + # test restoring of dropped sets + dataset.addDescriptors(self.getDescList()) + self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len) + dataset.dropDescriptorSets(dataset.descriptorSets, clear=False) + self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0) + dataset.restoreDescriptorSets(dataset.descriptorSets) + self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len) @parameterized.expand([(None, None), (1, None), (2, None), (4, 50)]) def testSwitching(self, n_cpu, chunk_size): diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py index ed4bcdb6..fe8e1e2f 100644 --- a/qsprpred/data/tables/mol.py +++ b/qsprpred/data/tables/mol.py @@ -145,6 +145,15 @@ def keepDescriptors(self, descriptors: list[str]) -> list[str]: ] return self.getDescriptorNames() + def restoreDescriptors(self) -> list[str]: + """Restore all descriptors to active in this set.""" + all_descs = self.getDescriptorNames(active_only=False) + prefix = str(self.calculator) + "_" + self.calculator.descriptors = [ + x.replace(prefix, "", 1) for x in all_descs # remove prefix + ] + return self.getDescriptorNames() + class MoleculeTable(PandasDataTable, SearchableMolTable, Summarizable): """Class that holds and prepares molecule data for modelling and other analyses. @@ -704,6 +713,22 @@ def dropDescriptorSets( self.descriptors[idx].clearFiles() self.descriptors.pop(idx) + def restoreDescriptorSets(self, descriptors: list[DescriptorSet | str]): + """Restore descriptors that were previously removed. + + Args: + descriptors (list[DescriptorSet | str]): + List of `DescriptorSet` objects or their names. 
Name of a descriptor + set corresponds to the result returned by its `__str__` method. + """ + if not isinstance(descriptors[0], str): + descriptors = [str(x) for x in descriptors] + for name in descriptors: + for ds in self.descriptors: + calc = ds.calculator + if name == str(calc): + ds.restoreDescriptors() + def dropEmptySmiles(self): """Drop rows with empty SMILES from the data set.""" self.df.dropna(subset=[self.smilesCol], inplace=True) diff --git a/qsprpred/data/tables/qspr.py b/qsprpred/data/tables/qspr.py index 75246b81..ba20607e 100644 --- a/qsprpred/data/tables/qspr.py +++ b/qsprpred/data/tables/qspr.py @@ -521,6 +521,10 @@ def dropDescriptors(self, descriptors: list[str]): super().dropDescriptors(descriptors) self.featurize(update_splits=True) + def restoreDescriptorSets(self, descriptors: list[DescriptorSet | str]): + super().restoreDescriptorSets(descriptors) + self.featurize(update_splits=True) + def featurize(self, update_splits=True): self.featureNames = self.getFeatureNames() if update_splits: From f5de2826e148991ee0b5dd2868fdf810afe37b83 Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 14 Mar 2024 16:12:34 +0100 Subject: [PATCH 07/12] make sure to clear descriptors in code that requires it --- qsprpred/data/processing/tests.py | 2 +- qsprpred/data/tables/mol.py | 2 +- tutorials/advanced/data/parallelization.ipynb | 157 ++++-------------- 3 files changed, 34 insertions(+), 127 deletions(-) diff --git a/qsprpred/data/processing/tests.py b/qsprpred/data/processing/tests.py index 1e34b92e..6f112017 100644 --- a/qsprpred/data/processing/tests.py +++ b/qsprpred/data/processing/tests.py @@ -152,7 +152,7 @@ def setUp(self): self.descriptors = self.dataset.featureNames def recalculateWithMultiIndex(self): - self.dataset.dropDescriptorSets(self.dataset.descriptorSets) + self.dataset.dropDescriptorSets(self.dataset.descriptorSets, clear=True) self.df_descriptors["ID_COL1"] = ( self.dataset.getProperty(self.dataset.idProp) .apply(lambda x: 
x.split("_")[0]) diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py index fe8e1e2f..d4f10c83 100644 --- a/qsprpred/data/tables/mol.py +++ b/qsprpred/data/tables/mol.py @@ -789,7 +789,7 @@ def addDescriptors( Additional keyword arguments to pass to each descriptor set. """ if recalculate and self.hasDescriptors(): - self.dropDescriptorSets(descriptors) + self.dropDescriptorSets(descriptors, clear=True) to_calculate = [] for desc_set, exists in zip(descriptors, self.hasDescriptors(descriptors)): if exists: diff --git a/tutorials/advanced/data/parallelization.ipynb b/tutorials/advanced/data/parallelization.ipynb index a864aa0b..ad5992f4 100644 --- a/tutorials/advanced/data/parallelization.ipynb +++ b/tutorials/advanced/data/parallelization.ipynb @@ -4,10 +4,7 @@ "cell_type": "markdown", "id": "9fedcee856268b35", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "# Code Parallelization\n", @@ -77,10 +74,7 @@ "cell_type": "markdown", "id": "8f9ffda3a4b8202f", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "## Setting `nJobs` and `chunkSize`\n", @@ -117,10 +111,7 @@ "end_time": "2024-01-16T16:30:51.361058064Z", "start_time": "2024-01-16T16:30:47.131517756Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -147,7 +138,7 @@ " \"\"\"\n", " if data.hasDescriptors([desc_set])[0]:\n", " print(f\"Removing old descriptors: {desc_set}\")\n", - " data.dropDescriptorSets([desc_set])\n", + " data.dropDescriptorSets([desc_set], clear=True)\n", " print(f\"Running and timing descriptor calculation: {desc_set}\")\n", " watch = StopWatch()\n", " data.addDescriptors([desc_set])\n", @@ -161,10 +152,7 @@ "cell_type": "markdown", "id": "9357f12c0516b989", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "This 
calculation is done on one CPU by default:" @@ -179,10 +167,7 @@ "end_time": "2024-01-16T16:30:51.368391209Z", "start_time": "2024-01-16T16:30:51.363595085Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -204,10 +189,7 @@ "cell_type": "markdown", "id": "e7e51a9829413df0", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "and the whole data set supplied as one chunk:" @@ -222,10 +204,7 @@ "end_time": "2024-01-16T16:30:51.372183338Z", "start_time": "2024-01-16T16:30:51.367032511Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -247,10 +226,7 @@ "cell_type": "markdown", "id": "d28c75dc19273bed", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can now try running this calculation in parallel on 2 CPUs:" @@ -265,10 +241,7 @@ "end_time": "2024-01-16T16:30:51.379969255Z", "start_time": "2024-01-16T16:30:51.375227876Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [], "source": [ @@ -279,10 +252,7 @@ "cell_type": "markdown", "id": "6bc6ee9045cc5f12", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "The chunk size will automatically be adjusted to 25% of the data set size so that each portion of the data set is processed on a separate CPU:" @@ -297,10 +267,7 @@ "end_time": "2024-01-16T16:30:51.411732902Z", "start_time": "2024-01-16T16:30:51.378238063Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -322,10 +289,7 @@ "cell_type": "markdown", "id": "2e21998b62ee78bf", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can see how this affects the time 
taken to run the calculation:" @@ -340,10 +304,7 @@ "end_time": "2024-01-16T16:30:53.084658845Z", "start_time": "2024-01-16T16:30:51.383586975Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -364,10 +325,7 @@ "cell_type": "markdown", "id": "bc5243c149010a23", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "This was faster, but not by a factor of 4. This is because there is some overhead associated with parallelization and the calculation of fingerprints is very fast by itself so the overhead affects our runtime more. In such cases, be careful about setting the chunk size manually:" @@ -382,10 +340,7 @@ "end_time": "2024-01-16T16:31:10.073558913Z", "start_time": "2024-01-16T16:30:53.083216365Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -407,10 +362,7 @@ "cell_type": "markdown", "id": "c9fdc32aa83072e6", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "This was slower than even the single CPU calculation!" @@ -420,10 +372,7 @@ "cell_type": "markdown", "id": "7c2367dd655da9c8", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "## Custom Operations\n", @@ -440,10 +389,7 @@ "end_time": "2024-01-16T16:31:10.082418114Z", "start_time": "2024-01-16T16:31:10.077838705Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -474,10 +420,7 @@ "cell_type": "markdown", "id": "3ada92396624b990", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "As you can see, this gives us a generator object. 
In order to run the function on each chunk and get the results, we need to iterate over the generator and collect results:" @@ -492,10 +435,7 @@ "end_time": "2024-01-16T16:31:10.175831497Z", "start_time": "2024-01-16T16:31:10.081098696Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -539,10 +479,7 @@ "cell_type": "markdown", "id": "a5f2d451e08ec155", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "The results in this case are just four `None` values since our function doesn't return anything:" @@ -557,10 +494,7 @@ "end_time": "2024-01-16T16:31:10.223479222Z", "start_time": "2024-01-16T16:31:10.180906772Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -582,10 +516,7 @@ "cell_type": "markdown", "id": "84a590acb0626ee9", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can also instruct the `apply` method to pass a `DataFrame` instead of a dictionary of properties to the function. This is useful if you want to use the `pandas.DataFrame` API to process the data:" @@ -600,10 +531,7 @@ "end_time": "2024-01-16T16:31:10.254595551Z", "start_time": "2024-01-16T16:31:10.227714969Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -633,10 +561,7 @@ "cell_type": "markdown", "id": "a14646b3cc04daee", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "**WARNING:** The `apply` method does not guarantee that the results will be returned in the same order as the chunks were processed. This is because the chunks are processed in parallel and the order depends on the order in which the parallel processes finish." 
@@ -646,10 +571,7 @@ "cell_type": "markdown", "id": "39fcfa580de331", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "### Molecule Processors\n", @@ -666,10 +588,7 @@ "end_time": "2024-01-16T16:31:10.307074944Z", "start_time": "2024-01-16T16:31:10.228216373Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -734,10 +653,7 @@ "cell_type": "markdown", "id": "d4a679c7ec23c64a", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "With `processMols`, we can also automatically convert the molecules to RDKit molecules before passing them to the processor:" @@ -752,10 +668,7 @@ "end_time": "2024-01-16T16:31:10.955175012Z", "start_time": "2024-01-16T16:31:10.278782050Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -803,10 +716,7 @@ "cell_type": "markdown", "id": "4927b7b9fe7bfa4c", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "You can also derive from `MolProcessorWithID` if you want to access the molecule IDs provided by the data set in your processor. 
This is useful to overcome the issue that the order in which chunks are processed is not guaranteed:" @@ -821,10 +731,7 @@ "end_time": "2024-01-16T16:31:12.843689806Z", "start_time": "2024-01-16T16:31:10.956455648Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { From d769e2e51fd818a2d32d4687e446d90a0c4ac974 Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 14 Mar 2024 17:28:27 +0100 Subject: [PATCH 08/12] edit CHANGELOG.md --- CHANGELOG.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 383a4b00..9484ad7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ # Change Log -From v3.0.0 to v3.0.1 +From v3.0.1 to v3.1.0 ## Fixes + - Fixed a bug in `QSPRDataset` where property transformations were not applied. ## Changes @@ -12,7 +13,23 @@ From v3.0.0 to v3.0.1 to `PandasDataTable`. - removed `getProperties`, `addProperty`, `removeProperty`, now use `PandasDataTable` methods directly. +- Since the way descriptors are saved has changed, this release is incompatible with + previous data sets and models. However, these can be easily converted to the new + format by adding + a prefix with descriptor set name to the old descriptor tables. Feel free to contact + us if you require assistance with this. ## New Features +- Descriptors are now saved with prefixes to indicate the descriptor sets. This reduces + the chance of name collisions when using multiple descriptor sets. +- Added new methods to `MoleculeTable` and `QSARDataset` for more fine-grained control + of clearing, dropping and restoring of descriptor sets calculated for the dataset. + - `dropDescriptorSets` will drop descriptors associated with the given descriptor + sets. + - `dropDescriptors` will drop individual descriptors associated with the given + descriptor sets and properties. 
+ - All drop actions are restorable with `restoreDescriptorSets` unless explicitly + cleared from the data set with the `clear` parameter of `dropDescriptorSets`. + ## Removed Features From e56cdf791301f9da4f368cc709bbb6792c1d9261 Mon Sep 17 00:00:00 2001 From: "Mw. H.W. van den Maagdenberg MSc" Date: Tue, 19 Mar 2024 09:51:33 +0100 Subject: [PATCH 09/12] add more docs --- qsprpred/data/tables/mol.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py index d4f10c83..669321ab 100644 --- a/qsprpred/data/tables/mol.py +++ b/qsprpred/data/tables/mol.py @@ -685,7 +685,9 @@ def dropDescriptorSets( clear (bool): Whether to remove the descriptor data (will perform full removal). By default, a soft removal is performed by just rendering the - descriptors inactive. + descriptors inactive. A full removal will remove the descriptorSet from the + dataset, including the saved files. It is not possible to restore a + descriptorSet after a full removal. 
""" # sanity check assert ( From d56ebb4d26fb0efc50ed10baf3ffca13799a73fe Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Tue, 19 Mar 2024 09:51:49 +0100 Subject: [PATCH 10/12] rename clear to full_removal --- qsprpred/data/descriptors/tests.py | 12 +++++++----- qsprpred/data/processing/tests.py | 2 +- qsprpred/data/tables/mol.py | 8 ++++---- qsprpred/extra/data/descriptors/tests.py | 2 +- qsprpred/utils/parallel.py | 2 +- tutorials/advanced/data/parallelization.ipynb | 2 +- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/qsprpred/data/descriptors/tests.py b/qsprpred/data/descriptors/tests.py index 68376997..8dc563d3 100644 --- a/qsprpred/data/descriptors/tests.py +++ b/qsprpred/data/descriptors/tests.py @@ -40,12 +40,14 @@ def testDropping(self): self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len) dataset.dropDescriptorSets(dataset.descriptorSets) self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0) - dataset.dropDescriptorSets(dataset.descriptorSets, clear=True) + dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=True) self.assertEqual(len(dataset.descriptors), 0) dataset.addDescriptors(self.getDescList()) dataset.dropDescriptorSets([str(x) for x in self.getDescList()]) self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0) - dataset.dropDescriptorSets([str(x) for x in self.getDescList()], clear=True) + dataset.dropDescriptorSets( + [str(x) for x in self.getDescList()], full_removal=True + ) self.assertEqual(len(dataset.descriptors), 0) # test dropping of single set dataset.addDescriptors(self.getDescList()) @@ -54,16 +56,16 @@ def testDropping(self): self.assertEqual( dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1]) ) - dataset.dropDescriptorSets(dataset.descriptorSets, clear=True) + dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=True) dataset.addDescriptors(self.getDescList()) - dataset.dropDescriptorSets([str(dataset.descriptorSets[0])], clear=True) + 
dataset.dropDescriptorSets([str(dataset.descriptorSets[0])], full_removal=True) self.assertEqual( dataset.getFeatures(concat=True).shape[1], len(self.getDescList()[1]) ) # test restoring of dropped sets dataset.addDescriptors(self.getDescList()) self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len) - dataset.dropDescriptorSets(dataset.descriptorSets, clear=False) + dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=False) self.assertEqual(dataset.getFeatures(concat=True).shape[1], 0) dataset.restoreDescriptorSets(dataset.descriptorSets) self.assertTrue(dataset.getFeatures(concat=True).shape[1] == full_len) diff --git a/qsprpred/data/processing/tests.py b/qsprpred/data/processing/tests.py index 6f112017..3e471ff6 100644 --- a/qsprpred/data/processing/tests.py +++ b/qsprpred/data/processing/tests.py @@ -152,7 +152,7 @@ def setUp(self): self.descriptors = self.dataset.featureNames def recalculateWithMultiIndex(self): - self.dataset.dropDescriptorSets(self.dataset.descriptorSets, clear=True) + self.dataset.dropDescriptorSets(self.dataset.descriptorSets, full_removal=True) self.df_descriptors["ID_COL1"] = ( self.dataset.getProperty(self.dataset.idProp) .apply(lambda x: x.split("_")[0]) diff --git a/qsprpred/data/tables/mol.py b/qsprpred/data/tables/mol.py index d4f10c83..0dc79971 100644 --- a/qsprpred/data/tables/mol.py +++ b/qsprpred/data/tables/mol.py @@ -673,7 +673,7 @@ def dropDescriptors(self, descriptors: list[str]): def dropDescriptorSets( self, descriptors: list[DescriptorSet | str], - clear: bool = False, + full_removal: bool = False, ): """ Drop descriptors from the given sets from the data frame. @@ -682,7 +682,7 @@ def dropDescriptorSets( descriptors (list[DescriptorSet | str]): List of `DescriptorSet` objects or their names. Name of a descriptor set corresponds to the result returned by its `__str__` method. - clear (bool): + full_removal (bool): Whether to remove the descriptor data (will perform full removal). 
By default, a soft removal is performed by just rendering the descriptors inactive. @@ -706,7 +706,7 @@ def dropDescriptorSets( calc = ds.calculator if name == str(calc): to_drop.extend(ds.getDescriptorNames()) - if clear: + if full_removal: to_remove.append(idx) self.dropDescriptors(to_drop) for idx in reversed(to_remove): @@ -789,7 +789,7 @@ def addDescriptors( Additional keyword arguments to pass to each descriptor set. """ if recalculate and self.hasDescriptors(): - self.dropDescriptorSets(descriptors, clear=True) + self.dropDescriptorSets(descriptors, full_removal=True) to_calculate = [] for desc_set, exists in zip(descriptors, self.hasDescriptors(descriptors)): if exists: diff --git a/qsprpred/extra/data/descriptors/tests.py b/qsprpred/extra/data/descriptors/tests.py index 9b413769..d5c8c445 100644 --- a/qsprpred/extra/data/descriptors/tests.py +++ b/qsprpred/extra/data/descriptors/tests.py @@ -173,7 +173,7 @@ def testSerialization(self, _, msa_provider_cls: Type[BioPythonMSA]): self.assertTrue(len(dataset_new.featureNames) == len(self.sampleDescSet)) self.assertTrue(all(mol_id in dataset_new.X_ind.index for mol_id in test_ids)) self.assertTrue(all(mol_id in dataset_new.y_ind.index for mol_id in train_ids)) - # clear files and try saving again + # full_removal files and try saving again dataset_new.clearFiles() dataset_new.save() diff --git a/qsprpred/utils/parallel.py b/qsprpred/utils/parallel.py index 03b3ea22..531d1654 100644 --- a/qsprpred/utils/parallel.py +++ b/qsprpred/utils/parallel.py @@ -106,7 +106,7 @@ def parallel_jit_generator( kwargs )) except StopIteration: - # no more data, clear out the slice generator + # no more data, full_removal out the slice generator done = True # wait for a free worker or until all remaining workers finish logger.debug(f"Waiting for {len(queue)} workers to finish...") diff --git a/tutorials/advanced/data/parallelization.ipynb b/tutorials/advanced/data/parallelization.ipynb index ad5992f4..d56f7703 100644 --- 
a/tutorials/advanced/data/parallelization.ipynb +++ b/tutorials/advanced/data/parallelization.ipynb @@ -138,7 +138,7 @@ " \"\"\"\n", " if data.hasDescriptors([desc_set])[0]:\n", " print(f\"Removing old descriptors: {desc_set}\")\n", - " data.dropDescriptorSets([desc_set], clear=True)\n", + " data.dropDescriptorSets([desc_set], full_removal=True)\n", " print(f\"Running and timing descriptor calculation: {desc_set}\")\n", " watch = StopWatch()\n", " data.addDescriptors([desc_set])\n", From 8e5c31e47b4886a42c744123e6012eacd3680cab Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 28 Mar 2024 15:16:56 +0100 Subject: [PATCH 11/12] reset change log --- CHANGELOG.md | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59cb15da..8ad0554c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,45 +1,18 @@ # Change Log -From v3.0.1 to v3.0.2 +From v3.0.2 to v3.1.0 ## Fixes -- Fixed a bug where an attached standardizer would be refit when calling - `QSPRModel.predictMols` with `use_applicability_domain=True`. -- Fixed a bug with `use_applicability_domain=True` in `QSPRModel.predictMols` - where an error would be raised if there were invalid molecules in the input. -- Fixed a bug where dataset type was not properly set to numeric - in `MlChemADWrapper.contains` -- Fixed a bug in `QSPRDataset` where property transformations were not applied. -- Fixed a bug where an attached standardizer would be refit when calling - `QSPRModel.predictMols` with `use_applicability_domain=True`. -- Fixed random seed not set in `FoldsFromDataSplit.iterFolds` for `ClusterSplit`. -- Fixed a bug where class ratios were shuffled in the `RatioDistributionAlgorithm`. +None. ## Changes -- The module containing the sole model base class (`QSPRModel`) was renamed - from `models` to `model`. -- Restrictions on `numpy` versions were removed to allow for more flexibility in - package installations. 
However, the `BorutaFilter` feature selection method does not - function with `numpy` versions 1.24.0 and above. Therefore, this functionality now - requires a downgrade to `numpy` version 1.23.0 or lower. This was reflected in the - documentation and `numpy` itself outputs a reasonable error message if the version is - incompatible. -- Data type in `MlChemADWrapper` is now set to `float64` by default, instead - of `float32`. -- Saving of models after hyperparameter optimization was improved to ensure parameters - are always propagated to the underlying estimator as well. +None. ## New Features -- The `DataFrameDescriptorSet` class was extended to allow more flexibility when joining - custom descriptor sets. -- Added the `prepMols` method to `DescriptorSet` to allow separated customization of - molecule preparation before descriptor calculation. -- The package can now be installed from the PyPI repository 🐍📦. -- New argument (`refit_optimal`) was added to `HyperparameterOptimization.optimize()` - method to make refitting of the model with optimal parameters easier. +None. ## Removed Features From 7a8f5befe981d4c53e2fd1f4a398a655779e9e57 Mon Sep 17 00:00:00 2001 From: martin-sicho Date: Thu, 28 Mar 2024 15:21:00 +0100 Subject: [PATCH 12/12] correct version number --- CHANGELOG.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aa4cd709..46d4a97d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Change Log -From v3.0.1 to v3.1.0 +From v3.0.2 to v3.1.0 ## Fixes @@ -24,7 +24,6 @@ From v3.0.1 to v3.1.0 ## New Features - - Descriptors are now saved with prefixes to indicate the descriptor sets. This reduces the chance of name collisions when using multiple descriptor sets. - Added new methods to `MoleculeTable` and `QSARDataset` for more fine-grained control @@ -35,10 +34,6 @@ From v3.0.1 to v3.1.0 descriptor sets and properties. 
  - All drop actions are restorable with `restoreDescriptorSets` unless explicitly
    removed from the data set with the `full_removal` parameter of `dropDescriptorSets`.
-- The `DataFrameDescriptorSet` class was extended to allow more flexibility when joining
-  custom descriptor sets.
-- Added the `prepMols` method to `DescriptorSet` to allow separated customization of
-  molecule preparation before descriptor calculation.
 
 ## Removed Features