Fix getProteinData to pass source_path=self.dataDir to read_protein_set

David-Araripe · David-Araripe · commit 9132cc1ccf82 · 2024-01-08T16:54:35.000+01:00
diff --git a/qsprpred/data/sources/papyrus/papyrus_class.py b/qsprpred/data/sources/papyrus/papyrus_class.py
@@ -21,6 +21,7 @@ class Papyrus(DataSource):
     Attributes:
         DEFAULT_DIR (str): default directory for Papyrus database and the extracted data
         dataDir (str): storage directory for Papyrus database and the extracted data
+        _papyrusDir (str): directory holding the Papyrus database, i.e. os.path.join(dataDir, "papyrus")
         version (list): Papyrus database version
         descriptors (list, str, None): descriptors to download if not already present
         stereo (bool): use version with stereochemistry
@@ -60,6 +61,7 @@ def __init__(
                 use only plusplus version, only high quality data
         """
         self.dataDir = data_dir
+        self._papyrusDir = os.path.join(self.dataDir, "papyrus")
         self.version = version
         self.descriptors = descriptors
         self.stereo = stereo
@@ -73,8 +75,8 @@ def download(self):
         Only newly requested data is downloaded. Remove the files if you want to
         reload the data completely.
         """
-        os.makedirs(self.dataDir, exist_ok=True)
-        if not os.path.exists(os.path.join(self.dataDir, "papyrus")):
+        if not os.path.exists(self._papyrusDir):
+            os.makedirs(self.dataDir, exist_ok=True)
             logger.info("Downloading Papyrus database...")
             download_papyrus(
                 outdir=self.dataDir,
@@ -87,10 +89,8 @@ def download(self):
             )
         else:
             logger.info(
-                f"Papyrus database already"
-                f" downloaded. Using existing data. "
-                f"Delete the following folder to reload the data: "
-                f"{os.path.join(self.dataDir, 'papyrus')}"
+                "Papyrus database already downloaded. Using existing data. "
+                f"Delete the following folder to reload the data: {self._papyrusDir}"
             )
 
     def getData(
@@ -121,7 +121,7 @@ def getData(
         Returns:
             MolculeTable: the filtered data set
         """
-        logger.debug(f"Getting data from Papyrus data source...")
+        logger.debug("Getting data from Papyrus data source...")
         assert acc_keys is not None, "Please provide a list of accession keys."
         name = name or "papyrus"
         self.download()
@@ -143,7 +143,7 @@ def getData(
             plusplus=self.plusplus,
             papyrus_dir=self.dataDir,
         )
-        logger.debug(f"Finished filtering Papyrus data set.")
+        logger.debug("Finished filtering Papyrus data set.")
         logger.debug(f"Creating MoleculeTable from '{path}'.")
         ret = MoleculeTable.fromTableFile(name, path, store_dir=output_dir, **kwargs)
         logger.debug(f"Finished creating MoleculeTable from '{path}'.")
@@ -175,7 +175,9 @@ def getProteinData(
         if os.path.exists(path) and use_existing:
             return pd.read_table(path)
         else:
-            protein_data = papyrus_scripts.read_protein_set(version=self.version)
+            protein_data = papyrus_scripts.read_protein_set(
+                source_path=self.dataDir, version=self.version
+            )
             protein_data["accession"] = protein_data["target_id"].apply(
                 lambda x: x.split("_")[0]
             )