fix mypy and ruff errors (#257)

* use overload for MetcalfScoring
* add networkx stub file
* fix mypy errors: fix the code, or ignore type checking for some nonsensical mypy errors
* fix ruff check errors for refactored code
* run ruff format
* fix imports
* unify the use of TYPE_CHECKING: use it only to avoid circular imports. This ensures that the type hints are available both during type checking and at runtime, improving code clarity and reducing the chance of runtime errors related to type hints.
* fix non-existing attribute bug
* fix typos
* use broader type hints: use Sequence and Mapping to replace list and dict, respectively
* change `datas` to `data`
* use specific types for the return of abstract methods when possible; use a more general type when necessary
NPLinker · Jun 14, 2024 · 6a6f170 · 6a6f170
1 parent fc9ddec
commit 6a6f170
Show file tree

Hide file tree

Showing 40 changed files with 188 additions and 143 deletions.
diff --git a/README.dev.md b/README.dev.md
@@ -33,7 +33,7 @@ python3 -m pip install --upgrade pip setuptools
 # install development dependencies
 pip install --no-cache-dir --editable ".[dev]"
 
-# install non-pypi dependecies
+# install non-pypi dependencies
 install-nplinker-deps
 ```
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -62,6 +62,7 @@ dev = [
     "types-Deprecated",
     "types-beautifulsoup4",
     "types-jsonschema",
+    "types-networkx",
     "pandas-stubs",
     # docs
     "mkdocs",

diff --git a/src/nplinker/arranger.py b/src/nplinker/arranger.py
@@ -171,7 +171,7 @@ def _get_gnps_file_mappings_file(self) -> Path:
             file_mappings_tsv if file_mappings_tsv.exists() else file_mappings_csv
         )
 
-        return gnps_file_mappings_file
+        return gnps_file_mappings_file  # type: ignore
 
     def _download_and_extract_gnps(self) -> None:
         """Download and extract the GNPS data.
@@ -304,7 +304,7 @@ def arrange_strain_mappings(self) -> None:
         If `self.config.mode` is "local", validate the strain mappings file.
         If `self.config.mode` is "podp", always generate the strain mappings file and validate it.
 
-        The valiation checks if the strain mappings file exists and if it is a valid JSON file
+        The validation checks if the strain mappings file exists and if it is a valid JSON file
         according to the schema defined in `schemas/strain_mappings_schema.json`.
         """
         if self.config.mode == "podp":

diff --git a/src/nplinker/class_info/runcanopus.py b/src/nplinker/class_info/runcanopus.py
@@ -81,7 +81,3 @@ def run_canopus(mgf_file, output_path, extra_params="--maxmz 600 formula zodiac
     open(os.path.join(output_path, "completed"), "w").close()
 
     return True
-
-
-if __name__ == "__main__":
-    run_canopus(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
diff --git a/src/nplinker/config.py b/src/nplinker/config.py
@@ -36,7 +36,7 @@ def load_config(config_file: str | PathLike) -> Dynaconf:
 
 
 # Note:
-# Validataor parameter `required=False` means the setting (e.g. "loglevel") must not exist rather
+# Validator parameter `required=False` means the setting (e.g. "loglevel") must not exist rather
 # than being optional. So don't set the parameter `required` if the key is optional.
 CONFIG_VALIDATORS = [
     # General settings

diff --git a/src/nplinker/genomics/abc.py b/src/nplinker/genomics/abc.py
@@ -1,14 +1,13 @@
 from abc import ABC
 from abc import abstractmethod
-from collections.abc import Sequence
 from .bgc import BGC
 from .gcf import GCF
 
 
 class BGCLoaderBase(ABC):
     """Abstract base class for BGC loader."""
 
-    def __init__(self, data_dir: str):
+    def __init__(self, data_dir: str) -> None:
         """Initialize the BGC loader.
 
         Args:
@@ -26,7 +25,7 @@ def get_files(self) -> dict[str, str]:
         """
 
     @abstractmethod
-    def get_bgcs(self) -> Sequence[BGC]:
+    def get_bgcs(self) -> list[BGC]:
         """Get BGC objects.
 
         Returns:
@@ -38,7 +37,7 @@ class GCFLoaderBase(ABC):
     """Abstract base class for GCF loader."""
 
     @abstractmethod
-    def get_gcfs(self, keep_mibig_only: bool, keep_singleton: bool) -> Sequence[GCF]:
+    def get_gcfs(self, keep_mibig_only: bool, keep_singleton: bool) -> list[GCF]:
         """Get GCF objects.
 
         Args:

diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py
@@ -2,6 +2,7 @@
 import fnmatch
 import logging
 import os
+from typing import Mapping
 from Bio import SeqIO
 from Bio import SeqRecord
 from nplinker.genomics import BGC
@@ -97,7 +98,7 @@ def get_bgcs(self) -> list[BGC]:
         return self._bgcs
 
     @staticmethod
-    def _parse_bgcs(bgc_files: dict[str, str]) -> list[BGC]:
+    def _parse_bgcs(bgc_files: Mapping[str, str]) -> list[BGC]:
         """Load given BGC files as BGC objects.
 
         Args:

diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py
@@ -5,6 +5,8 @@
 import time
 from os import PathLike
 from pathlib import Path
+from typing import Mapping
+from typing import Sequence
 import httpx
 from bs4 import BeautifulSoup
 from bs4 import NavigableString
@@ -82,7 +84,7 @@ def read_json(file: str | PathLike) -> dict[str, "GenomeStatus"]:
 
     @staticmethod
     def to_json(
-        genome_status_dict: dict[str, "GenomeStatus"], file: str | PathLike | None = None
+        genome_status_dict: Mapping[str, "GenomeStatus"], file: str | PathLike | None = None
     ) -> str | None:
         """Convert the genome status dictionary to a JSON string.
 
@@ -122,7 +124,7 @@ def _to_dict(self) -> dict:
 
 
 def podp_download_and_extract_antismash_data(
-    genome_records: list[dict[str, dict[str, str]]],
+    genome_records: Sequence[Mapping[str, Mapping[str, str]]],
     project_download_root: str | PathLike,
     project_extract_root: str | PathLike,
 ):
@@ -220,7 +222,7 @@ def podp_download_and_extract_antismash_data(
         raise ValueError("No antiSMASH data found for any genome")
 
 
-def get_best_available_genome_id(genome_id_data: dict[str, str]) -> str | None:
+def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | None:
     """Get the best available ID from genome_id_data dict.
 
     Args:
@@ -359,7 +361,7 @@ def _resolve_jgi_accession(jgi_id: str) -> str:
     return _resolve_genbank_accession(link.text)
 
 
-def _resolve_refseq_id(genome_id_data: dict[str, str]) -> str:
+def _resolve_refseq_id(genome_id_data: Mapping[str, str]) -> str:
     """Get the RefSeq ID to which the genome accession is linked.
 
     Check https://pairedomicsdata.bioinformatics.nl/schema.json.

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
@@ -2,11 +2,11 @@
 import logging
 from typing import TYPE_CHECKING
 from deprecated import deprecated
+from nplinker.strain import Strain
 from .aa_pred import predict_aa
 
 
 if TYPE_CHECKING:
-    from ..strain import Strain
     from .gcf import GCF
 
 logger = logging.getLogger(__name__)

diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 import logging
 from typing import TYPE_CHECKING
+from nplinker.strain import Strain
 from nplinker.strain import StrainCollection
 
 
 if TYPE_CHECKING:
-    from nplinker.strain import Strain
     from .bgc import BGC
 
 logger = logging.getLogger(__name__)

diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py
@@ -21,14 +21,14 @@ class MibigLoader:
     """
 
     def __init__(self, data_dir: str):
-        """Initialize the MIBiG metatdata loader.
+        """Initialize the MIBiG metadata loader.
 
         Args:
             data_dir: Path to the directory of MIBiG metadata json files
         """
         self.data_dir = data_dir
         self._file_dict = self.parse_data_dir(self.data_dir)
-        self._metadata_dict = self._parse_metadatas()
+        self._metadata_dict = self._parse_metadata()
         self._bgcs = self._parse_bgcs()
 
     def get_files(self) -> dict[str, str]:
@@ -58,15 +58,15 @@ def parse_data_dir(data_dir: str) -> dict[str, str]:
             file_dict[fname] = file
         return file_dict
 
-    def get_metadatas(self) -> dict[str, MibigMetadata]:
+    def get_metadata(self) -> dict[str, MibigMetadata]:
         """Get MibigMetadata objects.
 
         Returns:
             The key is BGC accession (file name) and the value is MibigMetadata object
         """
         return self._metadata_dict
 
-    def _parse_metadatas(self) -> dict[str, MibigMetadata]:
+    def _parse_metadata(self) -> dict[str, MibigMetadata]:
         """Parse all metadata files and return MibigMetadata objects.
 
         Returns:

diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py
@@ -3,6 +3,8 @@
 import logging
 from os import PathLike
 from pathlib import Path
+from typing import Mapping
+from typing import Sequence
 from jsonschema import validate
 from nplinker.defaults import GENOME_BGC_MAPPINGS_FILENAME
 from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA
@@ -65,7 +67,9 @@ def generate_mappings_genome_id_bgc_id(
     logger.info("Generated genome-BGC mappings file: %s", output_file)
 
 
-def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[BGC], list[BGC]]:
+def add_strain_to_bgc(
+    strains: StrainCollection, bgcs: Sequence[BGC]
+) -> tuple[list[BGC], list[BGC]]:
     """Assign a Strain object to `BGC.strain` for input BGCs.
 
     BGC id is used to find the corresponding Strain object. It's possible that
@@ -111,7 +115,7 @@ def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[
 
 
 def add_bgc_to_gcf(
-    bgcs: list[BGC], gcfs: list[GCF]
+    bgcs: Sequence[BGC], gcfs: Sequence[GCF]
 ) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]:
     """Add BGC objects to GCF object based on GCF's BGC ids.
 
@@ -165,7 +169,7 @@ def add_bgc_to_gcf(
     return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc
 
 
-def get_mibig_from_gcf(gcfs: list[GCF]) -> tuple[list[BGC], StrainCollection]:
+def get_mibig_from_gcf(gcfs: Sequence[GCF]) -> tuple[list[BGC], StrainCollection]:
     """Get MIBiG BGCs and strains from GCF objects.
 
     Args:
@@ -277,9 +281,9 @@ def extract_mappings_resolved_genome_id_bgc_id(
 
 
 def get_mappings_strain_id_bgc_id(
-    mappings_strain_id_original_genome_id: dict[str, set[str]],
-    mappings_original_genome_id_resolved_genome_id: dict[str, str],
-    mappings_resolved_genome_id_bgc_id: dict[str, set[str]],
+    mappings_strain_id_original_genome_id: Mapping[str, set[str]],
+    mappings_original_genome_id_resolved_genome_id: Mapping[str, str],
+    mappings_resolved_genome_id_bgc_id: Mapping[str, set[str]],
 ) -> dict[str, set[str]]:
     """Get mappings "strain_id <-> bgc_id".
 

diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
@@ -1,16 +1,21 @@
+from __future__ import annotations
 import logging
 import os
 from deprecated import deprecated
 from dynaconf import Dynaconf
 from nplinker import NPLINKER_APP_DATA_DIR
 from nplinker import defaults
+from nplinker.genomics import BGC
+from nplinker.genomics import GCF
 from nplinker.genomics.antismash import AntismashBGCLoader
 from nplinker.genomics.bigscape import BigscapeGCFLoader
 from nplinker.genomics.bigscape import BigscapeV2GCFLoader
 from nplinker.genomics.mibig import MibigLoader
 from nplinker.genomics.utils import add_bgc_to_gcf
 from nplinker.genomics.utils import add_strain_to_bgc
 from nplinker.genomics.utils import get_mibig_from_gcf
+from nplinker.metabolomics import MolecularFamily
+from nplinker.metabolomics import Spectrum
 from nplinker.metabolomics.gnps import GNPSAnnotationLoader
 from nplinker.metabolomics.gnps import GNPSMolecularFamilyLoader
 from nplinker.metabolomics.gnps import GNPSSpectrumLoader
@@ -58,11 +63,14 @@ def __init__(self, config: Dynaconf):
         """
         self.config = config
 
-        self.bgcs, self.gcfs, self.spectra, self.mfs = [], [], [], []
-        self.mibig_bgcs = []
-        self.mibig_strains_in_use = StrainCollection()
-        self.product_types = []
-        self.strains = StrainCollection()
+        self.bgcs: list[BGC] = []
+        self.gcfs: list[GCF] = []
+        self.spectra: list[Spectrum] = []
+        self.mfs: list[MolecularFamily] = []
+        self.mibig_bgcs: list[BGC] = []
+        self.mibig_strains_in_use: StrainCollection = StrainCollection()
+        self.product_types: list = []
+        self.strains: StrainCollection = StrainCollection()
 
         self.class_matches = None
         self.chem_classes = None
@@ -93,7 +101,7 @@ def _load_strain_mappings(self):
             self.strains.add(strain)
         logger.info("Loaded {} non-MiBIG Strain objects".format(len(self.strains)))
 
-        # 2. filter user specificied strains (remove all that are not specified by user).
+        # 2. filter user specified strains (remove all that are not specified by user).
         # It's not allowed to specify empty list of strains, otherwise validation will fail.
         user_strains_file = self.config.root_dir / defaults.STRAINS_SELECTED_FILENAME
         if user_strains_file.exists():

diff --git a/src/nplinker/metabolomics/abc.py b/src/nplinker/metabolomics/abc.py
@@ -1,23 +1,27 @@
 from abc import ABC
 from abc import abstractmethod
-from collections.abc import Sequence
-from typing import TYPE_CHECKING
-
-
-if TYPE_CHECKING:
-    from .molecular_family import MolecularFamily
-    from .spectrum import Spectrum
+from .molecular_family import MolecularFamily
+from .spectrum import Spectrum
 
 
 class SpectrumLoaderBase(ABC):
+    """Abstract base class for SpectrumLoader."""
+
     @property
     @abstractmethod
-    def spectra(self) -> Sequence["Spectrum"]: ...
+    def spectra(self) -> list["Spectrum"]:
+        """Get Spectrum objects.
+
+        Returns:
+            A sequence of Spectrum objects.
+        """
 
 
 class MolecularFamilyLoaderBase(ABC):
+    """Abstract base class for MolecularFamilyLoader."""
+
     @abstractmethod
-    def get_mfs(self, keep_singleton: bool) -> Sequence["MolecularFamily"]:
+    def get_mfs(self, keep_singleton: bool) -> list["MolecularFamily"]:
         """Get MolecularFamily objects.
 
         Args:
@@ -26,17 +30,31 @@ def get_mfs(self, keep_singleton: bool) -> Sequence["MolecularFamily"]:
                 only one spectrum.
 
         Returns:
-            A list of MolecularFamily objects.
+            A sequence of MolecularFamily objects.
         """
 
 
 class FileMappingLoaderBase(ABC):
+    """Abstract base class for FileMappingLoader."""
+
     @property
     @abstractmethod
-    def mappings(self) -> dict[str, list[str]]: ...
+    def mappings(self) -> dict[str, list[str]]:
+        """Get file mappings.
+
+        Returns:
+            A mapping from spectrum ID to the names of files where the spectrum occurs.
+        """
 
 
 class AnnotationLoaderBase(ABC):
+    """Abstract base class for AnnotationLoader."""
+
     @property
     @abstractmethod
-    def annotations(self) -> dict[str, dict]: ...
+    def annotations(self) -> dict[str, dict]:
+        """Get annotations.
+
+        Returns:
+            A mapping from spectrum ID to its annotations.
+        """