Merge pull request #169 from NPLinker/refactor_gnps_classes

Refactor gnps classes and functions
NPLinker · Aug 29, 2023 · 4d230e9 · 4d230e9
2 parents 80a8299 + 69b73ae
commit 4d230e9
Show file tree

Hide file tree

Showing 47 changed files with 1,147 additions and 2,035 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -5,8 +5,8 @@ httpx
 numpy
 pandas
 progress
+pyteomics
 pytest-lazy-fixture
-requests
 scipy
 sortedcontainers
 toml

diff --git a/src/nplinker/annotations.py b/src/nplinker/annotations.py
@@ -22,7 +22,7 @@
 
 logger = LogConfig.getLogger(__name__)
 
-GNPS_URL_FORMAT = 'https://metabolomics-usi.ucsd.edu/{}/?usi=mzspec:GNPSLIBRARY:{}'
+GNPS_URL_FORMAT = "https://metabolomics-usi.gnps2.org/{}/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:{}"
 GNPS_INDEX_COLUMN = '#Scan#'
 GNPS_DATA_COLUMNS = ['Compound_Name', 'Organism', 'MQScore', 'SpectrumID']
 

diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py
@@ -2,7 +2,6 @@
 from os import PathLike
 from pathlib import Path
 import shutil
-from urllib.error import HTTPError
 from nplinker.logconfig import LogConfig
 from nplinker.utils import download_and_extract_archive
 from nplinker.utils import list_dirs
@@ -74,12 +73,10 @@ def download_and_extract_antismash_data(antismash_id: str,
         logger.info('antiSMASH BGC data of %s is downloaded and extracted.',
                     antismash_id)
 
-    except HTTPError as e:
+    except Exception as e:
         shutil.rmtree(extract_path)
         logger.warning(e)
-        raise HTTPError(e.url, e.code,
-                        f"Could not find a valid url for {antismash_id}",
-                        e.headers, e.fp) from e
+        raise e
 
 
 def _check_roots(download_root: PathLike, extract_root: PathLike):

diff --git a/src/nplinker/metabolomics/abc.py b/src/nplinker/metabolomics/abc.py
@@ -1,32 +1,36 @@
-from abc import ABC, abstractmethod
-
-from nplinker.metabolomics.molecular_family import MolecularFamily
+from abc import ABC
+from abc import abstractmethod
 from collections.abc import Sequence
+from nplinker.metabolomics.molecular_family import MolecularFamily
 from nplinker.metabolomics.spectrum import Spectrum
 
 
 class SpectrumLoaderBase(ABC):
-
+
+    @property
     @abstractmethod
     def spectra(self) -> Sequence[Spectrum]:
         ...
 
 class MolecularFamilyLoaderBase(ABC):
-
+
+    @property
     @abstractmethod
     def families(self) -> Sequence[MolecularFamily]:
         ...
 
 
 class FileMappingLoaderBase(ABC):
 
+    @property
     @abstractmethod
-    def mapping(self) -> dict[int, list[str]]:
+    def mappings(self) -> dict[str, list[str]]:
         ...
 
 
 class AnnotationLoaderBase(ABC):
 
+    @property
     @abstractmethod
-    def get_annotations(self) -> dict[int, dict]:
-        ...
+    def annotations(self) -> dict[str, dict]:
+        ...
diff --git a/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py b/src/nplinker/metabolomics/gnps/gnps_annotation_loader.py
@@ -1,42 +1,132 @@
 import csv
 from os import PathLike
 from pathlib import Path
-from typing import Any
 from nplinker.metabolomics.abc import AnnotationLoaderBase
+from nplinker.utils import is_file_format
 
 
-GNPS_URL_FORMAT = 'https://metabolomics-usi.ucsd.edu/{}/?usi=mzspec:GNPSLIBRARY:{}'
+GNPS_UNIVERSAL_SPECTRUM_IDENTIFIER_URL = "https://metabolomics-usi.gnps2.org/{}/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:{}"
+
 
 class GNPSAnnotationLoader(AnnotationLoaderBase):
+
     def __init__(self, file: str | PathLike):
         """Load annotations from GNPS output file.
 
+        The annotation file is a .tsv file from GNPS output archive, as described
+        below for each GNPS workflow type:
+        1. METABOLOMICS-SNETS
+            - result_specnets_DB/*.tsv
+        2. METABOLOMICS-SNETS-V2
+            - result_specnets_DB/*.tsv
+        3. FEATURE-BASED-MOLECULAR-NETWORKING
+            - DB_result/*.tsv
+
         Args:
             file(str | PathLike): The GNPS annotation file.
+
+        Example:
+            >>> loader = GNPSAnnotationLoader("gnps_annotations.tsv")
+            >>> print(loader.annotations["100"])
+            {'#Scan#': '100',
+            'Adduct': 'M+H',
+            'CAS_Number': 'N/A',
+            'Charge': '1',
+            'Compound_Name': 'MLS002153841-01!Iobenguane sulfate',
+            'Compound_Source': 'NIH Pharmacologically Active Library',
+            'Data_Collector': 'VP/LMS',
+            'ExactMass': '274.992',
+            'INCHI': 'N/A',
+            'INCHI_AUX': 'N/A',
+            'Instrument': 'qTof',
+            'IonMode': 'Positive',
+            'Ion_Source': 'LC-ESI',
+            'LibMZ': '276.003',
+            'LibraryName': 'lib-00014.mgf',
+            'LibraryQualityString': 'Gold',
+            'Library_Class': '1',
+            'MQScore': '0.704152',
+            'MZErrorPPM': '405416',
+            'MassDiff': '111.896',
+            'Organism': 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE',
+            'PI': 'Dorrestein',
+            'Precursor_MZ': '276.003',
+            'Pubmed_ID': 'N/A',
+            'RT_Query': '795.979',
+            'SharedPeaks': '7',
+            'Smiles': 'NC(=N)NCc1cccc(I)c1.OS(=O)(=O)O',
+            'SpecCharge': '1',
+            'SpecMZ': '164.107',
+            'SpectrumFile': 'spectra/specs_ms.pklbin',
+            'SpectrumID': 'CCMSLIB00000086167',
+            'TIC_Query': '986.997',
+            'UpdateWorkflowName': 'UPDATE-SINGLE-ANNOTATED-GOLD',
+            'tags': ' ',
+            'png_url': 'https://metabolomics-usi.gnps2.org/png/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',
+            'json_url': 'https://metabolomics-usi.gnps2.org/json/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',
+            'svg_url': 'https://metabolomics-usi.gnps2.org/svg/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',
+            'spectrum_url': 'https://metabolomics-usi.gnps2.org/spectrum/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167'}
         """
         self._file = Path(file)
-        self._annotations : dict[str, dict] = {}
+        self._annotations: dict[str, dict] = {}
 
-        with open(self._file, mode='rt', encoding='UTF-8') as f:
-            header = f.readline().split('\t')
-            dict_reader = csv.DictReader(f, header, delimiter='\t')
-            for row in dict_reader:
-                scan_id = row.pop('#Scan#')
-                self._annotations[scan_id] = row
+        self._validate()
+        self._load()
 
-                # also insert useful URLs
-                for t in ['png', 'json', 'svg', 'spectrum']:
-                    self._annotations[scan_id][f'{t}_url'] = GNPS_URL_FORMAT.format(t, row['SpectrumID'])
+    @property
+    def annotations(self) -> dict[str, dict]:
+        """Get annotations.
 
+        Returns:
+            dict[str, dict]: Keys are spectrum ids ("#Scan#" in annotation file)
+                and values are the annotations dict for each spectrum.
+        """
+        return self._annotations
 
+    def _validate(self) -> None:
+        """Validate the annotation file.
 
-    def get_annotations(self) -> dict[str, dict]:
-        """Get annotations.
+        Raises:
+            ValueError: If the file is not valid.
+        """
+        # validate file format
+        if not is_file_format(self._file, 'tsv'):
+            raise ValueError(f"Invalid GNPS annotation file '{self._file}'. "
+                             f"Expected a .tsv file.")
 
-        Returns:
-            dict[str, dict]: Spectra indices are keys and values are the annotations for this spectrum.
+        # validate required columns against the header
+        required_columns = [
+            '#Scan#', 'Compound_Name', 'Organism', 'MQScore', 'SpectrumID'
+        ]
+        with open(self._file, mode='rt') as f:
+            header = f.readline()
+            for k in required_columns:
+                if k not in header:
+                    raise ValueError(
+                        f"Invalid GNPS annotation file '{self._file}'. "
+                        f"Expected a header line with '{k}' column, "
+                        f"but got '{header}'.")
 
-        Examples:
-            >>> print(loader.annotations()[100])
-            """
-        return self._annotations
+        # validate that "#Scan#" values are unique
+        with open(self._file, mode='rt') as f:
+            reader = csv.DictReader(f, delimiter='\t')
+            scans = [row["#Scan#"] for row in reader]
+        duplicates = {x for x in scans if scans.count(x) > 1}
+        if len(duplicates) > 0:
+            raise ValueError(
+                f"Invalid GNPS annotation file '{self._file}'. "
+                f"Expected unique '#Scan#', but found duplicates '{duplicates}'."
+            )
+
+    def _load(self) -> None:
+        """Load the annotations from the file."""
+        with open(self._file, mode='rt') as f:
+            dict_reader = csv.DictReader(f, delimiter='\t')
+            for row in dict_reader:
+                scan_id = row['#Scan#']
+                self._annotations[scan_id] = row
+                # insert useful URLs
+                for t in ['png', 'json', 'svg', 'spectrum']:
+                    self._annotations[scan_id][
+                        f'{t}_url'] = GNPS_UNIVERSAL_SPECTRUM_IDENTIFIER_URL.format(
+                            t, row['SpectrumID'])
diff --git a/src/nplinker/metabolomics/gnps/gnps_downloader.py b/src/nplinker/metabolomics/gnps/gnps_downloader.py
@@ -1,67 +1,88 @@
 from os import PathLike
 from pathlib import Path
-import httpx
-
-from .gnps_format import GNPSFormat
-from .gnps_format import gnps_format_from_task_id
 from typing_extensions import Self
+from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_task_id
+from nplinker.metabolomics.gnps.gnps_format import GNPSFormat
+from nplinker.utils import download_url
 
 
 class GNPSDownloader:
     GNPS_DATA_DOWNLOAD_URL = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra'
     GNPS_DATA_DOWNLOAD_URL_FBMN = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_cytoscape_data'
 
     def __init__(self, task_id: str, download_root: str | PathLike):
-        """Class to download GNPS output archive for the given task id.
+        """Download GNPS zip archive for the given task id.
+
+        Note that only GNPS workflows listed in the GNPSFormat enum are supported.
 
         Args:
             task_id(str): GNPS task id, identifying the data to be downloaded.
             download_root(str | PathLike): Path where to store the downloaded archive.
 
+        Raises:
+            ValueError: If the given task id does not correspond to a supported
+                GNPS workflow.
+
         Examples:
             >>> GNPSDownloader("c22f44b14a3d450eb836d607cb9521bb", "~/downloads")
-            """
+        """
+        gnps_format = gnps_format_from_task_id(task_id)
+        if gnps_format == GNPSFormat.Unknown:
+            raise ValueError(
+                f"Unknown workflow type for GNPS task '{task_id}'."
+                f"Supported GNPS workflows are described in the GNPSFormat enum, "
+                f"including such as 'METABOLOMICS-SNETS', 'METABOLOMICS-SNETS-V2' "
+                f"and 'FEATURE-BASED-MOLECULAR-NETWORKING'.")
+
         self._task_id = task_id
         self._download_root: Path = Path(download_root)
+        self._gnps_format = gnps_format
+        self._file_name = gnps_format.value + "-" + self._task_id + ".zip"
+
+    @property
+    def gnps_format(self) -> GNPSFormat:
+        """Get the GNPS workflow type.
+
+        Returns:
+            GNPSFormat: GNPS workflow type.
+        """
+        return self._gnps_format
 
 
     def download(self) -> Self:
-        """Execute the downloading process. """
-        with open(self.get_download_path(), 'wb') as f:
-            with httpx.stream('POST', self.get_url()) as r:
-                for data in r.iter_bytes():
-                    f.write(data)
+        """Execute the downloading process.
+
+        Note: GNPS data is downloaded using the POST method (empty payload is OK).
+        """
+        download_url(self.get_url(),
+                     self._download_root,
+                     filename=self._file_name,
+                     http_method="POST")
         return self
-   
-    def get_download_path(self) -> str:
-        """Get the path where to store the downloaded file.
+
+    def get_download_file(self) -> str:
+        """Get the path to the zip file.
 
         Returns:
             str: Download path as string
         """
-        return str(self._download_root.joinpath(self._task_id + ".zip"))
-    
+        return str(Path(self._download_root) / self._file_name)
+
     def get_task_id(self) -> str:
         """Get the GNPS task id.
 
         Returns:
             str: Task id as string.
         """
         return self._task_id
-    
+
     def get_url(self) -> str:
         """Get the full URL linking to GNPS data to be downloaded.
 
         Returns:
             str: URL pointing to the GNPS data to be downloaded.
         """
-
-        gnps_format = gnps_format_from_task_id(self._task_id)
-
-        if gnps_format == GNPSFormat.FBMN:
-            return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format(self._task_id)
-
+        if self.gnps_format == GNPSFormat.FBMN:
+            return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format(
+                self._task_id)
         return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL.format(self._task_id)
-
-
-