Skip to content

Commit

Permalink
Merge pull request #169 from NPLinker/refactor_gnps_classes
Browse files Browse the repository at this point in the history
Refactor gnps classes and functions
  • Loading branch information
CunliangGeng authored Aug 29, 2023
2 parents 80a8299 + 69b73ae commit 4d230e9
Show file tree
Hide file tree
Showing 47 changed files with 1,147 additions and 2,035 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ httpx
numpy
pandas
progress
pyteomics
pytest-lazy-fixture
requests
scipy
sortedcontainers
toml
Expand Down
2 changes: 1 addition & 1 deletion src/nplinker/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

logger = LogConfig.getLogger(__name__)

GNPS_URL_FORMAT = 'https://metabolomics-usi.ucsd.edu/{}/?usi=mzspec:GNPSLIBRARY:{}'
GNPS_URL_FORMAT = "https://metabolomics-usi.gnps2.org/{}/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:{}"
GNPS_INDEX_COLUMN = '#Scan#'
GNPS_DATA_COLUMNS = ['Compound_Name', 'Organism', 'MQScore', 'SpectrumID']

Expand Down
7 changes: 2 additions & 5 deletions src/nplinker/genomics/antismash/antismash_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from os import PathLike
from pathlib import Path
import shutil
from urllib.error import HTTPError
from nplinker.logconfig import LogConfig
from nplinker.utils import download_and_extract_archive
from nplinker.utils import list_dirs
Expand Down Expand Up @@ -74,12 +73,10 @@ def download_and_extract_antismash_data(antismash_id: str,
logger.info('antiSMASH BGC data of %s is downloaded and extracted.',
antismash_id)

except HTTPError as e:
except Exception as e:
shutil.rmtree(extract_path)
logger.warning(e)
raise HTTPError(e.url, e.code,
f"Could not find a valid url for {antismash_id}",
e.headers, e.fp) from e
raise e


def _check_roots(download_root: PathLike, extract_root: PathLike):
Expand Down
20 changes: 12 additions & 8 deletions src/nplinker/metabolomics/abc.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,36 @@
from abc import ABC, abstractmethod

from nplinker.metabolomics.molecular_family import MolecularFamily
from abc import ABC
from abc import abstractmethod
from collections.abc import Sequence
from nplinker.metabolomics.molecular_family import MolecularFamily
from nplinker.metabolomics.spectrum import Spectrum


class SpectrumLoaderBase(ABC):
    """Abstract interface for loaders that expose spectra."""

    @property
    @abstractmethod
    def spectra(self) -> Sequence[Spectrum]:
        """Sequence of ``Spectrum`` objects provided by the loader."""

class MolecularFamilyLoaderBase(ABC):
    """Abstract interface for loaders that expose molecular families."""

    @property
    @abstractmethod
    def families(self) -> Sequence[MolecularFamily]:
        """Sequence of ``MolecularFamily`` objects provided by the loader."""


class FileMappingLoaderBase(ABC):
    """Abstract interface for loaders that expose file mappings."""

    @property
    @abstractmethod
    def mappings(self) -> dict[str, list[str]]:
        """Mappings held by the loader.

        Returns:
            dict[str, list[str]]: String keys mapped to lists of strings. What
                the keys and values stand for is defined by the concrete
                loader.
        """


class AnnotationLoaderBase(ABC):
    """Abstract interface for loaders that expose annotations."""

    @property
    @abstractmethod
    def annotations(self) -> dict[str, dict]:
        """Annotations held by the loader.

        Returns:
            dict[str, dict]: String keys mapped to one annotation dict each.
                What the keys identify is defined by the concrete loader.
        """
130 changes: 110 additions & 20 deletions src/nplinker/metabolomics/gnps/gnps_annotation_loader.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,132 @@
import csv
from os import PathLike
from pathlib import Path
from typing import Any
from nplinker.metabolomics.abc import AnnotationLoaderBase
from nplinker.utils import is_file_format


GNPS_URL_FORMAT = 'https://metabolomics-usi.ucsd.edu/{}/?usi=mzspec:GNPSLIBRARY:{}'
GNPS_UNIVERSAL_SPECTRUM_IDENTIFIER_URL = "https://metabolomics-usi.gnps2.org/{}/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:{}"


class GNPSAnnotationLoader(AnnotationLoaderBase):

    def __init__(self, file: str | PathLike):
        """Load annotations from GNPS output file.

        The annotation file is a .tsv file from GNPS output archive, as described
        below for each GNPS workflow type:
        1. METABOLOMICS-SNETS
            - result_specnets_DB/*.tsv
        2. METABOLOMICS-SNETS-V2
            - result_specnets_DB/*.tsv
        3. FEATURE-BASED-MOLECULAR-NETWORKING
            - DB_result/*.tsv

        The file is validated first and then loaded, so a constructed loader
        always holds the annotations of a file that passed validation.

        Args:
            file(str | PathLike): The GNPS annotation file.

        Raises:
            ValueError: If the file fails validation (see `_validate`).

        Example:
            >>> loader = GNPSAnnotationLoader("gnps_annotations.tsv")
            >>> print(loader.annotations["100"])
            {'#Scan#': '100',
            'Adduct': 'M+H',
            'CAS_Number': 'N/A',
            'Charge': '1',
            'Compound_Name': 'MLS002153841-01!Iobenguane sulfate',
            'Compound_Source': 'NIH Pharmacologically Active Library',
            'Data_Collector': 'VP/LMS',
            'ExactMass': '274.992',
            'INCHI': 'N/A',
            'INCHI_AUX': 'N/A',
            'Instrument': 'qTof',
            'IonMode': 'Positive',
            'Ion_Source': 'LC-ESI',
            'LibMZ': '276.003',
            'LibraryName': 'lib-00014.mgf',
            'LibraryQualityString': 'Gold',
            'Library_Class': '1',
            'MQScore': '0.704152',
            'MZErrorPPM': '405416',
            'MassDiff': '111.896',
            'Organism': 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE',
            'PI': 'Dorrestein',
            'Precursor_MZ': '276.003',
            'Pubmed_ID': 'N/A',
            'RT_Query': '795.979',
            'SharedPeaks': '7',
            'Smiles': 'NC(=N)NCc1cccc(I)c1.OS(=O)(=O)O',
            'SpecCharge': '1',
            'SpecMZ': '164.107',
            'SpectrumFile': 'spectra/specs_ms.pklbin',
            'SpectrumID': 'CCMSLIB00000086167',
            'TIC_Query': '986.997',
            'UpdateWorkflowName': 'UPDATE-SINGLE-ANNOTATED-GOLD',
            'tags': ' ',
            'png_url': 'https://metabolomics-usi.gnps2.org/png/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',
            'json_url': 'https://metabolomics-usi.gnps2.org/json/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',
            'svg_url': 'https://metabolomics-usi.gnps2.org/svg/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167',
            'spectrum_url': 'https://metabolomics-usi.gnps2.org/spectrum/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167'}
        """
        self._file = Path(file)
        self._annotations: dict[str, dict] = {}

        self._validate()
        self._load()

    @property
    def annotations(self) -> dict[str, dict]:
        """Get annotations.

        Returns:
            dict[str, dict]: Keys are spectrum ids ("#Scan#" in annotation file)
                and values are the annotations dict for each spectrum.
        """
        return self._annotations

    def _validate(self) -> None:
        """Validate the annotation file.

        Three checks run in order: the file must pass
        `is_file_format(self._file, 'tsv')`, the header must contain every
        required column, and the "#Scan#" values must be unique.

        Raises:
            ValueError: Raises ValueError if the file is not valid.
        """
        # Local import keeps this method self-contained.
        from collections import Counter

        # validate file format
        if not is_file_format(self._file, 'tsv'):
            raise ValueError(f"Invalid GNPS annotation file '{self._file}'. "
                             f"Expected a .tsv file.")

        required_columns = [
            '#Scan#', 'Compound_Name', 'Organism', 'MQScore', 'SpectrumID'
        ]
        with open(self._file, mode='rt', encoding='utf-8') as f:
            # validate required columns against the header
            header = f.readline()
            # Parse the header into column names so the check is an exact
            # column match; a substring test on the raw line would accept
            # e.g. 'Organism_Name' in place of 'Organism'.
            columns = next(csv.reader([header], delimiter='\t'), [])
            for k in required_columns:
                if k not in columns:
                    raise ValueError(
                        f"Invalid GNPS annotation file '{self._file}'. "
                        f"Expected a header line with '{k}' column, "
                        f"but got '{header}'.")

            # validate that "#Scan#" must be unique; the header is already
            # consumed, so the remaining lines are the data rows
            reader = csv.DictReader(f, fieldnames=columns, delimiter='\t')
            scan_counts = Counter(row['#Scan#'] for row in reader)

        duplicates = {
            scan for scan, count in scan_counts.items() if count > 1
        }
        if duplicates:
            raise ValueError(
                f"Invalid GNPS annotation file '{self._file}'. "
                f"Expected unique '#Scan#', but found duplicates '{duplicates}'."
            )

    def _load(self) -> None:
        """Load the annotations from the file.

        Each row is stored in `self._annotations` under its "#Scan#" value,
        with four extra `<type>_url` entries built from its "SpectrumID".
        """
        with open(self._file, mode='rt', encoding='utf-8') as f:
            for row in csv.DictReader(f, delimiter='\t'):
                # insert useful URLs
                spectrum_id = row['SpectrumID']
                for t in ('png', 'json', 'svg', 'spectrum'):
                    row[f'{t}_url'] = (
                        GNPS_UNIVERSAL_SPECTRUM_IDENTIFIER_URL.format(
                            t, spectrum_id))
                self._annotations[row['#Scan#']] = row
73 changes: 47 additions & 26 deletions src/nplinker/metabolomics/gnps/gnps_downloader.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,88 @@
from os import PathLike
from pathlib import Path
import httpx

from .gnps_format import GNPSFormat
from .gnps_format import gnps_format_from_task_id
from typing_extensions import Self
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_task_id
from nplinker.metabolomics.gnps.gnps_format import GNPSFormat
from nplinker.utils import download_url


class GNPSDownloader:
    # URL templates; `{}` is filled with the GNPS task id (see `get_url`).
    GNPS_DATA_DOWNLOAD_URL = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra'
    GNPS_DATA_DOWNLOAD_URL_FBMN = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_cytoscape_data'

    def __init__(self, task_id: str, download_root: str | PathLike):
        """Download GNPS zip archive for the given task id.

        Note that only GNPS workflows listed in the GNPSFormat enum are supported.

        Args:
            task_id(str): GNPS task id, identifying the data to be downloaded.
            download_root(str | PathLike): Path where to store the downloaded
                archive.

        Raises:
            ValueError: If the given task id does not correspond to a supported
                GNPS workflow.

        Examples:
            >>> GNPSDownloader("c22f44b14a3d450eb836d607cb9521bb", "~/downloads")
        """
        gnps_format = gnps_format_from_task_id(task_id)
        if gnps_format == GNPSFormat.Unknown:
            raise ValueError(
                f"Unknown workflow type for GNPS task '{task_id}'. "
                f"Supported GNPS workflows are described in the GNPSFormat enum, "
                f"including such as 'METABOLOMICS-SNETS', 'METABOLOMICS-SNETS-V2' "
                f"and 'FEATURE-BASED-MOLECULAR-NETWORKING'.")

        self._task_id = task_id
        self._download_root: Path = Path(download_root)
        self._gnps_format = gnps_format
        # Archive name is "<workflow value>-<task id>.zip".
        self._file_name = gnps_format.value + "-" + self._task_id + ".zip"

    @property
    def gnps_format(self) -> GNPSFormat:
        """Get the GNPS workflow type.

        Returns:
            GNPSFormat: GNPS workflow type.
        """
        return self._gnps_format

    def download(self) -> Self:
        """Execute the downloading process.

        Note: GNPS data is downloaded using the POST method (empty payload is OK).

        Returns:
            Self: This downloader, so calls can be chained.
        """
        download_url(self.get_url(),
                     self._download_root,
                     filename=self._file_name,
                     http_method="POST")
        return self

    def get_download_file(self) -> str:
        """Get the path to the zip file.

        Returns:
            str: Download path as string
        """
        # `_download_root` is already a Path (set in __init__).
        return str(self._download_root / self._file_name)

    def get_task_id(self) -> str:
        """Get the GNPS task id.

        Returns:
            str: Task id as string.
        """
        return self._task_id

    def get_url(self) -> str:
        """Get the full URL linking to GNPS data to be downloaded.

        The FBMN workflow uses `GNPS_DATA_DOWNLOAD_URL_FBMN`; every other
        supported workflow uses `GNPS_DATA_DOWNLOAD_URL`.

        Returns:
            str: URL pointing to the GNPS data to be downloaded.
        """
        if self.gnps_format == GNPSFormat.FBMN:
            return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format(
                self._task_id)
        return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL.format(self._task_id)



Loading

0 comments on commit 4d230e9

Please sign in to comment.