-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #169 from NPLinker/refactor_gnps_classes
Refactor gnps classes and functions
- Loading branch information
Showing 47 changed files with 1,147 additions and 2,035 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,8 +5,8 @@ httpx | |
numpy | ||
pandas | ||
progress | ||
pyteomics | ||
pytest-lazy-fixture | ||
requests | ||
scipy | ||
sortedcontainers | ||
toml | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,36 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
from nplinker.metabolomics.molecular_family import MolecularFamily | ||
from abc import ABC | ||
from abc import abstractmethod | ||
from collections.abc import Sequence | ||
from nplinker.metabolomics.molecular_family import MolecularFamily | ||
from nplinker.metabolomics.spectrum import Spectrum | ||
|
||
|
||
class SpectrumLoaderBase(ABC): | ||
|
||
|
||
@property | ||
@abstractmethod | ||
def spectra(self) -> Sequence[Spectrum]: | ||
... | ||
|
||
class MolecularFamilyLoaderBase(ABC): | ||
|
||
|
||
@property | ||
@abstractmethod | ||
def families(self) -> Sequence[MolecularFamily]: | ||
... | ||
|
||
|
||
class FileMappingLoaderBase(ABC): | ||
|
||
@property | ||
@abstractmethod | ||
def mapping(self) -> dict[int, list[str]]: | ||
def mappings(self) -> dict[str, list[str]]: | ||
... | ||
|
||
|
||
class AnnotationLoaderBase(ABC): | ||
|
||
@property | ||
@abstractmethod | ||
def get_annotations(self) -> dict[int, dict]: | ||
... | ||
def annotations(self) -> dict[str, dict]: | ||
... |
130 changes: 110 additions & 20 deletions
130
src/nplinker/metabolomics/gnps/gnps_annotation_loader.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1,132 @@ | ||
import csv | ||
from os import PathLike | ||
from pathlib import Path | ||
from typing import Any | ||
from nplinker.metabolomics.abc import AnnotationLoaderBase | ||
from nplinker.utils import is_file_format | ||
|
||
|
||
GNPS_URL_FORMAT = 'https://metabolomics-usi.ucsd.edu/{}/?usi=mzspec:GNPSLIBRARY:{}' | ||
GNPS_UNIVERSAL_SPECTRUM_IDENTIFIER_URL = "https://metabolomics-usi.gnps2.org/{}/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:{}" | ||
|
||
|
||
class GNPSAnnotationLoader(AnnotationLoaderBase): | ||
|
||
def __init__(self, file: str | PathLike): | ||
"""Load annotations from GNPS output file. | ||
The annotation file is a .tsv file from GNPS output archive, as described | ||
below for each GNPS workflow type: | ||
1. METABOLOMICS-SNETS | ||
- result_specnets_DB/*.tsv | ||
2. METABOLOMICS-SNETS-V2 | ||
- result_specnets_DB/*.tsv | ||
3. FEATURE-BASED-MOLECULAR-NETWORKING | ||
- DB_result/*.tsv | ||
Args: | ||
file(str | PathLike): The GNPS annotation file. | ||
Example: | ||
>>> loader = GNPSAnnotationLoader("gnps_annotations.tsv") | ||
>>> print(loader.annotations["100"]) | ||
{'#Scan#': '100', | ||
'Adduct': 'M+H', | ||
'CAS_Number': 'N/A', | ||
'Charge': '1', | ||
'Compound_Name': 'MLS002153841-01!Iobenguane sulfate', | ||
'Compound_Source': 'NIH Pharmacologically Active Library', | ||
'Data_Collector': 'VP/LMS', | ||
'ExactMass': '274.992', | ||
'INCHI': 'N/A', | ||
'INCHI_AUX': 'N/A', | ||
'Instrument': 'qTof', | ||
'IonMode': 'Positive', | ||
'Ion_Source': 'LC-ESI', | ||
'LibMZ': '276.003', | ||
'LibraryName': 'lib-00014.mgf', | ||
'LibraryQualityString': 'Gold', | ||
'Library_Class': '1', | ||
'MQScore': '0.704152', | ||
'MZErrorPPM': '405416', | ||
'MassDiff': '111.896', | ||
'Organism': 'GNPS-NIH-SMALLMOLECULEPHARMACOLOGICALLYACTIVE', | ||
'PI': 'Dorrestein', | ||
'Precursor_MZ': '276.003', | ||
'Pubmed_ID': 'N/A', | ||
'RT_Query': '795.979', | ||
'SharedPeaks': '7', | ||
'Smiles': 'NC(=N)NCc1cccc(I)c1.OS(=O)(=O)O', | ||
'SpecCharge': '1', | ||
'SpecMZ': '164.107', | ||
'SpectrumFile': 'spectra/specs_ms.pklbin', | ||
'SpectrumID': 'CCMSLIB00000086167', | ||
'TIC_Query': '986.997', | ||
'UpdateWorkflowName': 'UPDATE-SINGLE-ANNOTATED-GOLD', | ||
'tags': ' ', | ||
'png_url': 'https://metabolomics-usi.gnps2.org/png/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167', | ||
'json_url': 'https://metabolomics-usi.gnps2.org/json/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167', | ||
'svg_url': 'https://metabolomics-usi.gnps2.org/svg/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167', | ||
'spectrum_url': 'https://metabolomics-usi.gnps2.org/spectrum/?usi1=mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00000086167'} | ||
""" | ||
self._file = Path(file) | ||
self._annotations : dict[str, dict] = {} | ||
self._annotations: dict[str, dict] = {} | ||
|
||
with open(self._file, mode='rt', encoding='UTF-8') as f: | ||
header = f.readline().split('\t') | ||
dict_reader = csv.DictReader(f, header, delimiter='\t') | ||
for row in dict_reader: | ||
scan_id = row.pop('#Scan#') | ||
self._annotations[scan_id] = row | ||
self._validate() | ||
self._load() | ||
|
||
# also insert useful URLs | ||
for t in ['png', 'json', 'svg', 'spectrum']: | ||
self._annotations[scan_id][f'{t}_url'] = GNPS_URL_FORMAT.format(t, row['SpectrumID']) | ||
@property | ||
def annotations(self) -> dict[str, dict]: | ||
"""Get annotations. | ||
Returns: | ||
dict[str, dict]: Keys are spectrum ids ("#Scan#" in annotation file) | ||
and values are the annotations dict for each spectrum. | ||
""" | ||
return self._annotations | ||
|
||
def _validate(self) -> None: | ||
"""Validate the annotation file. | ||
def get_annotations(self) -> dict[str, dict]: | ||
"""Get annotations. | ||
Raises: | ||
ValueError: Raises ValueError if the file is not valid. | ||
""" | ||
# validate file format | ||
if not is_file_format(self._file, 'tsv'): | ||
raise ValueError(f"Invalid GNPS annotation file '{self._file}'. " | ||
f"Expected a .tsv file.") | ||
|
||
Returns: | ||
dict[str, dict]: Spectra indices are keys and values are the annotations for this spectrum. | ||
# validate required columns against the header | ||
required_columns = [ | ||
'#Scan#', 'Compound_Name', 'Organism', 'MQScore', 'SpectrumID' | ||
] | ||
with open(self._file, mode='rt') as f: | ||
header = f.readline() | ||
for k in required_columns: | ||
if k not in header: | ||
raise ValueError( | ||
f"Invalid GNPS annotation file '{self._file}'. " | ||
f"Expected a header line with '{k}' column, " | ||
f"but got '{header}'.") | ||
|
||
Examples: | ||
>>> print(loader.annotations()[100]) | ||
""" | ||
return self._annotations | ||
# validate that "#Scan#" must be unique | ||
with open(self._file, mode='rt') as f: | ||
reader = csv.DictReader(f, delimiter='\t') | ||
scans = [row["#Scan#"] for row in reader] | ||
duplicates = {x for x in scans if scans.count(x) > 1} | ||
if len(duplicates) > 0: | ||
raise ValueError( | ||
f"Invalid GNPS annotation file '{self._file}'. " | ||
f"Expected unique '#Scan#', but found duplicates '{duplicates}'." | ||
) | ||
|
||
def _load(self) -> None: | ||
"""Load the annotations from the file.""" | ||
with open(self._file, mode='rt') as f: | ||
dict_reader = csv.DictReader(f, delimiter='\t') | ||
for row in dict_reader: | ||
scan_id = row['#Scan#'] | ||
self._annotations[scan_id] = row | ||
# insert useful URLs | ||
for t in ['png', 'json', 'svg', 'spectrum']: | ||
self._annotations[scan_id][ | ||
f'{t}_url'] = GNPS_UNIVERSAL_SPECTRUM_IDENTIFIER_URL.format( | ||
t, row['SpectrumID']) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,67 +1,88 @@ | ||
from os import PathLike | ||
from pathlib import Path | ||
import httpx | ||
|
||
from .gnps_format import GNPSFormat | ||
from .gnps_format import gnps_format_from_task_id | ||
from typing_extensions import Self | ||
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_task_id | ||
from nplinker.metabolomics.gnps.gnps_format import GNPSFormat | ||
from nplinker.utils import download_url | ||
|
||
|
||
class GNPSDownloader: | ||
GNPS_DATA_DOWNLOAD_URL = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra' | ||
GNPS_DATA_DOWNLOAD_URL_FBMN = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_cytoscape_data' | ||
|
||
def __init__(self, task_id: str, download_root: str | PathLike): | ||
"""Class to download GNPS output archive for the given task id. | ||
"""Download GNPS zip archive for the given task id. | ||
Note that only GNPS workflows listed in the GNPSFormat enum are supported. | ||
Args: | ||
task_id(str): GNPS task id, identifying the data to be downloaded. | ||
download_root(str | PathLike): Path where to store the downloaded archive. | ||
Raises: | ||
ValueError: If the given task id does not correspond to a supported | ||
GNPS workflow. | ||
Examples: | ||
>>> GNPSDownloader("c22f44b14a3d450eb836d607cb9521bb", "~/downloads") | ||
""" | ||
""" | ||
gnps_format = gnps_format_from_task_id(task_id) | ||
if gnps_format == GNPSFormat.Unknown: | ||
raise ValueError( | ||
f"Unknown workflow type for GNPS task '{task_id}'." | ||
f"Supported GNPS workflows are described in the GNPSFormat enum, " | ||
f"including such as 'METABOLOMICS-SNETS', 'METABOLOMICS-SNETS-V2' " | ||
f"and 'FEATURE-BASED-MOLECULAR-NETWORKING'.") | ||
|
||
self._task_id = task_id | ||
self._download_root: Path = Path(download_root) | ||
self._gnps_format = gnps_format | ||
self._file_name = gnps_format.value + "-" + self._task_id + ".zip" | ||
|
||
@property | ||
def gnps_format(self) -> GNPSFormat: | ||
"""Get the GNPS workflow type. | ||
Returns: | ||
GNPSFormat: GNPS workflow type. | ||
""" | ||
return self._gnps_format | ||
|
||
|
||
def download(self) -> Self: | ||
"""Execute the downloading process. """ | ||
with open(self.get_download_path(), 'wb') as f: | ||
with httpx.stream('POST', self.get_url()) as r: | ||
for data in r.iter_bytes(): | ||
f.write(data) | ||
"""Execute the downloading process. | ||
Note: GNPS data is downloaded using the POST method (empty payload is OK). | ||
""" | ||
download_url(self.get_url(), | ||
self._download_root, | ||
filename=self._file_name, | ||
http_method="POST") | ||
return self | ||
def get_download_path(self) -> str: | ||
"""Get the path where to store the downloaded file. | ||
|
||
def get_download_file(self) -> str: | ||
"""Get the path to the zip file. | ||
Returns: | ||
str: Download path as string | ||
""" | ||
return str(self._download_root.joinpath(self._task_id + ".zip")) | ||
return str(Path(self._download_root) / self._file_name) | ||
|
||
def get_task_id(self) -> str: | ||
"""Get the GNPS task id. | ||
Returns: | ||
str: Task id as string. | ||
""" | ||
return self._task_id | ||
|
||
def get_url(self) -> str: | ||
"""Get the full URL linking to GNPS data to be downloaded. | ||
Returns: | ||
str: URL pointing to the GNPS data to be downloaded. | ||
""" | ||
|
||
gnps_format = gnps_format_from_task_id(self._task_id) | ||
|
||
if gnps_format == GNPSFormat.FBMN: | ||
return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format(self._task_id) | ||
|
||
if self.gnps_format == GNPSFormat.FBMN: | ||
return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL_FBMN.format( | ||
self._task_id) | ||
return GNPSDownloader.GNPS_DATA_DOWNLOAD_URL.format(self._task_id) | ||
|
||
|
||
|
Oops, something went wrong.