Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle unavailable GNPS service #159

Merged
merged 4 commits into from
Jul 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions src/nplinker/metabolomics/gnps/gnps_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,31 +63,34 @@ def gnps_format_from_file_mapping(file: str | PathLike, has_quant_table: bool) -


def gnps_format_from_task_id(task_id: str) -> GNPSFormat:
"""Detect the GNPS format for the given task id
"""Detect the GNPS format for the given task id.

The HTTP request has a timeout of 5 seconds. If the request fails,
a ReadTimeout exception is raised. This prevents the program
from hanging indefinitely when the GNPS server is down.

Args:
task_id(str): GNPS task id.

Returns:
GNPSFormat: the format used in the GNPS workflow invocation.

Examples:
Examples:
>>> gnps_format_from_task_id("92036537c21b44c29e509291e53f6382")
"""
task_html = requests.get(GNPS_TASK_URL.format(task_id))
soup = BeautifulSoup(task_html.text)
task_html = requests.get(GNPS_TASK_URL.format(task_id), timeout=5)
soup = BeautifulSoup(task_html.text, features="html.parser")
tags = soup.find_all('th')
workflow_tag: Tag = list(filter(lambda x: x.contents == ['Workflow'], tags))[0]
workflow_format_tag: Tag = workflow_tag.parent.contents[3]
workflow_format = workflow_format_tag.contents[0].strip()

if workflow_format == "FEATURE-BASED-MOLECULAR-NETWORKING":
return GNPSFormat.FBMN
elif workflow_format == "METABOLOMICS-SNETS":
if workflow_format == "METABOLOMICS-SNETS":
return GNPSFormat.AllFiles
else:
return GNPSFormat.Unknown

return GNPSFormat.Unknown


def gnps_format_from_archive(archive: zipfile.ZipFile) -> GNPSFormat:
"""Detect GNPS format from a downloaded archive.
Expand All @@ -106,4 +109,4 @@ def gnps_format_from_archive(archive: zipfile.ZipFile) -> GNPSFormat:
return GNPSFormat.FBMN
elif any(["METABOLOMICS-SNETS" in x for x in filenames]):
return GNPSFormat.AllFiles
return GNPSFormat.Unknown
return GNPSFormat.Unknown
86 changes: 1 addition & 85 deletions src/nplinker/pairedomics/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@
import os
import shutil
import sys
import zipfile
from deprecated import deprecated
import httpx
from progress.spinner import Spinner
from nplinker.genomics.mibig import download_and_extract_mibig_metadata
from nplinker.logconfig import LogConfig
from nplinker.metabolomics.gnps.gnps_downloader import GNPSDownloader
Expand All @@ -15,6 +12,7 @@
from . import podp_download_and_extract_antismash_data
from .runbigscape import podp_run_bigscape


logger = LogConfig.getLogger(__name__)

PAIREDOMICS_PROJECT_DATA_ENDPOINT = 'https://pairedomicsdata.bioinformatics.nl/api/projects'
Expand Down Expand Up @@ -250,66 +248,6 @@ def _download_metabolomics_zipfile(self, gnps_task_id):
self.project_download_cache).download().get_download_path()
GNPSExtractor(archive, self.project_file_cache).extract()

@deprecated
def _extract_metabolomics_data(self, mbzip):
logger.info('Extracting files to %s', self.project_file_cache)
# extract the contents to the file cache folder. only want some of the files
# so pick them out and only extract those:
# - root/spectra/*.mgf
# - root/clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv
# - root/networkedges_selfloop/*.pairsinfo
# - root/quantification_table*
# - root/metadata_table*
# - root/DB_result*

prefixes = [
'clusterinfosummarygroup_attributes_withIDs_withcomponentID',
'networkedges_selfloop', 'quantification_table', 'metadata_table',
'DB_result', 'result_specnets_DB'
]

for member in mbzip.namelist():
if any(member.startswith(prefix) for prefix in prefixes):
mbzip.extract(member, path=self.project_file_cache)
# move the MGF file to a /spectra subdirectory to better fit expected structure
elif member.endswith('.mgf'):
os.makedirs(os.path.join(self.project_file_cache, 'spectra'),
exist_ok=True)
mbzip.extract(member,
path=os.path.join(self.project_file_cache,
'spectra'))

@deprecated
def _log_gnps_format(self):
if self._is_new_gnps_format(self.project_file_cache):
logger.info('Found NEW GNPS structure')
else:
logger.info('Found OLD GNPS structure')

@deprecated
def _load_gnps_data(self, gnps_task_id) -> zipfile.ZipFile:

self.metabolomics_zip = os.path.join(self.project_download_cache,
'metabolomics_data.zip')

# Try read from cache
if os.path.exists(self.metabolomics_zip):
logger.info('Found existing metabolomics_zip at %s',
self.metabolomics_zip)
try:
mbzip = zipfile.ZipFile(self.metabolomics_zip) # pylint: disable=consider-using-with
return mbzip
except zipfile.BadZipFile:
logger.info(
'Invalid metabolomics zipfile found, will download again!')
os.unlink(self.metabolomics_zip)
url = _generate_gnps_download_url(gnps_task_id)
_execute_download(url, self.metabolomics_zip)

# this should throw an exception if zip is malformed etc
mbzip = zipfile.ZipFile(self.metabolomics_zip) # pylint: disable=consider-using-with
return mbzip

def _download_and_load_json(self, url, local_path):
resp = httpx.get(url, follow_redirects=True)
if not resp.status_code == 200:
Expand All @@ -323,25 +261,3 @@ def _download_and_load_json(self, url, local_path):
logger.debug('Downloaded %s to %s', url, local_path)

return content


@deprecated
def _generate_gnps_download_url(gnps_task_id):
url = GNPS_DATA_DOWNLOAD_URL.format(gnps_task_id)
return url


@deprecated
def _execute_download(url, metabolomics_zip):
logger.info('Downloading metabolomics data from %s', url)
with open(metabolomics_zip, 'wb') as f:
# note that this requires a POST, not a GET
total_bytes = 0
spinner = Spinner('Downloading metabolomics data... ')
with httpx.stream('POST', url) as r:
for data in r.iter_bytes():
f.write(data)
total_bytes += len(data)
spinner.next()
spinner.finish()
logger.info('Downloaded metabolomics data!')
28 changes: 17 additions & 11 deletions tests/metabolomics/test_gnps_downloader.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import filecmp
from pathlib import Path
from tempfile import gettempdir
import zipfile
from pathlib import Path
from typing_extensions import Self

import pytest
from requests.exceptions import ReadTimeout
from typing_extensions import Self
from nplinker.metabolomics.gnps.gnps_downloader import GNPSDownloader
from .. import DATA_DIR

Expand Down Expand Up @@ -33,8 +33,11 @@ def test_has_gnps_task_id():


def test_has_url():
sut = GNPSDownloaderBuilder().with_task_id("c22f44b14a3d450eb836d607cb9521bb").build()
assert sut.get_url() == 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=c22f44b14a3d450eb836d607cb9521bb&view=download_clustered_spectra'
try:
sut = GNPSDownloaderBuilder().with_task_id("c22f44b14a3d450eb836d607cb9521bb").build()
assert sut.get_url() == 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=c22f44b14a3d450eb836d607cb9521bb&view=download_clustered_spectra'
except ReadTimeout:
pytest.skip("GNPS is down")


@pytest.mark.parametrize("task_id, filename_expected", [
Expand All @@ -44,11 +47,14 @@ def test_has_url():
def test_downloads_file(tmp_path: Path, task_id, filename_expected):
outpath = tmp_path.joinpath(task_id + ".zip")
sut = GNPSDownloader(task_id, tmp_path)
sut.download()
actual = zipfile.ZipFile(outpath)
try:
sut.download()
actual = zipfile.ZipFile(outpath)

expected = zipfile.ZipFile(DATA_DIR / filename_expected)
expected = zipfile.ZipFile(DATA_DIR / filename_expected)

actual_names = actual.namelist()
expected_names = [x.filename for x in expected.filelist if x.compress_size > 0]
assert all(item in actual_names for item in expected_names)
actual_names = actual.namelist()
expected_names = [x.filename for x in expected.filelist if x.compress_size > 0]
assert all(item in actual_names for item in expected_names)
except ReadTimeout:
pytest.skip("GNPS is down")
13 changes: 8 additions & 5 deletions tests/metabolomics/test_gnps_format.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import zipfile
import pytest

from nplinker.metabolomics.gnps.gnps_format import GNPSFormat
from requests.exceptions import ReadTimeout
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_archive
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_task_id
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_archive
from nplinker.metabolomics.gnps.gnps_format import GNPSFormat
from .. import DATA_DIR


Expand All @@ -24,8 +24,11 @@ def test_identify_gnps_format(filename, expected):
["c22f44b14a3d450eb836d607cb9521bb", GNPSFormat.AllFiles]
])
def test_gnps_format_from_task_id(task_id: str, expected: GNPSFormat):
actual = gnps_format_from_task_id(task_id)
assert actual is expected
try:
actual = gnps_format_from_task_id(task_id)
assert actual is expected
except ReadTimeout:
pytest.skip("GNPS is down")

@pytest.mark.parametrize("archive_path, expected", [
["ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-92036537-download_cytoscape_data.zip", GNPSFormat.FBMN],
Expand Down
86 changes: 21 additions & 65 deletions tests/pairedomics/test_downloader.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,12 @@
import filecmp
import os
from pathlib import Path
import zipfile
import numpy
import pytest
from pytest_lazyfixture import lazy_fixture
from nplinker import utils
from nplinker.pairedomics.downloader import _execute_download
from nplinker.pairedomics.downloader import _generate_gnps_download_url
from requests.exceptions import ReadTimeout
from nplinker.pairedomics.downloader import PODPDownloader
from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME
from .. import DATA_DIR


@pytest.fixture
def gnps_url():
return _generate_gnps_download_url("c22f44b14a3d450eb836d607cb9521bb")

@pytest.mark.parametrize("expected", [
Path(os.getenv('HOME'), 'nplinker_data', 'pairedomics'),
lazy_fixture('tmp_path')
Expand All @@ -43,61 +33,27 @@ def test_default(expected: Path):

def test_download_metabolomics_zipfile(tmp_path):
sut = PODPDownloader("MSV000079284", local_cache=tmp_path)
sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb")
expected_path = os.path.join(sut.project_download_cache, 'metabolomics_data.zip')

assert os.path.exists(expected_path)
assert (Path(sut.project_file_cache) / "networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo").is_file()
assert (Path(sut.project_file_cache) / "clusterinfosummarygroup_attributes_withIDs_withcomponentID/d69356c8e5044c2a9fef3dd2a2f991e1.tsv").is_file()
assert (Path(sut.project_file_cache) / "spectra/METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf").is_file()


def test_download_metabolomics_zipfile(tmp_path):
sut = PODPDownloader("MSV000079284", local_cache=tmp_path)
sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb")
expected_path = os.path.join(sut.project_download_cache, 'c22f44b14a3d450eb836d607cb9521bb.zip')

assert os.path.exists(expected_path)
assert (Path(sut.project_file_cache) / "molecular_families.pairsinfo").is_file()
assert (Path(sut.project_file_cache) / "file_mappings.tsv").is_file()
assert (Path(sut.project_file_cache) / "spectra.mgf").is_file()

try:
sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb")
expected_path = os.path.join(sut.project_download_cache, 'metabolomics_data.zip')

def test_generate_gnps_download_url():
gnps_task_id = "c22f44b14a3d450eb836d607cb9521bb"
expected = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=c22f44b14a3d450eb836d607cb9521bb&view=download_clustered_spectra'
actual = _generate_gnps_download_url(gnps_task_id)
assert actual == expected
assert os.path.exists(expected_path)
assert (Path(sut.project_file_cache) / "networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo").is_file()
assert (Path(sut.project_file_cache) / "clusterinfosummarygroup_attributes_withIDs_withcomponentID/d69356c8e5044c2a9fef3dd2a2f991e1.tsv").is_file()
assert (Path(sut.project_file_cache) / "spectra/METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf").is_file()
except ReadTimeout:
pytest.skip("GNPS is down")


def test_execute_download(gnps_url: str, tmp_path: Path):
outpath = tmp_path / 'metabolomics_data.zip'
_execute_download(gnps_url, outpath)
assert os.path.exists(outpath)


def test_download_gnps_data(tmp_path):
gnps_task_id = "c22f44b14a3d450eb836d607cb9521bb"
sut = PODPDownloader("MSV000079284", local_cache=tmp_path / 'actual')
actual = sut._load_gnps_data(gnps_task_id)

expected = zipfile.ZipFile(DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip")

actual.extract("networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo", tmp_path / "actual")
expected.extract("networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo", tmp_path / "expected")

assert filecmp.cmp(
tmp_path / "actual/networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo",
tmp_path / "expected/networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo",
shallow=False
)


def test_extract_metabolomics_data(tmp_path):
def test_download_metabolomics_zipfile_scenario2(tmp_path):
sut = PODPDownloader("MSV000079284", local_cache=tmp_path)
archive = zipfile.ZipFile(DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip")
sut._extract_metabolomics_data(archive)

assert (Path(sut.project_file_cache) / "networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo").is_file()
assert (Path(sut.project_file_cache) / "clusterinfosummarygroup_attributes_withIDs_withcomponentID/d69356c8e5044c2a9fef3dd2a2f991e1.tsv").is_file()
assert (Path(sut.project_file_cache) / "spectra/METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf").is_file()
try:
sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb")
expected_path = os.path.join(sut.project_download_cache, 'c22f44b14a3d450eb836d607cb9521bb.zip')

assert os.path.exists(expected_path)
assert (Path(sut.project_file_cache) / "molecular_families.pairsinfo").is_file()
assert (Path(sut.project_file_cache) / "file_mappings.tsv").is_file()
assert (Path(sut.project_file_cache) / "spectra.mgf").is_file()
except ReadTimeout:
pytest.skip("GNPS is down")
Loading