Skip to content

Commit

Permalink
Merge pull request #159 from NPLinker/add_readtimeout_for_down_gnps
Browse files Browse the repository at this point in the history
Handle unavailable GNPS service
  • Loading branch information
CunliangGeng authored Jul 11, 2023
2 parents bea0af6 + f43e662 commit aad7a52
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 175 deletions.
21 changes: 12 additions & 9 deletions src/nplinker/metabolomics/gnps/gnps_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,31 +63,34 @@ def gnps_format_from_file_mapping(file: str | PathLike, has_quant_table: bool) -


def gnps_format_from_task_id(task_id: str) -> GNPSFormat:
    """Detect the GNPS format for the given task id.

    The workflow name is read from the "Workflow" field of the task's
    status page on the GNPS server.

    The HTTP request has a timeout of 5 seconds. If the request times out,
    a ReadTimeout exception is raised. This is to prevent the program
    from hanging indefinitely when the GNPS server is down.

    Args:
        task_id(str): GNPS task id.

    Returns:
        GNPSFormat: the format used in the GNPS workflow invocation;
            GNPSFormat.Unknown if the workflow name is not recognised.

    Raises:
        requests.exceptions.ReadTimeout: if the GNPS server does not
            respond within 5 seconds.

    Examples:
        >>> gnps_format_from_task_id("92036537c21b44c29e509291e53f6382")
    """
    task_html = requests.get(GNPS_TASK_URL.format(task_id), timeout=5)
    # An explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing behaviour identical across environments.
    soup = BeautifulSoup(task_html.text, features="html.parser")
    tags = soup.find_all('th')
    # Locate the <th> labelled "Workflow" and read the value cell that
    # follows it in the same table row.
    workflow_tag: Tag = list(filter(lambda x: x.contents == ['Workflow'], tags))[0]
    workflow_format_tag: Tag = workflow_tag.parent.contents[3]
    workflow_format = workflow_format_tag.contents[0].strip()

    if workflow_format == "FEATURE-BASED-MOLECULAR-NETWORKING":
        return GNPSFormat.FBMN
    if workflow_format == "METABOLOMICS-SNETS":
        return GNPSFormat.AllFiles
    return GNPSFormat.Unknown


def gnps_format_from_archive(archive: zipfile.ZipFile) -> GNPSFormat:
"""Detect GNPS format from a downloaded archive.
Expand All @@ -106,4 +109,4 @@ def gnps_format_from_archive(archive: zipfile.ZipFile) -> GNPSFormat:
return GNPSFormat.FBMN
elif any(["METABOLOMICS-SNETS" in x for x in filenames]):
return GNPSFormat.AllFiles
return GNPSFormat.Unknown
return GNPSFormat.Unknown
86 changes: 1 addition & 85 deletions src/nplinker/pairedomics/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@
import os
import shutil
import sys
import zipfile
from deprecated import deprecated
import httpx
from progress.spinner import Spinner
from nplinker.genomics.mibig import download_and_extract_mibig_metadata
from nplinker.logconfig import LogConfig
from nplinker.metabolomics.gnps.gnps_downloader import GNPSDownloader
Expand All @@ -15,6 +12,7 @@
from . import podp_download_and_extract_antismash_data
from .runbigscape import podp_run_bigscape


logger = LogConfig.getLogger(__name__)

PAIREDOMICS_PROJECT_DATA_ENDPOINT = 'https://pairedomicsdata.bioinformatics.nl/api/projects'
Expand Down Expand Up @@ -250,66 +248,6 @@ def _download_metabolomics_zipfile(self, gnps_task_id):
self.project_download_cache).download().get_download_path()
GNPSExtractor(archive, self.project_file_cache).extract()

@deprecated
def _extract_metabolomics_data(self, mbzip):
logger.info('Extracting files to %s', self.project_file_cache)
# extract the contents to the file cache folder. only want some of the files
# so pick them out and only extract those:
# - root/spectra/*.mgf
# - root/clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv
# - root/networkedges_selfloop/*.pairsinfo
# - root/quantification_table*
# - root/metadata_table*
# - root/DB_result*

prefixes = [
'clusterinfosummarygroup_attributes_withIDs_withcomponentID',
'networkedges_selfloop', 'quantification_table', 'metadata_table',
'DB_result', 'result_specnets_DB'
]

for member in mbzip.namelist():
if any(member.startswith(prefix) for prefix in prefixes):
mbzip.extract(member, path=self.project_file_cache)
# move the MGF file to a /spectra subdirectory to better fit expected structure
elif member.endswith('.mgf'):
os.makedirs(os.path.join(self.project_file_cache, 'spectra'),
exist_ok=True)
mbzip.extract(member,
path=os.path.join(self.project_file_cache,
'spectra'))

@deprecated
def _log_gnps_format(self):
if self._is_new_gnps_format(self.project_file_cache):
logger.info('Found NEW GNPS structure')
else:
logger.info('Found OLD GNPS structure')

@deprecated
def _load_gnps_data(self, gnps_task_id) -> zipfile.ZipFile:

self.metabolomics_zip = os.path.join(self.project_download_cache,
'metabolomics_data.zip')

# Try read from cache
if os.path.exists(self.metabolomics_zip):
logger.info('Found existing metabolomics_zip at %s',
self.metabolomics_zip)
try:
mbzip = zipfile.ZipFile(self.metabolomics_zip) # pylint: disable=consider-using-with
return mbzip
except zipfile.BadZipFile:
logger.info(
'Invalid metabolomics zipfile found, will download again!')
os.unlink(self.metabolomics_zip)
url = _generate_gnps_download_url(gnps_task_id)
_execute_download(url, self.metabolomics_zip)

# this should throw an exception if zip is malformed etc
mbzip = zipfile.ZipFile(self.metabolomics_zip) # pylint: disable=consider-using-with
return mbzip

def _download_and_load_json(self, url, local_path):
resp = httpx.get(url, follow_redirects=True)
if not resp.status_code == 200:
Expand All @@ -323,25 +261,3 @@ def _download_and_load_json(self, url, local_path):
logger.debug('Downloaded %s to %s', url, local_path)

return content


@deprecated
def _generate_gnps_download_url(gnps_task_id):
url = GNPS_DATA_DOWNLOAD_URL.format(gnps_task_id)
return url


@deprecated
def _execute_download(url, metabolomics_zip):
logger.info('Downloading metabolomics data from %s', url)
with open(metabolomics_zip, 'wb') as f:
# note that this requires a POST, not a GET
total_bytes = 0
spinner = Spinner('Downloading metabolomics data... ')
with httpx.stream('POST', url) as r:
for data in r.iter_bytes():
f.write(data)
total_bytes += len(data)
spinner.next()
spinner.finish()
logger.info('Downloaded metabolomics data!')
28 changes: 17 additions & 11 deletions tests/metabolomics/test_gnps_downloader.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import filecmp
from pathlib import Path
from tempfile import gettempdir
import zipfile
from pathlib import Path
from typing_extensions import Self

import pytest
from requests.exceptions import ReadTimeout
from typing_extensions import Self
from nplinker.metabolomics.gnps.gnps_downloader import GNPSDownloader
from .. import DATA_DIR

Expand Down Expand Up @@ -33,8 +33,11 @@ def test_has_gnps_task_id():


def test_has_url():
    """Verify the download URL built for a known AllFiles-format task id."""
    try:
        sut = GNPSDownloaderBuilder().with_task_id("c22f44b14a3d450eb836d607cb9521bb").build()
        assert sut.get_url() == 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=c22f44b14a3d450eb836d607cb9521bb&view=download_clustered_spectra'
    except ReadTimeout:
        # Building the downloader can time out against the live GNPS
        # service; skip rather than fail when GNPS is unreachable.
        pytest.skip("GNPS is down")


@pytest.mark.parametrize("task_id, filename_expected", [
Expand All @@ -44,11 +47,14 @@ def test_has_url():
def test_downloads_file(tmp_path: Path, task_id, filename_expected):
    """Download a GNPS archive and check it contains the expected entries."""
    outpath = tmp_path.joinpath(task_id + ".zip")
    sut = GNPSDownloader(task_id, tmp_path)
    try:
        sut.download()
        actual = zipfile.ZipFile(outpath)

        expected = zipfile.ZipFile(DATA_DIR / filename_expected)

        actual_names = actual.namelist()
        # Entries with zero compressed size (e.g. directories) are excluded
        # from the expectation.
        expected_names = [x.filename for x in expected.filelist if x.compress_size > 0]
        assert all(item in actual_names for item in expected_names)
    except ReadTimeout:
        # The download hits the live GNPS service; skip when it is down.
        pytest.skip("GNPS is down")
13 changes: 8 additions & 5 deletions tests/metabolomics/test_gnps_format.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import zipfile
import pytest

from nplinker.metabolomics.gnps.gnps_format import GNPSFormat
from requests.exceptions import ReadTimeout
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_archive
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_task_id
from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_archive
from nplinker.metabolomics.gnps.gnps_format import GNPSFormat
from .. import DATA_DIR


Expand All @@ -24,8 +24,11 @@ def test_identify_gnps_format(filename, expected):
["c22f44b14a3d450eb836d607cb9521bb", GNPSFormat.AllFiles]
])
def test_gnps_format_from_task_id(task_id: str, expected: GNPSFormat):
    """Check that the GNPS format is detected correctly from a task id."""
    try:
        actual = gnps_format_from_task_id(task_id)
        assert actual is expected
    except ReadTimeout:
        # Format detection queries the live GNPS service; skip when it is
        # unreachable instead of failing the suite.
        pytest.skip("GNPS is down")

@pytest.mark.parametrize("archive_path, expected", [
["ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-92036537-download_cytoscape_data.zip", GNPSFormat.FBMN],
Expand Down
86 changes: 21 additions & 65 deletions tests/pairedomics/test_downloader.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,12 @@
import filecmp
import os
from pathlib import Path
import zipfile
import numpy
import pytest
from pytest_lazyfixture import lazy_fixture
from nplinker import utils
from nplinker.pairedomics.downloader import _execute_download
from nplinker.pairedomics.downloader import _generate_gnps_download_url
from requests.exceptions import ReadTimeout
from nplinker.pairedomics.downloader import PODPDownloader
from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME
from .. import DATA_DIR


@pytest.fixture
def gnps_url():
return _generate_gnps_download_url("c22f44b14a3d450eb836d607cb9521bb")

@pytest.mark.parametrize("expected", [
Path(os.getenv('HOME'), 'nplinker_data', 'pairedomics'),
lazy_fixture('tmp_path')
Expand All @@ -43,61 +33,27 @@ def test_default(expected: Path):

def test_download_metabolomics_zipfile(tmp_path):
    """Download metabolomics data for a project and verify extracted files."""
    sut = PODPDownloader("MSV000079284", local_cache=tmp_path)
    try:
        sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb")
        expected_path = os.path.join(sut.project_download_cache, 'metabolomics_data.zip')

        assert os.path.exists(expected_path)
        assert (Path(sut.project_file_cache) / "networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo").is_file()
        assert (Path(sut.project_file_cache) / "clusterinfosummarygroup_attributes_withIDs_withcomponentID/d69356c8e5044c2a9fef3dd2a2f991e1.tsv").is_file()
        assert (Path(sut.project_file_cache) / "spectra/METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf").is_file()
    except ReadTimeout:
        # The download hits the live GNPS service; skip when it is down.
        pytest.skip("GNPS is down")


def test_execute_download(gnps_url: str, tmp_path: Path):
outpath = tmp_path / 'metabolomics_data.zip'
_execute_download(gnps_url, outpath)
assert os.path.exists(outpath)


def test_download_gnps_data(tmp_path):
gnps_task_id = "c22f44b14a3d450eb836d607cb9521bb"
sut = PODPDownloader("MSV000079284", local_cache=tmp_path / 'actual')
actual = sut._load_gnps_data(gnps_task_id)

expected = zipfile.ZipFile(DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip")

actual.extract("networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo", tmp_path / "actual")
expected.extract("networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo", tmp_path / "expected")

assert filecmp.cmp(
tmp_path / "actual/networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo",
tmp_path / "expected/networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo",
shallow=False
)


def test_download_metabolomics_zipfile_scenario2(tmp_path):
    """Download metabolomics data and verify the renamed output files."""
    sut = PODPDownloader("MSV000079284", local_cache=tmp_path)
    try:
        sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb")
        expected_path = os.path.join(sut.project_download_cache, 'c22f44b14a3d450eb836d607cb9521bb.zip')

        assert os.path.exists(expected_path)
        assert (Path(sut.project_file_cache) / "molecular_families.pairsinfo").is_file()
        assert (Path(sut.project_file_cache) / "file_mappings.tsv").is_file()
        assert (Path(sut.project_file_cache) / "spectra.mgf").is_file()
    except ReadTimeout:
        # The download hits the live GNPS service; skip when it is down.
        pytest.skip("GNPS is down")

0 comments on commit aad7a52

Please sign in to comment.