diff --git a/src/nplinker/metabolomics/gnps/gnps_format.py b/src/nplinker/metabolomics/gnps/gnps_format.py index fc65990b..3f28a357 100644 --- a/src/nplinker/metabolomics/gnps/gnps_format.py +++ b/src/nplinker/metabolomics/gnps/gnps_format.py @@ -63,7 +63,11 @@ def gnps_format_from_file_mapping(file: str | PathLike, has_quant_table: bool) - def gnps_format_from_task_id(task_id: str) -> GNPSFormat: - """Detect the GNPS format for the given task id + """Detect the GNPS format for the given task id. + + The http request has a timeout of 5 seconds. If the request fails, + an ReadTimeout exception is raised. This is to prevent the program + from hanging indefinitely when the GNPS server is down. Args: task_id(str): GNPS task id. @@ -71,11 +75,11 @@ def gnps_format_from_task_id(task_id: str) -> GNPSFormat: Returns: GNPSFormat: the format used in the GNPS workflow invocation. - Examples: + Examples: >>> gnps_format_from_task_id("92036537c21b44c29e509291e53f6382") """ - task_html = requests.get(GNPS_TASK_URL.format(task_id)) - soup = BeautifulSoup(task_html.text) + task_html = requests.get(GNPS_TASK_URL.format(task_id), timeout=5) + soup = BeautifulSoup(task_html.text, features="html.parser") tags = soup.find_all('th') workflow_tag: Tag = list(filter(lambda x: x.contents == ['Workflow'], tags))[0] workflow_format_tag: Tag = workflow_tag.parent.contents[3] @@ -83,11 +87,10 @@ def gnps_format_from_task_id(task_id: str) -> GNPSFormat: if workflow_format == "FEATURE-BASED-MOLECULAR-NETWORKING": return GNPSFormat.FBMN - elif workflow_format == "METABOLOMICS-SNETS": + if workflow_format == "METABOLOMICS-SNETS": return GNPSFormat.AllFiles - else: - return GNPSFormat.Unknown - + return GNPSFormat.Unknown + def gnps_format_from_archive(archive: zipfile.ZipFile) -> GNPSFormat: """Detect GNPS format from a downloaded archive. @@ -106,4 +109,4 @@ def gnps_format_from_archive(archive: zipfile.ZipFile) -> GNPSFormat: return GNPSFormat.FBMN elif any(["METABOLOMICS-SNETS" in x for x in filenames]): return GNPSFormat.AllFiles - return GNPSFormat.Unknown \ No newline at end of file + return GNPSFormat.Unknown diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py index 0e9ce528..15437ae8 100644 --- a/src/nplinker/pairedomics/downloader.py +++ b/src/nplinker/pairedomics/downloader.py @@ -2,10 +2,7 @@ import os import shutil import sys -import zipfile -from deprecated import deprecated import httpx -from progress.spinner import Spinner from nplinker.genomics.mibig import download_and_extract_mibig_metadata from nplinker.logconfig import LogConfig from nplinker.metabolomics.gnps.gnps_downloader import GNPSDownloader @@ -15,6 +12,7 @@ from . import podp_download_and_extract_antismash_data from .runbigscape import podp_run_bigscape + logger = LogConfig.getLogger(__name__) PAIREDOMICS_PROJECT_DATA_ENDPOINT = 'https://pairedomicsdata.bioinformatics.nl/api/projects' @@ -250,66 +248,6 @@ def _download_metabolomics_zipfile(self, gnps_task_id): self.project_download_cache).download().get_download_path() GNPSExtractor(archive, self.project_file_cache).extract() - @deprecated - def _extract_metabolomics_data(self, mbzip): - logger.info('Extracting files to %s', self.project_file_cache) - # extract the contents to the file cache folder. only want some of the files - # so pick them out and only extract those: - # - root/spectra/*.mgf - # - root/clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv - # - root/networkedges_selfloop/*.pairsinfo - # - root/quantification_table* - # - root/metadata_table* - # - root/DB_result* - - prefixes = [ - 'clusterinfosummarygroup_attributes_withIDs_withcomponentID', - 'networkedges_selfloop', 'quantification_table', 'metadata_table', - 'DB_result', 'result_specnets_DB' - ] - - for member in mbzip.namelist(): - if any(member.startswith(prefix) for prefix in prefixes): - mbzip.extract(member, path=self.project_file_cache) - # move the MGF file to a /spectra subdirectory to better fit expected structure - elif member.endswith('.mgf'): - os.makedirs(os.path.join(self.project_file_cache, 'spectra'), - exist_ok=True) - mbzip.extract(member, - path=os.path.join(self.project_file_cache, - 'spectra')) - - @deprecated - def _log_gnps_format(self): - if self._is_new_gnps_format(self.project_file_cache): - logger.info('Found NEW GNPS structure') - else: - logger.info('Found OLD GNPS structure') - - @deprecated - def _load_gnps_data(self, gnps_task_id) -> zipfile.ZipFile: - - self.metabolomics_zip = os.path.join(self.project_download_cache, - 'metabolomics_data.zip') - - # Try read from cache - if os.path.exists(self.metabolomics_zip): - logger.info('Found existing metabolomics_zip at %s', - self.metabolomics_zip) - try: - mbzip = zipfile.ZipFile(self.metabolomics_zip) # pylint: disable=consider-using-with - return mbzip - except zipfile.BadZipFile: - logger.info( - 'Invalid metabolomics zipfile found, will download again!') - os.unlink(self.metabolomics_zip) - url = _generate_gnps_download_url(gnps_task_id) - _execute_download(url, self.metabolomics_zip) - - # this should throw an exception if zip is malformed etc - mbzip = zipfile.ZipFile(self.metabolomics_zip) # pylint: disable=consider-using-with - return mbzip - def _download_and_load_json(self, url, local_path): resp = httpx.get(url, follow_redirects=True) if not resp.status_code == 200: @@ -323,25 +261,3 @@ def _download_and_load_json(self, url, local_path): logger.debug('Downloaded %s to %s', url, local_path) return content - - -@deprecated -def _generate_gnps_download_url(gnps_task_id): - url = GNPS_DATA_DOWNLOAD_URL.format(gnps_task_id) - return url - - -@deprecated -def _execute_download(url, metabolomics_zip): - logger.info('Downloading metabolomics data from %s', url) - with open(metabolomics_zip, 'wb') as f: - # note that this requires a POST, not a GET - total_bytes = 0 - spinner = Spinner('Downloading metabolomics data... ') - with httpx.stream('POST', url) as r: - for data in r.iter_bytes(): - f.write(data) - total_bytes += len(data) - spinner.next() - spinner.finish() - logger.info('Downloaded metabolomics data!') diff --git a/tests/metabolomics/test_gnps_downloader.py b/tests/metabolomics/test_gnps_downloader.py index 394dcf32..0ebde8ce 100644 --- a/tests/metabolomics/test_gnps_downloader.py +++ b/tests/metabolomics/test_gnps_downloader.py @@ -1,10 +1,10 @@ import filecmp +from pathlib import Path from tempfile import gettempdir import zipfile -from pathlib import Path -from typing_extensions import Self - import pytest +from requests.exceptions import ReadTimeout +from typing_extensions import Self from nplinker.metabolomics.gnps.gnps_downloader import GNPSDownloader from .. import DATA_DIR @@ -33,8 +33,11 @@ def test_has_gnps_task_id(): def test_has_url(): - sut = GNPSDownloaderBuilder().with_task_id("c22f44b14a3d450eb836d607cb9521bb").build() - assert sut.get_url() == 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=c22f44b14a3d450eb836d607cb9521bb&view=download_clustered_spectra' + try: + sut = GNPSDownloaderBuilder().with_task_id("c22f44b14a3d450eb836d607cb9521bb").build() + assert sut.get_url() == 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=c22f44b14a3d450eb836d607cb9521bb&view=download_clustered_spectra' + except ReadTimeout: + pytest.skip("GNPS is down") @pytest.mark.parametrize("task_id, filename_expected", [ @@ -44,11 +47,14 @@ def test_has_url(): def test_downloads_file(tmp_path: Path, task_id, filename_expected): outpath = tmp_path.joinpath(task_id + ".zip") sut = GNPSDownloader(task_id, tmp_path) - sut.download() - actual = zipfile.ZipFile(outpath) + try: + sut.download() + actual = zipfile.ZipFile(outpath) - expected = zipfile.ZipFile(DATA_DIR / filename_expected) + expected = zipfile.ZipFile(DATA_DIR / filename_expected) - actual_names = actual.namelist() - expected_names = [x.filename for x in expected.filelist if x.compress_size > 0] - assert all(item in actual_names for item in expected_names) + actual_names = actual.namelist() + expected_names = [x.filename for x in expected.filelist if x.compress_size > 0] + assert all(item in actual_names for item in expected_names) + except ReadTimeout: + pytest.skip("GNPS is down") diff --git a/tests/metabolomics/test_gnps_format.py b/tests/metabolomics/test_gnps_format.py index 58dc162b..2a972fd0 100644 --- a/tests/metabolomics/test_gnps_format.py +++ b/tests/metabolomics/test_gnps_format.py @@ -1,10 +1,10 @@ import zipfile import pytest - -from nplinker.metabolomics.gnps.gnps_format import GNPSFormat +from requests.exceptions import ReadTimeout +from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_archive from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_file_mapping from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_task_id -from nplinker.metabolomics.gnps.gnps_format import gnps_format_from_archive +from nplinker.metabolomics.gnps.gnps_format import GNPSFormat from .. import DATA_DIR @@ -24,8 +24,11 @@ def test_identify_gnps_format(filename, expected): ["c22f44b14a3d450eb836d607cb9521bb", GNPSFormat.AllFiles] ]) def test_gnps_format_from_task_id(task_id: str, expected: GNPSFormat): - actual = gnps_format_from_task_id(task_id) - assert actual is expected + try: + actual = gnps_format_from_task_id(task_id) + assert actual is expected + except ReadTimeout: + pytest.skip("GNPS is down") @pytest.mark.parametrize("archive_path, expected", [ ["ProteoSAFe-FEATURE-BASED-MOLECULAR-NETWORKING-92036537-download_cytoscape_data.zip", GNPSFormat.FBMN], diff --git a/tests/pairedomics/test_downloader.py b/tests/pairedomics/test_downloader.py index 1ade1623..f084ae5a 100644 --- a/tests/pairedomics/test_downloader.py +++ b/tests/pairedomics/test_downloader.py @@ -1,22 +1,12 @@ -import filecmp import os from pathlib import Path -import zipfile -import numpy import pytest from pytest_lazyfixture import lazy_fixture -from nplinker import utils -from nplinker.pairedomics.downloader import _execute_download -from nplinker.pairedomics.downloader import _generate_gnps_download_url +from requests.exceptions import ReadTimeout from nplinker.pairedomics.downloader import PODPDownloader from nplinker.pairedomics.downloader import STRAIN_MAPPINGS_FILENAME -from .. import DATA_DIR -@pytest.fixture -def gnps_url(): - return _generate_gnps_download_url("c22f44b14a3d450eb836d607cb9521bb") - @pytest.mark.parametrize("expected", [ Path(os.getenv('HOME'), 'nplinker_data', 'pairedomics'), lazy_fixture('tmp_path') @@ -43,61 +33,27 @@ def test_default(expected: Path): def test_download_metabolomics_zipfile(tmp_path): sut = PODPDownloader("MSV000079284", local_cache=tmp_path) - sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb") - expected_path = os.path.join(sut.project_download_cache, 'metabolomics_data.zip') - - assert os.path.exists(expected_path) - assert (Path(sut.project_file_cache) / "networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo").is_file() - assert (Path(sut.project_file_cache) / "clusterinfosummarygroup_attributes_withIDs_withcomponentID/d69356c8e5044c2a9fef3dd2a2f991e1.tsv").is_file() - assert (Path(sut.project_file_cache) / "spectra/METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf").is_file() - - -def test_download_metabolomics_zipfile(tmp_path): - sut = PODPDownloader("MSV000079284", local_cache=tmp_path) - sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb") - expected_path = os.path.join(sut.project_download_cache, 'c22f44b14a3d450eb836d607cb9521bb.zip') - - assert os.path.exists(expected_path) - assert (Path(sut.project_file_cache) / "molecular_families.pairsinfo").is_file() - assert (Path(sut.project_file_cache) / "file_mappings.tsv").is_file() - assert (Path(sut.project_file_cache) / "spectra.mgf").is_file() - + try: + sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb") + expected_path = os.path.join(sut.project_download_cache, 'metabolomics_data.zip') -def test_generate_gnps_download_url(): - gnps_task_id = "c22f44b14a3d450eb836d607cb9521bb" - expected = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=c22f44b14a3d450eb836d607cb9521bb&view=download_clustered_spectra' - actual = _generate_gnps_download_url(gnps_task_id) - assert actual == expected + assert os.path.exists(expected_path) + assert (Path(sut.project_file_cache) / "networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo").is_file() + assert (Path(sut.project_file_cache) / "clusterinfosummarygroup_attributes_withIDs_withcomponentID/d69356c8e5044c2a9fef3dd2a2f991e1.tsv").is_file() + assert (Path(sut.project_file_cache) / "spectra/METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf").is_file() + except ReadTimeout: + pytest.skip("GNPS is down") -def test_execute_download(gnps_url: str, tmp_path: Path): - outpath = tmp_path / 'metabolomics_data.zip' - _execute_download(gnps_url, outpath) - assert os.path.exists(outpath) - - -def test_download_gnps_data(tmp_path): - gnps_task_id = "c22f44b14a3d450eb836d607cb9521bb" - sut = PODPDownloader("MSV000079284", local_cache=tmp_path / 'actual') - actual = sut._load_gnps_data(gnps_task_id) - - expected = zipfile.ZipFile(DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip") - - actual.extract("networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo", tmp_path / "actual") - expected.extract("networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo", tmp_path / "expected") - - assert filecmp.cmp( - tmp_path / "actual/networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo", - tmp_path / "expected/networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo", - shallow=False - ) - - -def test_extract_metabolomics_data(tmp_path): +def test_download_metabolomics_zipfile_scenario2(tmp_path): sut = PODPDownloader("MSV000079284", local_cache=tmp_path) - archive = zipfile.ZipFile(DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip") - sut._extract_metabolomics_data(archive) - - assert (Path(sut.project_file_cache) / "networkedges_selfloop/6da5be36f5b14e878860167fa07004d6.pairsinfo").is_file() - assert (Path(sut.project_file_cache) / "clusterinfosummarygroup_attributes_withIDs_withcomponentID/d69356c8e5044c2a9fef3dd2a2f991e1.tsv").is_file() - assert (Path(sut.project_file_cache) / "spectra/METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf").is_file() + try: + sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb") + expected_path = os.path.join(sut.project_download_cache, 'c22f44b14a3d450eb836d607cb9521bb.zip') + + assert os.path.exists(expected_path) + assert (Path(sut.project_file_cache) / "molecular_families.pairsinfo").is_file() + assert (Path(sut.project_file_cache) / "file_mappings.tsv").is_file() + assert (Path(sut.project_file_cache) / "spectra.mgf").is_file() + except ReadTimeout: + pytest.skip("GNPS is down")