From 2f7ea53b8cf7470dc376bdcebeb3bed0ff2aba2a Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Sat, 8 Feb 2025 18:12:07 -0500 Subject: [PATCH] mv figshare.py to remote/figshare.py and extract download_file from data.py to new remote/fetch.py --- .pre-commit-config.yaml | 4 +- matbench_discovery/data.py | 44 -------- matbench_discovery/enums.py | 9 +- matbench_discovery/remote/fetch.py | 49 +++++++++ matbench_discovery/{ => remote}/figshare.py | 0 scripts/upload_data_files_to_figshare.py | 2 +- scripts/upload_model_preds_to_figshare.py | 2 +- tests/remote/test_fetch.py | 112 ++++++++++++++++++++ tests/{ => remote}/test_figshare.py | 13 +-- tests/test_data.py | 106 ------------------ tests/test_enums.py | 2 +- 11 files changed, 174 insertions(+), 169 deletions(-) create mode 100644 matbench_discovery/remote/fetch.py rename matbench_discovery/{ => remote}/figshare.py (100%) create mode 100644 tests/remote/test_fetch.py rename tests/{ => remote}/test_figshare.py (95%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d9dc8f98..4095ad9a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ default_install_hook_types: [pre-commit, commit-msg] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.4 + rev: v0.9.5 hooks: - id: ruff args: [--fix] @@ -57,7 +57,7 @@ repos: exclude: ^(site/src/figs/.+\.svelte|data/wbm/20.+\..+|site/src/(routes|figs).+\.(yaml|json)|changelog.md)$ - repo: https://github.com/pre-commit/mirrors-eslint - rev: v9.19.0 + rev: v9.20.0 hooks: - id: eslint types: [file] diff --git a/matbench_discovery/data.py b/matbench_discovery/data.py index 1eae0240..f2dfc424 100644 --- a/matbench_discovery/data.py +++ b/matbench_discovery/data.py @@ -11,11 +11,9 @@ ~/.cache/matbench-discovery. """ -import builtins import io import os import sys -import traceback import zipfile from collections import defaultdict from collections.abc import Callable, Sequence @@ -25,7 +23,6 @@ import ase.io import pandas as pd -import requests import yaml from ase import Atoms from pymatviz.enums import Key @@ -197,47 +194,6 @@ def ase_atoms_to_zip( zip_file.writestr(f"{mat_id}.extxyz", buffer.getvalue()) -def download_file(file_path: str, url: str) -> None: - """Download the file from the given URL to the given file path. - Prints rather than raises if the file cannot be downloaded. - """ - file_dir = os.path.dirname(file_path) - os.makedirs(file_dir, exist_ok=True) - try: - response = requests.get(url, timeout=5) - - response.raise_for_status() - - with open(file_path, "wb") as file: - file.write(response.content) - except requests.exceptions.RequestException: - print(f"Error downloading {url=}\nto {file_path=}.\n{traceback.format_exc()}") - - -def maybe_auto_download_file(url: str, abs_path: str, label: str | None = None) -> None: - """Download file if not exist and user confirms or auto-download is enabled.""" - if os.path.isfile(abs_path): - return - - # whether to auto-download model prediction files without prompting - auto_download_files = os.getenv("MBD_AUTO_DOWNLOAD_FILES", "true").lower() == "true" - - is_ipython = hasattr(builtins, "__IPYTHON__") - # default to 'y' if auto-download enabled or not in interactive session (TTY - # or iPython) - answer = ( - "y" - if auto_download_files or not (is_ipython or sys.stdin.isatty()) - else input( - f"{abs_path!r} associated with {label=} does not exist. Download it " - "now? This will cache the file for future use. [y/n] " - ) - ) - if answer.lower().strip() == "y": - print(f"Downloading {label!r} from {url!r} to {abs_path!r}") - download_file(abs_path, url) - - df_wbm = pd.read_csv(DataFiles.wbm_summary.path) # str() around Key.mat_id added for https://github.com/janosh/matbench-discovery/issues/81 df_wbm.index = df_wbm[str(Key.mat_id)] diff --git a/matbench_discovery/enums.py b/matbench_discovery/enums.py index fd59f976..8071efdb 100644 --- a/matbench_discovery/enums.py +++ b/matbench_discovery/enums.py @@ -13,6 +13,7 @@ import yaml from matbench_discovery import DEFAULT_CACHE_DIR, PKG_DIR, ROOT +from matbench_discovery.remote.fetch import download_file, maybe_auto_download_file eV_per_atom = pmv.enums.eV_per_atom # noqa: N816 T = TypeVar("T", bound="Files") @@ -358,8 +359,6 @@ def yaml_path(self) -> str: @property def discovery_path(self) -> str: """Prediction file path associated with the model.""" - from matbench_discovery.data import maybe_auto_download_file - rel_path = self.metrics.get("discovery", {}).get("pred_file") file_url = self.metrics.get("discovery", {}).get("pred_file_url") if not rel_path: @@ -375,8 +374,6 @@ def geo_opt_path(self) -> str | None: """File path associated with the file URL if it exists, otherwise download the file first, then return the path. """ - from matbench_discovery.data import maybe_auto_download_file - geo_opt_metrics = self.metrics.get("geo_opt", {}) if geo_opt_metrics in ("not available", "not applicable"): return None @@ -395,8 +392,6 @@ def kappa_103_path(self) -> str | None: """File path associated with the file URL if it exists, otherwise download the file first, then return the path. """ - from matbench_discovery.data import maybe_auto_download_file - phonons_metrics = self.metrics.get("phonons", {}) if phonons_metrics in ("not available", "not applicable"): return None @@ -495,8 +490,6 @@ def path(self) -> str: """File path associated with the file URL if it exists, otherwise download the file first, then return the path. """ - from matbench_discovery.data import download_file - key, rel_path = self.name, self.rel_path if rel_path not in self.yaml[key]["path"]: diff --git a/matbench_discovery/remote/fetch.py b/matbench_discovery/remote/fetch.py new file mode 100644 index 00000000..dacc6173 --- /dev/null +++ b/matbench_discovery/remote/fetch.py @@ -0,0 +1,49 @@ +"""Files download functions.""" + +import builtins +import os +import sys +import traceback + +import requests + + +def download_file(file_path: str, url: str) -> None: + """Download the file from the given URL to the given file path. + Prints rather than raises if the file cannot be downloaded. + """ + file_dir = os.path.dirname(file_path) + os.makedirs(file_dir, exist_ok=True) + try: + response = requests.get(url, timeout=5) + + response.raise_for_status() + + with open(file_path, "wb") as file: + file.write(response.content) + except requests.exceptions.RequestException: + print(f"Error downloading {url=}\nto {file_path=}.\n{traceback.format_exc()}") + + +def maybe_auto_download_file(url: str, abs_path: str, label: str | None = None) -> None: + """Download file if not exist and user confirms or auto-download is enabled.""" + if os.path.isfile(abs_path): + return + + # whether to auto-download model prediction files without prompting + auto_download_files = os.getenv("MBD_AUTO_DOWNLOAD_FILES", "true").lower() == "true" + + is_ipython = hasattr(builtins, "__IPYTHON__") + # default to 'y' if auto-download enabled or not in interactive session (TTY + # or iPython) + answer = ( + "y" + if auto_download_files or not (is_ipython or sys.stdin.isatty()) + else input( + f"{abs_path!r} associated with {label=} does not exist. Download it " + "now? This will cache the file for future use. [y/n] " + ) + ) + if answer.lower().strip() == "y": + print(f"Downloading {label!r} from {url!r} to {abs_path!r}") + download_file(abs_path, url) diff --git a/matbench_discovery/figshare.py b/matbench_discovery/remote/figshare.py similarity index 100% rename from matbench_discovery/figshare.py rename to matbench_discovery/remote/figshare.py diff --git a/scripts/upload_data_files_to_figshare.py b/scripts/upload_data_files_to_figshare.py index 725f2b2c..89077a04 100644 --- a/scripts/upload_data_files_to_figshare.py +++ b/scripts/upload_data_files_to_figshare.py @@ -6,7 +6,7 @@ from tqdm import tqdm -import matbench_discovery.figshare as figshare +import matbench_discovery.remote.figshare as figshare from matbench_discovery import DATA_DIR, PKG_DIR, ROOT from matbench_discovery.data import round_trip_yaml from matbench_discovery.enums import DataFiles diff --git a/scripts/upload_model_preds_to_figshare.py b/scripts/upload_model_preds_to_figshare.py index 3c90885e..b420bb69 100644 --- a/scripts/upload_model_preds_to_figshare.py +++ b/scripts/upload_model_preds_to_figshare.py @@ -15,7 +15,7 @@ import yaml from tqdm import tqdm -import matbench_discovery.figshare as figshare +import matbench_discovery.remote.figshare as figshare from matbench_discovery import PKG_DIR, ROOT from matbench_discovery.data import round_trip_yaml from matbench_discovery.enums import Model diff --git a/tests/remote/test_fetch.py b/tests/remote/test_fetch.py new file mode 100644 index 00000000..004bdefc --- /dev/null +++ b/tests/remote/test_fetch.py @@ -0,0 +1,112 @@ +import os +from pathlib import Path +from unittest.mock import patch + +import pytest +import requests + +from matbench_discovery.remote.fetch import maybe_auto_download_file + + +def test_download_file(tmp_path: Path, capsys: pytest.CaptureFixture) -> None: + """Test download_file function.""" + + from matbench_discovery.remote.fetch import download_file + + url = "https://example.com/test.txt" + test_content = b"test content" + dest_path = tmp_path / "test.txt" + + # Mock successful request + mock_response = requests.Response() + mock_response.status_code = 200 + mock_response._content = test_content # noqa: SLF001 + + with patch("requests.get", return_value=mock_response): + download_file(str(dest_path), url) + assert dest_path.read_bytes() == test_content + + # Mock failed request + mock_response = requests.Response() + mock_response.status_code = 404 + mock_response._content = b"Not found" # noqa: SLF001 + + with patch("requests.get", return_value=mock_response): + download_file(str(dest_path), url) # Should print error but not raise + + stdout, stderr = capsys.readouterr() + assert f"Error downloading {url=}" in stdout + assert stderr == "" + + +def test_maybe_auto_download_file( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture +) -> None: + """Test auto-download behavior of maybe_auto_download_file function.""" + url = "https://example.com/file.txt" + abs_path = f"{tmp_path}/test/file.txt" + os.makedirs(os.path.dirname(abs_path), exist_ok=True) + + # Mock successful request + mock_response = requests.Response() + mock_response.status_code = 200 + mock_response._content = b"test content" # noqa: SLF001 + + # Test 1: Auto-download enabled (default) + monkeypatch.setenv("MBD_AUTO_DOWNLOAD_FILES", "true") + with patch("requests.get", return_value=mock_response): + maybe_auto_download_file(url, abs_path, label="test") + stdout, _ = capsys.readouterr() + assert f"Downloading 'test' from {url!r}" in stdout + assert os.path.isfile(abs_path) + + # Test 2: Auto-download disabled + os.remove(abs_path) + monkeypatch.setenv("MBD_AUTO_DOWNLOAD_FILES", "false") + assert not os.path.isfile(abs_path) + + # Mock user input 'n' to skip download + with ( + patch("requests.get", return_value=mock_response), + patch("builtins.input", return_value="n"), + patch("sys.stdin.isatty", return_value=True), # force interactive mode + ): + maybe_auto_download_file(url, abs_path, label="test") + assert not os.path.isfile(abs_path) + + # Test 3: Auto-download disabled but user confirms + with ( + patch("requests.get", return_value=mock_response), + patch("builtins.input", return_value="y"), + patch("sys.stdin.isatty", return_value=True), # force interactive mode + ): + maybe_auto_download_file(url, abs_path, label="test") + stdout, _ = capsys.readouterr() + assert f"Downloading 'test' from {url!r}" in stdout + assert os.path.isfile(abs_path) + + # Test 4: File already exists (no download attempt) + with patch("requests.get") as mock_get: + maybe_auto_download_file(url, abs_path, label="test") + mock_get.assert_not_called() + + # Test 5: Non-interactive session (auto-download) + os.remove(abs_path) + with ( + patch("requests.get", return_value=mock_response), + patch("sys.stdin.isatty", return_value=False), + ): + maybe_auto_download_file(url, abs_path, label="test") + stdout, _ = capsys.readouterr() + assert f"Downloading 'test' from {url!r}" in stdout + assert os.path.isfile(abs_path) + + # Test 6: IPython session with auto-download disabled + os.remove(abs_path) + with ( + patch("requests.get", return_value=mock_response), + patch("builtins.input", return_value="n"), + patch("sys.stdin.isatty", return_value=True), # force interactive mode + ): + maybe_auto_download_file(url, abs_path, label="test") + assert not os.path.isfile(abs_path) diff --git a/tests/test_figshare.py b/tests/remote/test_figshare.py similarity index 95% rename from tests/test_figshare.py rename to tests/remote/test_figshare.py index e84edc63..48969999 100644 --- a/tests/test_figshare.py +++ b/tests/remote/test_figshare.py @@ -7,7 +7,7 @@ import pytest import requests -import matbench_discovery.figshare as figshare +import matbench_discovery.remote.figshare as figshare @pytest.mark.parametrize( @@ -96,7 +96,7 @@ def test_create_article_variants( ) -> None: """Test article creation with different metadata combinations.""" with patch( - "matbench_discovery.figshare.make_request", + "matbench_discovery.remote.figshare.make_request", side_effect=[{"location": "loc"}, {"id": article_id}], ): assert figshare.create_article(metadata, verbose=True) == article_id @@ -194,9 +194,10 @@ def mock_make_request(method: str, url: str, **kwargs: Any) -> Any: return mock_responses[method] with ( - patch("matbench_discovery.figshare.ROOT", str(tmp_path)), + patch("matbench_discovery.remote.figshare.ROOT", str(tmp_path)), patch( - "matbench_discovery.figshare.make_request", side_effect=mock_make_request + "matbench_discovery.remote.figshare.make_request", + side_effect=mock_make_request, ), ): assert figshare.upload_file(12345, str(test_file), file_name=file_name) == 67890 @@ -211,7 +212,7 @@ def mock_make_request(method: str, url: str, **kwargs: Any) -> Any: @pytest.mark.parametrize("files", [[], DUMMY_FILES]) # Empty and non-empty def test_list_article_files(files: list[dict[str, Any]]) -> None: """Test list_article_files with various file configurations.""" - with patch("matbench_discovery.figshare.make_request", return_value=files): + with patch("matbench_discovery.remote.figshare.make_request", return_value=files): assert figshare.list_article_files(12345) == files @@ -273,7 +274,7 @@ def test_get_existing_files( files: list[dict[str, Any]], expected: dict[str, dict[str, Any]] ) -> None: """Test get_existing_files with various file configurations.""" - with patch("matbench_discovery.figshare.make_request", return_value=files): + with patch("matbench_discovery.remote.figshare.make_request", return_value=files): assert figshare.get_existing_files(12345) == expected diff --git a/tests/test_data.py b/tests/test_data.py index bd177764..e9dad0ad 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd import pytest -import requests from ase import Atoms from pymatgen.core import Lattice, Structure from pymatviz.enums import Key @@ -20,7 +19,6 @@ df_wbm, glob_to_df, load_df_wbm_with_preds, - maybe_auto_download_file, ) from matbench_discovery.enums import MbdKey, Model, TestSubset @@ -193,37 +191,6 @@ def test_ase_atoms_from_zip_with_limit(tmp_path: Path) -> None: assert len(read_atoms) == 2 -def test_download_file(tmp_path: Path, capsys: pytest.CaptureFixture) -> None: - """Test download_file function.""" - - from matbench_discovery.data import download_file - - url = "https://example.com/test.txt" - test_content = b"test content" - dest_path = tmp_path / "test.txt" - - # Mock successful request - mock_response = requests.Response() - mock_response.status_code = 200 - mock_response._content = test_content # noqa: SLF001 - - with patch("requests.get", return_value=mock_response): - download_file(str(dest_path), url) - assert dest_path.read_bytes() == test_content - - # Mock failed request - mock_response = requests.Response() - mock_response.status_code = 404 - mock_response._content = b"Not found" # noqa: SLF001 - - with patch("requests.get", return_value=mock_response): - download_file(str(dest_path), url) # Should print error but not raise - - stdout, stderr = capsys.readouterr() - assert f"Error downloading {url=}" in stdout - assert stderr == "" - - @pytest.mark.skipif( "CI" in os.environ, reason="CI uses mock data so don't check length against on-the-fly " @@ -304,76 +271,3 @@ def test_load_df_wbm_with_preds_subset(subset: Any) -> None: """Test subset handling in load_df_wbm_with_preds.""" df_wbm = load_df_wbm_with_preds(subset=subset) assert isinstance(df_wbm, pd.DataFrame) - - -def test_maybe_auto_download_file( - tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture -) -> None: - """Test auto-download behavior of maybe_auto_download_file function.""" - url = "https://example.com/file.txt" - abs_path = f"{tmp_path}/test/file.txt" - os.makedirs(os.path.dirname(abs_path), exist_ok=True) - - # Mock successful request - mock_response = requests.Response() - mock_response.status_code = 200 - mock_response._content = b"test content" # noqa: SLF001 - - # Test 1: Auto-download enabled (default) - monkeypatch.setenv("MBD_AUTO_DOWNLOAD_FILES", "true") - with patch("requests.get", return_value=mock_response): - maybe_auto_download_file(url, abs_path, label="test") - stdout, _ = capsys.readouterr() - assert f"Downloading 'test' from {url!r}" in stdout - assert os.path.isfile(abs_path) - - # Test 2: Auto-download disabled - os.remove(abs_path) - monkeypatch.setenv("MBD_AUTO_DOWNLOAD_FILES", "false") - assert not os.path.isfile(abs_path) - - # Mock user input 'n' to skip download - with ( - patch("requests.get", return_value=mock_response), - patch("builtins.input", return_value="n"), - patch("sys.stdin.isatty", return_value=True), # force interactive mode - ): - maybe_auto_download_file(url, abs_path, label="test") - assert not os.path.isfile(abs_path) - - # Test 3: Auto-download disabled but user confirms - with ( - patch("requests.get", return_value=mock_response), - patch("builtins.input", return_value="y"), - patch("sys.stdin.isatty", return_value=True), # force interactive mode - ): - maybe_auto_download_file(url, abs_path, label="test") - stdout, _ = capsys.readouterr() - assert f"Downloading 'test' from {url!r}" in stdout - assert os.path.isfile(abs_path) - - # Test 4: File already exists (no download attempt) - with patch("requests.get") as mock_get: - maybe_auto_download_file(url, abs_path, label="test") - mock_get.assert_not_called() - - # Test 5: Non-interactive session (auto-download) - os.remove(abs_path) - with ( - patch("requests.get", return_value=mock_response), - patch("sys.stdin.isatty", return_value=False), - ): - maybe_auto_download_file(url, abs_path, label="test") - stdout, _ = capsys.readouterr() - assert f"Downloading 'test' from {url!r}" in stdout - assert os.path.isfile(abs_path) - - # Test 6: IPython session with auto-download disabled - os.remove(abs_path) - with ( - patch("requests.get", return_value=mock_response), - patch("builtins.input", return_value="n"), - patch("sys.stdin.isatty", return_value=True), # force interactive mode - ): - maybe_auto_download_file(url, abs_path, label="test") - assert not os.path.isfile(abs_path) diff --git a/tests/test_enums.py b/tests/test_enums.py index ec8c126d..20b0fe45 100644 --- a/tests/test_enums.py +++ b/tests/test_enums.py @@ -9,7 +9,6 @@ import requests from matbench_discovery import DATA_DIR -from matbench_discovery.data import maybe_auto_download_file from matbench_discovery.enums import ( DataFiles, Files, @@ -20,6 +19,7 @@ Task, TestSubset, ) +from matbench_discovery.remote.fetch import maybe_auto_download_file def test_mbd_key() -> None: