From 3ad919ff1ac065f073afac87d87c9a577bdace63 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Tue, 25 Nov 2025 15:01:43 -0500 Subject: [PATCH 1/8] Tar extract switch from shutil to tar and use subprocess --- .../tests/fixtures/integration_test_logs.py | 76 +++++++++++++------ 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/integration-tests/tests/fixtures/integration_test_logs.py b/integration-tests/tests/fixtures/integration_test_logs.py index 0a876f72e5..08a696ef2e 100644 --- a/integration-tests/tests/fixtures/integration_test_logs.py +++ b/integration-tests/tests/fixtures/integration_test_logs.py @@ -10,7 +10,6 @@ IntegrationTestLogs, IntegrationTestPathConfig, ) -from tests.utils.utils import unlink logger = logging.getLogger(__name__) @@ -48,7 +47,18 @@ def _download_and_extract_dataset( integration_test_path_config: IntegrationTestPathConfig, name: str, tarball_url: str, + has_leading_directory_component: bool = True, ) -> IntegrationTestLogs: + """ + :param request: + :param integration_test_path_config: + "param name: + :param tarball_url: + :param has_leading_directory_component: Whether all files inside the tarball are stored under a + single top level directory. Defaults to True. + :return: The IntegrationTestPathConfig object with its associated logs properly downloaded, + extracted, and permission changed to be overritable. + """ integration_test_logs = IntegrationTestLogs( name=name, tarball_url=tarball_url, @@ -58,38 +68,56 @@ def _download_and_extract_dataset( logger.info("Test logs `%s` are up-to-date. Skipping download.", name) return integration_test_logs + integration_test_logs.tarball_path.unlink(missing_ok=True) + shutil.rmtree(integration_test_logs.extraction_dir) + integration_test_logs.extraction_dir.mkdir(parents=True, exist_ok=False) + + tarball_path_str = str(integration_test_logs.tarball_path) + extract_path_str = str(integration_test_logs.extraction_dir) + curl_bin = shutil.which("curl") if curl_bin is None: err_msg = "curl executable not found" raise RuntimeError(err_msg) - try: - # fmt: off - curl_cmds = [ - curl_bin, - "--fail", - "--location", - "--output", str(integration_test_logs.tarball_path), - "--show-error", - tarball_url, - ] - # fmt: on - subprocess.run(curl_cmds, check=True) - - unlink(integration_test_logs.extraction_dir) - shutil.unpack_archive( - integration_test_logs.tarball_path, integration_test_logs.extraction_dir - ) - except Exception as e: - err_msg = f"Failed to download and extract dataset `{name}`." - raise RuntimeError(err_msg) from e - - # Allow the extracted content to be deletable or overwritable + # fmt: off + curl_cmd = [ + curl_bin, + "--fail", + "--location", + "--output", tarball_path_str, + "--show-error", + tarball_url, + ] + # fmt: on + subprocess.run(curl_cmd, check=True) + + tar_bin = shutil.which("tar") + if tar_bin is None: + err_msg = "tar executable not found" + raise RuntimeError(err_msg) + + # fmt: off + extract_cmd = [ + tar_bin, + "--extract", + "--gzip", + "--file", tarball_path_str, + "-C", extract_path_str, + ] + # fmt: on + if has_leading_directory_component: + extract_cmd.extend(["--strip-components", "1"]) + subprocess.run(extract_cmd, check=True) + chmod_bin = shutil.which("chmod") if chmod_bin is None: err_msg = "chmod executable not found" raise RuntimeError(err_msg) - subprocess.run([chmod_bin, "-R", "gu+w", integration_test_logs.extraction_dir], check=True) + + # Allow the downloaded and extracted contents to be deletable or overwritable + subprocess.run([chmod_bin, "-R", "gu+w", tarball_path_str], check=True) + subprocess.run([chmod_bin, "-R", "gu+w", extract_path_str], check=True) logger.info("Downloaded and extracted uncompressed logs for dataset `%s`.", name) request.config.cache.set(name, True) From 82464e82d95b32fe3c4c3cda37533a79067b9e59 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Feb 2026 19:18:12 +0000 Subject: [PATCH 2/8] Update docstring and add remove_path helper --- .../tests/fixtures/integration_test_logs.py | 23 ++++++++++--------- integration-tests/tests/utils/utils.py | 17 ++++++++++++++ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/integration-tests/tests/fixtures/integration_test_logs.py b/integration-tests/tests/fixtures/integration_test_logs.py index 08a696ef2e..21e07df177 100644 --- a/integration-tests/tests/fixtures/integration_test_logs.py +++ b/integration-tests/tests/fixtures/integration_test_logs.py @@ -10,6 +10,7 @@ IntegrationTestLogs, IntegrationTestPathConfig, ) +from tests.utils.utils import remove_path logger = logging.getLogger(__name__) @@ -47,17 +48,17 @@ def _download_and_extract_dataset( integration_test_path_config: IntegrationTestPathConfig, name: str, tarball_url: str, - has_leading_directory_component: bool = True, + strip_leading_dir: bool = True, ) -> IntegrationTestLogs: """ - :param request: - :param integration_test_path_config: - "param name: - :param tarball_url: - :param has_leading_directory_component: Whether all files inside the tarball are stored under a - single top level directory. Defaults to True. - :return: The IntegrationTestPathConfig object with its associated logs properly downloaded, - extracted, and permission changed to be overritable. + :param request: Provides access to the pytest cache. + :param integration_test_path_config: See `IntegrationTestPathConfig`. + :param name: Dataset name. + :param tarball_url: Dataset tarball URL. + :param strip_leading_dir: Whether to strip a single top-level directory from the tarball + contents. Defaults to True. + :return: An IntegrationTestLogs instance with the dataset downloaded, extracted, and file + permissions adjusted for test use. """ integration_test_logs = IntegrationTestLogs( name=name, @@ -68,8 +69,8 @@ def _download_and_extract_dataset( logger.info("Test logs `%s` are up-to-date. Skipping download.", name) return integration_test_logs - integration_test_logs.tarball_path.unlink(missing_ok=True) - shutil.rmtree(integration_test_logs.extraction_dir) + remove_path(integration_test_logs.tarball_path) + remove_path(integration_test_logs.extraction_dir) integration_test_logs.extraction_dir.mkdir(parents=True, exist_ok=False) tarball_path_str = str(integration_test_logs.tarball_path) diff --git a/integration-tests/tests/utils/utils.py b/integration-tests/tests/utils/utils.py index 6d856c5551..ab6c52b969 100644 --- a/integration-tests/tests/utils/utils.py +++ b/integration-tests/tests/utils/utils.py @@ -128,3 +128,20 @@ def _sort_json_keys_and_rows(json_fp: Path) -> IO[str]: sorted_fp.flush() sorted_fp.seek(0) return sorted_fp + + +def remove_path(path_to_remove: Path) -> None: + """ + Remove a file, directory, or symlink at `path_to_remove` if it exists. + + :param path_to_remove: + :raise: Propagates `pathlib.Path.unlink`'s exceptions. + :raise: Propagates `shutil.rmtree`'s exceptions. + """ + if not path_to_remove.exists(): + return + + if path_to_remove.is_dir() and not path_to_remove.is_symlink(): + shutil.rmtree(path_to_remove) + else: + path_to_remove.unlink() From 519edddbf16db0010e60d6625d212fc161bcff14 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Feb 2026 19:19:14 +0000 Subject: [PATCH 3/8] Move remove_path location --- integration-tests/tests/utils/utils.py | 34 +++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/integration-tests/tests/utils/utils.py b/integration-tests/tests/utils/utils.py index ced8287d9d..2636f4a49b 100644 --- a/integration-tests/tests/utils/utils.py +++ b/integration-tests/tests/utils/utils.py @@ -93,6 +93,23 @@ def load_yaml_to_dict(path: Path) -> dict[str, Any]: return target_dict +def remove_path(path_to_remove: Path) -> None: + """ + Remove a file, directory, or symlink at `path_to_remove` if it exists. + + :param path_to_remove: + :raise: Propagates `pathlib.Path.unlink`'s exceptions. + :raise: Propagates `shutil.rmtree`'s exceptions. + """ + if not path_to_remove.exists(): + return + + if path_to_remove.is_dir() and not path_to_remove.is_symlink(): + shutil.rmtree(path_to_remove) + else: + path_to_remove.unlink() + + def resolve_path_env_var(var_name: str) -> Path: """ :param var_name: Name of the environment variable holding a path. @@ -180,20 +197,3 @@ def _sort_json_keys_and_rows(json_fp: Path) -> IO[str]: sorted_fp.flush() sorted_fp.seek(0) return sorted_fp - - -def remove_path(path_to_remove: Path) -> None: - """ - Remove a file, directory, or symlink at `path_to_remove` if it exists. - - :param path_to_remove: - :raise: Propagates `pathlib.Path.unlink`'s exceptions. - :raise: Propagates `shutil.rmtree`'s exceptions. - """ - if not path_to_remove.exists(): - return - - if path_to_remove.is_dir() and not path_to_remove.is_symlink(): - shutil.rmtree(path_to_remove) - else: - path_to_remove.unlink() From 81da2ccb9876c140594c0ca562e31dd92c90a70f Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Mon, 2 Feb 2026 14:38:31 -0500 Subject: [PATCH 4/8] Use up to date helpers --- .../tests/fixtures/integration_test_logs.py | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/integration-tests/tests/fixtures/integration_test_logs.py b/integration-tests/tests/fixtures/integration_test_logs.py index 21e07df177..1b89a0dd92 100644 --- a/integration-tests/tests/fixtures/integration_test_logs.py +++ b/integration-tests/tests/fixtures/integration_test_logs.py @@ -1,7 +1,6 @@ """Session-scoped test log fixtures shared across integration tests.""" import logging -import shutil import subprocess import pytest @@ -10,7 +9,10 @@ IntegrationTestLogs, IntegrationTestPathConfig, ) -from tests.utils.utils import remove_path +from tests.utils.utils import ( + get_binary_path, + remove_path, +) logger = logging.getLogger(__name__) @@ -51,14 +53,17 @@ def _download_and_extract_dataset( strip_leading_dir: bool = True, ) -> IntegrationTestLogs: """ + Download and extract a dataset tarball for setting up the `IntegrationTestLogs` fixture, and + adjust its file permissions for test use. + :param request: Provides access to the pytest cache. :param integration_test_path_config: See `IntegrationTestPathConfig`. :param name: Dataset name. :param tarball_url: Dataset tarball URL. :param strip_leading_dir: Whether to strip a single top-level directory from the tarball contents. Defaults to True. - :return: An IntegrationTestLogs instance with the dataset downloaded, extracted, and file - permissions adjusted for test use. + :return: An IntegrationTestLogs instance describing the downloaded and extracted logs. + :raises subprocess.CalledProcessError: If `curl`, `tar x`, or `chmod` fails. """ integration_test_logs = IntegrationTestLogs( name=name, @@ -76,14 +81,9 @@ def _download_and_extract_dataset( tarball_path_str = str(integration_test_logs.tarball_path) extract_path_str = str(integration_test_logs.extraction_dir) - curl_bin = shutil.which("curl") - if curl_bin is None: - err_msg = "curl executable not found" - raise RuntimeError(err_msg) - # fmt: off curl_cmd = [ - curl_bin, + get_binary_path("curl"), "--fail", "--location", "--output", tarball_path_str, @@ -93,30 +93,21 @@ def _download_and_extract_dataset( # fmt: on subprocess.run(curl_cmd, check=True) - tar_bin = shutil.which("tar") - if tar_bin is None: - err_msg = "tar executable not found" - raise RuntimeError(err_msg) - # fmt: off extract_cmd = [ - tar_bin, + get_binary_path("tar"), "--extract", "--gzip", "--file", tarball_path_str, "-C", extract_path_str, ] # fmt: on - if has_leading_directory_component: + if strip_leading_dir: extract_cmd.extend(["--strip-components", "1"]) subprocess.run(extract_cmd, check=True) - chmod_bin = shutil.which("chmod") - if chmod_bin is None: - err_msg = "chmod executable not found" - raise RuntimeError(err_msg) - # Allow the downloaded and extracted contents to be deletable or overwritable + chmod_bin = get_binary_path("chmod") subprocess.run([chmod_bin, "-R", "gu+w", tarball_path_str], check=True) subprocess.run([chmod_bin, "-R", "gu+w", extract_path_str], check=True) From b83503900d4219294e94c4faeba40f3d08cf6859 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 4 Feb 2026 11:36:20 +0000 Subject: [PATCH 5/8] Address review comments --- integration-tests/tests/fixtures/integration_test_logs.py | 2 +- integration-tests/tests/utils/utils.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/integration-tests/tests/fixtures/integration_test_logs.py b/integration-tests/tests/fixtures/integration_test_logs.py index 1b89a0dd92..3569bb07a1 100644 --- a/integration-tests/tests/fixtures/integration_test_logs.py +++ b/integration-tests/tests/fixtures/integration_test_logs.py @@ -108,7 +108,7 @@ def _download_and_extract_dataset( # Allow the downloaded and extracted contents to be deletable or overwritable chmod_bin = get_binary_path("chmod") - subprocess.run([chmod_bin, "-R", "gu+w", tarball_path_str], check=True) + subprocess.run([chmod_bin, "gu+w", tarball_path_str], check=True) subprocess.run([chmod_bin, "-R", "gu+w", extract_path_str], check=True) logger.info("Downloaded and extracted uncompressed logs for dataset `%s`.", name) diff --git a/integration-tests/tests/utils/utils.py b/integration-tests/tests/utils/utils.py index 2636f4a49b..62597f8a04 100644 --- a/integration-tests/tests/utils/utils.py +++ b/integration-tests/tests/utils/utils.py @@ -101,10 +101,11 @@ def remove_path(path_to_remove: Path) -> None: :raise: Propagates `pathlib.Path.unlink`'s exceptions. :raise: Propagates `shutil.rmtree`'s exceptions. """ - if not path_to_remove.exists(): + if path_to_remove.is_symlink(): + path_to_remove.unlink() + elif not path_to_remove.exists(): return - - if path_to_remove.is_dir() and not path_to_remove.is_symlink(): + elif path_to_remove.is_dir(): shutil.rmtree(path_to_remove) else: path_to_remove.unlink() From a44fa1e8f8e32d7da443413bc92daabd1dbfa321 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 11 Feb 2026 08:42:48 -0500 Subject: [PATCH 6/8] Address review comment --- .../tests/fixtures/integration_test_logs.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/integration-tests/tests/fixtures/integration_test_logs.py b/integration-tests/tests/fixtures/integration_test_logs.py index 3569bb07a1..a3815a656f 100644 --- a/integration-tests/tests/fixtures/integration_test_logs.py +++ b/integration-tests/tests/fixtures/integration_test_logs.py @@ -45,25 +45,25 @@ def postgresql( ) -def _download_and_extract_dataset( +def _download_and_extract_gzip_dataset( request: pytest.FixtureRequest, integration_test_path_config: IntegrationTestPathConfig, name: str, tarball_url: str, - strip_leading_dir: bool = True, + keep_leading_dir: bool = False, ) -> IntegrationTestLogs: """ - Download and extract a dataset tarball for setting up the `IntegrationTestLogs` fixture, and - adjust its file permissions for test use. + Download and extract a gzip-compressed dataset tarball for setting up the `IntegrationTestLogs` + fixture. Adjust its file permissions for test use. :param request: Provides access to the pytest cache. :param integration_test_path_config: See `IntegrationTestPathConfig`. :param name: Dataset name. :param tarball_url: Dataset tarball URL. - :param strip_leading_dir: Whether to strip a single top-level directory from the tarball - contents. Defaults to True. - :return: An IntegrationTestLogs instance describing the downloaded and extracted logs. - :raises subprocess.CalledProcessError: If `curl`, `tar x`, or `chmod` fails. + :param keep_leading_dir: Whether to preserve the top-level directory during tarball extraction. + Defaults to False to avoid an unnecessary extra directory level. + :return: An IntegrationTestLogs instance providing metadata for the downloaded logs. + :raises subprocess.CalledProcessError: If `curl`, `tar`, or `chmod` fails. """ integration_test_logs = IntegrationTestLogs( name=name, @@ -99,14 +99,15 @@ def _download_and_extract_dataset( "--extract", "--gzip", "--file", tarball_path_str, - "-C", extract_path_str, + "--directory", extract_path_str, ] # fmt: on - if strip_leading_dir: + if not keep_leading_dir: extract_cmd.extend(["--strip-components", "1"]) subprocess.run(extract_cmd, check=True) - # Allow the downloaded and extracted contents to be deletable or overwritable + # Allow the downloaded and extracted contents to be deletable or overwritable by adding write + # permissions for both the user and the group. chmod_bin = get_binary_path("chmod") subprocess.run([chmod_bin, "gu+w", tarball_path_str], check=True) subprocess.run([chmod_bin, "-R", "gu+w", extract_path_str], check=True) From de261258ac468759a3030739efbc03f076a7c710 Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 11 Feb 2026 08:49:32 -0500 Subject: [PATCH 7/8] Fix lint --- integration-tests/tests/fixtures/integration_test_logs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-tests/tests/fixtures/integration_test_logs.py b/integration-tests/tests/fixtures/integration_test_logs.py index a3815a656f..fe8688c027 100644 --- a/integration-tests/tests/fixtures/integration_test_logs.py +++ b/integration-tests/tests/fixtures/integration_test_logs.py @@ -23,7 +23,7 @@ def hive_24hr( integration_test_path_config: IntegrationTestPathConfig, ) -> IntegrationTestLogs: """Provides shared `hive_24hr` test logs.""" - return _download_and_extract_dataset( + return _download_and_extract_gzip_dataset( request=request, integration_test_path_config=integration_test_path_config, name="hive-24hr", @@ -37,7 +37,7 @@ def postgresql( integration_test_path_config: IntegrationTestPathConfig, ) -> IntegrationTestLogs: """Provides shared `postgresql` test logs.""" - return _download_and_extract_dataset( + return _download_and_extract_gzip_dataset( request=request, integration_test_path_config=integration_test_path_config, name="postgresql", From 62f286b5781622c41e4f00d7e001be6dae7e74fb Mon Sep 17 00:00:00 2001 From: Bingran Hu Date: Wed, 11 Feb 2026 13:27:02 -0500 Subject: [PATCH 8/8] Remove unlink() and replace with remove_path() --- integration-tests/tests/utils/config.py | 6 +++--- integration-tests/tests/utils/utils.py | 24 ------------------------ 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/integration-tests/tests/utils/config.py b/integration-tests/tests/utils/config.py index 76a8e2d43a..b7bca4c3dd 100644 --- a/integration-tests/tests/utils/config.py +++ b/integration-tests/tests/utils/config.py @@ -14,7 +14,7 @@ ) from tests.utils.utils import ( - unlink, + remove_path, validate_dir_exists, validate_file_exists, ) @@ -306,5 +306,5 @@ def __post_init__(self, integration_test_path_config: IntegrationTestPathConfig) def clear_test_outputs(self) -> None: """Remove any existing output directories created by this compression test.""" - unlink(self.compression_dir) - unlink(self.decompression_dir) + remove_path(self.compression_dir) + remove_path(self.decompression_dir) diff --git a/integration-tests/tests/utils/utils.py b/integration-tests/tests/utils/utils.py index 62597f8a04..ad9d272cb3 100644 --- a/integration-tests/tests/utils/utils.py +++ b/integration-tests/tests/utils/utils.py @@ -122,30 +122,6 @@ def resolve_path_env_var(var_name: str) -> Path: return Path(get_env_var(var_name)).expanduser().resolve() -def unlink(rm_path: Path, force: bool = True) -> None: - """ - Remove a file or directory at `path`. - - :param rm_path: - :param force: Whether to force remove with sudo priviledges in case the normal operation fails. - Defaults to True. - """ - try: - shutil.rmtree(rm_path) - except FileNotFoundError: - pass - except PermissionError: - if not force: - raise - - sudo_rm_cmds = ["sudo", "rm", "-rf", str(rm_path)] - try: - subprocess.run(sudo_rm_cmds, check=True) - except subprocess.CalledProcessError as e: - err_msg = f"Failed to remove {rm_path} due to lack of superuser privileges (sudo)." - raise OSError(err_msg) from e - - def validate_dir_exists(dir_path: Path) -> None: """ :param dir_path: