From b39a608629f891420460552b960978f759eeb435 Mon Sep 17 00:00:00 2001 From: Bennett Goble Date: Fri, 24 Mar 2023 12:25:36 -0700 Subject: [PATCH 1/2] Add support for detecting archives by signature Detect installable archive type by "magic number" file signatures if an extension is not present. This fixes downloads from private Github releases, which have no file extension. --- autobuild/autobuild_tool_install.py | 12 ++++--- autobuild/filetype.py | 49 ++++++++++++++++++++++++++++ tests/data/archive.tar.bz2 | Bin 0 -> 174 bytes tests/data/archive.tar.gz | Bin 0 -> 173 bytes tests/data/archive.tar.zst | Bin 0 -> 146 bytes tests/data/archive.zip | Bin 0 -> 330 bytes tests/test_filetype.py | 32 ++++++++++++++++++ 7 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 autobuild/filetype.py create mode 100644 tests/data/archive.tar.bz2 create mode 100644 tests/data/archive.tar.gz create mode 100644 tests/data/archive.tar.zst create mode 100644 tests/data/archive.zip create mode 100644 tests/test_filetype.py diff --git a/autobuild/autobuild_tool_install.py b/autobuild/autobuild_tool_install.py index bb14d5b..a35b7b3 100644 --- a/autobuild/autobuild_tool_install.py +++ b/autobuild/autobuild_tool_install.py @@ -23,6 +23,7 @@ from autobuild import autobuild_base, common, configfile from autobuild.autobuild_tool_source_environment import get_enriched_environment from autobuild.hash_algorithms import verify_hash +from autobuild import filetype logger = logging.getLogger('autobuild.install') @@ -439,12 +440,15 @@ def _default_metadata_for_package(package_file: str, package = None): def open_archive(filename: str) -> tarfile.TarFile | zipfile.ZipFile: - if filename.endswith(".tar.zst"): + f_type = filetype.detect_archive_type(filename) + + if f_type == filetype.ArchiveType.ZST: return common.ZstdTarFile(filename, "r") - elif filename.endswith(".zip"): + + if f_type == filetype.ArchiveType.ZIP: return zipfile.ZipFile(filename, "r") - else: - return tarfile.open(filename, "r") + + return tarfile.open(filename, "r") class ExtractPackageResults: diff --git a/autobuild/filetype.py b/autobuild/filetype.py new file mode 100644 index 0000000..914b3f6 --- /dev/null +++ b/autobuild/filetype.py @@ -0,0 +1,49 @@ +"""Utilities for detecting file types""" + +class ArchiveType: + GZ = "gz" + BZ2 = "bz2" + ZIP = "zip" + ZST = "zst" + + +# File signatures used for sniffing archive type +# https://www.garykessler.net/library/file_sigs.html +_ARCHIVE_MAGIC_NUMBERS = { + b"\x1f\x8b\x08": ArchiveType.GZ, + b"\x42\x5a\x68": ArchiveType.BZ2, + b"\x50\x4b\x03\x04": ArchiveType.ZIP, + b"\x28\xb5\x2f\xfd": ArchiveType.ZST, +} + +_ARCHIVE_MAGIC_NUMBERS_MAX = max(len(x) for x in _ARCHIVE_MAGIC_NUMBERS) + + +def _archive_type_from_signature(filename: str): + """Sniff archive type using file signature""" + with open(filename, "rb") as f: + head = f.read(_ARCHIVE_MAGIC_NUMBERS_MAX) + for magic, f_type in _ARCHIVE_MAGIC_NUMBERS.items(): + if head.startswith(magic): + return f_type + return None + + +def _archive_type_from_extension(filename: str): + if filename.endswith(".tar.gz"): + return ArchiveType.GZ + if filename.endswith(".tar.bz2"): + return ArchiveType.BZ2 + if filename.endswith(".tar.zst"): + return ArchiveType.ZST + if filename.endswith(".zip"): + return ArchiveType.ZIP + return None + + +def detect_archive_type(filename: str): + """Given a filename, detect its ArchiveType using file extension and signature.""" + f_type = _archive_type_from_extension(filename) + if f_type: + return f_type + return _archive_type_from_signature(filename) diff --git a/tests/data/archive.tar.bz2 b/tests/data/archive.tar.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..b61b590c1d989c25058bd14ed76298af85e7a0fc GIT binary patch literal 174 zcmV;f08#%!T4*^jL0KkKSz98>p#T7%e}ux25dc61|9}8M5C~@Fp1=SC00Wz<#4Y8wU$t}hftBgNOevZsS27BFE32HgV^fQ806*QDqk^$Y_#5j6xh cd8cDjV|Sd2ZcEGuI6sTIBAh5lEsCexegf!I9yy~>&mAvD|6B_t~lj@K*fj*<*FKK$BV9l;*LlDn{7Vf zz*Z+-9J6}Y8I_9oWi0vgp3fC}?z3UxMl}uPq@{tk!uP#?X7bD;W9_lS?<6k&xwref z{MD)de&>4q)rv2gYwz{o+RmT+jVEU()}&Q$A>WRLyTG Th>M`){2fe!X{;w1G#D5F^ixWP literal 0 HcmV?d00001 diff --git a/tests/data/archive.tar.zst b/tests/data/archive.tar.zst new file mode 100644 index 0000000000000000000000000000000000000000..dadf6a19c22bf55d4694e289c58557c61af89030 GIT binary patch literal 146 zcmV;D0B!#$wJ-eySS3N|Eax)M|9d4?p>_{~p-Yoj4TC~rGelQ%urZgzyBi$~LiEh@4Z8m}2K~Pc zU77+90GR{uz<@~s1L_YD8gU~Q2m$sNX9$>z2B;khx4=>ZkOaKe3RiamaQvF9Ojz+b AL;wH) literal 0 HcmV?d00001 diff --git a/tests/data/archive.zip b/tests/data/archive.zip new file mode 100644 index 0000000000000000000000000000000000000000..0516b1c7476aaacc6d02f76df642e891682b5232 GIT binary patch literal 330 zcmWIWW@h1H0D&6`6=7folwf6$VMs|VNz@Mw;bdSw{zW$B1Q3^2a5FHnykKTv022Xl z?T3NdukSwlhYzS7f}z?oQgd?h^-3yAz~;;YnsFY}91oBp9fk7zqMQ^xt^jXFCOKwY z&XRySP=MjBBZvugBrC*`7!E`jh3Q0yQH(%CmNeSKj6!lS&`gknG0bFT16j-jg!6&) IR1k*&0Bs^gB>(^b literal 0 HcmV?d00001 diff --git a/tests/test_filetype.py b/tests/test_filetype.py new file mode 100644 index 0000000..3d4360d --- /dev/null +++ b/tests/test_filetype.py @@ -0,0 +1,32 @@ +import shutil +from os import path +from pathlib import Path +from tests.basetest import temp_dir + +import pytest +from autobuild import filetype + + +_DATA_DIR = Path(__file__).parent / "data" + +_ARCHIVE_TEST_CASES = ( + (path.join(_DATA_DIR, "archive.tar.bz2"), filetype.ArchiveType.BZ2), + (path.join(_DATA_DIR, "archive.tar.gz"), filetype.ArchiveType.GZ), + (path.join(_DATA_DIR, "archive.tar.zst"), filetype.ArchiveType.ZST), + (path.join(_DATA_DIR, "archive.zip"), filetype.ArchiveType.ZIP), +) + + +@pytest.mark.parametrize("filename,expected_type", _ARCHIVE_TEST_CASES) +def test_detect_from_extension(filename, expected_type): + f_type = filetype.detect_archive_type(filename) + assert f_type == expected_type + + +@pytest.mark.parametrize("filename,expected_type", _ARCHIVE_TEST_CASES) +def test_detect_from_signature(filename, expected_type): + with temp_dir() as dir: + filename_no_ext = str(Path(dir) / "archive") + shutil.copyfile(filename, filename_no_ext) + f_type = filetype.detect_archive_type(filename_no_ext) + assert f_type == expected_type From d0f4ed1d427c35a9f8c3f466be843e26c5295882 Mon Sep 17 00:00:00 2001 From: Bennett Goble Date: Fri, 24 Mar 2023 13:09:45 -0700 Subject: [PATCH 2/2] Move open_archive to archive_utils module --- autobuild/{filetype.py => archive_utils.py} | 40 ++++++++++++++++++++- autobuild/autobuild_tool_install.py | 20 ++--------- autobuild/autobuild_tool_package.py | 5 ++- autobuild/common.py | 24 ------------- tests/test_filetype.py | 14 ++++---- tests/test_package.py | 4 +-- 6 files changed, 53 insertions(+), 54 deletions(-) rename autobuild/{filetype.py => archive_utils.py} (53%) diff --git a/autobuild/filetype.py b/autobuild/archive_utils.py similarity index 53% rename from autobuild/filetype.py rename to autobuild/archive_utils.py index 914b3f6..9c52ba4 100644 --- a/autobuild/filetype.py +++ b/autobuild/archive_utils.py @@ -1,4 +1,6 @@ -"""Utilities for detecting file types""" +import multiprocessing +import tarfile +import zipfile class ArchiveType: GZ = "gz" @@ -47,3 +49,39 @@ def detect_archive_type(filename: str): if f_type: return f_type return _archive_type_from_signature(filename) + + +def open_archive(filename: str) -> tarfile.TarFile | zipfile.ZipFile: + f_type = detect_archive_type(filename) + + if f_type == ArchiveType.ZST: + return ZstdTarFile(filename, "r") + + if f_type == ArchiveType.ZIP: + return zipfile.ZipFile(filename, "r") + + return tarfile.open(filename, "r") + + +class ZstdTarFile(tarfile.TarFile): + def __init__(self, name, mode='r', *, level=4, zstd_dict=None, **kwargs): + from pyzstd import CParameter, ZstdFile + zstdoption = None + if mode != 'r' and mode != 'rb': + zstdoption = {CParameter.compressionLevel : level, + CParameter.nbWorkers : multiprocessing.cpu_count(), + CParameter.checksumFlag : 1} + self.zstd_file = ZstdFile(name, mode, + level_or_option=zstdoption, + zstd_dict=zstd_dict) + try: + super().__init__(fileobj=self.zstd_file, mode=mode, **kwargs) + except: + self.zstd_file.close() + raise + + def close(self): + try: + super().close() + finally: + self.zstd_file.close() diff --git a/autobuild/autobuild_tool_install.py b/autobuild/autobuild_tool_install.py index a35b7b3..47f307a 100644 --- a/autobuild/autobuild_tool_install.py +++ b/autobuild/autobuild_tool_install.py @@ -14,16 +14,14 @@ import os import pprint import sys -import tarfile import urllib.error import urllib.parse import urllib.request -import zipfile from autobuild import autobuild_base, common, configfile from autobuild.autobuild_tool_source_environment import get_enriched_environment from autobuild.hash_algorithms import verify_hash -from autobuild import filetype +from autobuild import archive_utils logger = logging.getLogger('autobuild.install') @@ -405,7 +403,7 @@ def _install_binary(configured_name, platform, package, config_file, install_dir def get_metadata_from_package(package_file) -> configfile.MetadataDescription: try: - with open_archive(package_file) as archive: + with archive_utils.open_archive(package_file) as archive: f = archive.extractfile(configfile.PACKAGE_METADATA_FILE) return configfile.MetadataDescription(stream=f) except (FileNotFoundError, KeyError): @@ -439,18 +437,6 @@ def _default_metadata_for_package(package_file: str, package = None): return metadata -def open_archive(filename: str) -> tarfile.TarFile | zipfile.ZipFile: - f_type = filetype.detect_archive_type(filename) - - if f_type == filetype.ArchiveType.ZST: - return common.ZstdTarFile(filename, "r") - - if f_type == filetype.ArchiveType.ZIP: - return zipfile.ZipFile(filename, "r") - - return tarfile.open(filename, "r") - - class ExtractPackageResults: files: list[str] conflicts: list[str] @@ -468,7 +454,7 @@ def raise_conflicts(self): def extract_package(package_file: str, install_dir: str, dry_run: bool = False) -> ExtractPackageResults: - with open_archive(package_file) as archive: + with archive_utils.open_archive(package_file) as archive: results = ExtractPackageResults() for t in archive: if t.name == configfile.PACKAGE_METADATA_FILE: diff --git a/autobuild/autobuild_tool_package.py b/autobuild/autobuild_tool_package.py index de168ff..4bbcadc 100644 --- a/autobuild/autobuild_tool_package.py +++ b/autobuild/autobuild_tool_package.py @@ -22,7 +22,6 @@ import getpass import glob -import hashlib import json import logging import os @@ -32,7 +31,7 @@ from collections import UserDict from zipfile import ZIP_DEFLATED, ZipFile -from autobuild import autobuild_base, common, configfile +from autobuild import autobuild_base, common, configfile, archive_utils from autobuild.common import AutobuildError logger = logging.getLogger('autobuild.package') @@ -306,7 +305,7 @@ def _create_tarfile(tarfilename, format, build_directory, filelist, results: dic tfile = tarfile.open(tarfilename, 'w:gz') elif format == 'tzst': tarfilename = tarfilename + '.tar.zst' - tfile = common.ZstdTarFile(tarfilename, 'w', level=22) + tfile = archive_utils.ZstdTarFile(tarfilename, 'w', level=22) else: raise PackageError("unknown tar archive format: %s" % format) diff --git a/autobuild/common.py b/autobuild/common.py index 19449d7..be48d3d 100644 --- a/autobuild/common.py +++ b/autobuild/common.py @@ -524,27 +524,3 @@ def has_cmd(name, subcmd: str = "help") -> bool: except OSError: return False return not p.returncode - - -class ZstdTarFile(tarfile.TarFile): - def __init__(self, name, mode='r', *, level=4, zstd_dict=None, **kwargs): - from pyzstd import CParameter, ZstdFile - zstdoption = None - if mode != 'r' and mode != 'rb': - zstdoption = {CParameter.compressionLevel : level, - CParameter.nbWorkers : multiprocessing.cpu_count(), - CParameter.checksumFlag : 1} - self.zstd_file = ZstdFile(name, mode, - level_or_option=zstdoption, - zstd_dict=zstd_dict) - try: - super().__init__(fileobj=self.zstd_file, mode=mode, **kwargs) - except: - self.zstd_file.close() - raise - - def close(self): - try: - super().close() - finally: - self.zstd_file.close() diff --git a/tests/test_filetype.py b/tests/test_filetype.py index 3d4360d..8fd6fde 100644 --- a/tests/test_filetype.py +++ b/tests/test_filetype.py @@ -4,22 +4,22 @@ from tests.basetest import temp_dir import pytest -from autobuild import filetype +from autobuild import archive_utils _DATA_DIR = Path(__file__).parent / "data" _ARCHIVE_TEST_CASES = ( - (path.join(_DATA_DIR, "archive.tar.bz2"), filetype.ArchiveType.BZ2), - (path.join(_DATA_DIR, "archive.tar.gz"), filetype.ArchiveType.GZ), - (path.join(_DATA_DIR, "archive.tar.zst"), filetype.ArchiveType.ZST), - (path.join(_DATA_DIR, "archive.zip"), filetype.ArchiveType.ZIP), + (path.join(_DATA_DIR, "archive.tar.bz2"), archive_utils.ArchiveType.BZ2), + (path.join(_DATA_DIR, "archive.tar.gz"), archive_utils.ArchiveType.GZ), + (path.join(_DATA_DIR, "archive.tar.zst"), archive_utils.ArchiveType.ZST), + (path.join(_DATA_DIR, "archive.zip"), archive_utils.ArchiveType.ZIP), ) @pytest.mark.parametrize("filename,expected_type", _ARCHIVE_TEST_CASES) def test_detect_from_extension(filename, expected_type): - f_type = filetype.detect_archive_type(filename) + f_type = archive_utils.detect_archive_type(filename) assert f_type == expected_type @@ -28,5 +28,5 @@ def test_detect_from_signature(filename, expected_type): with temp_dir() as dir: filename_no_ext = str(Path(dir) / "archive") shutil.copyfile(filename, filename_no_ext) - f_type = filetype.detect_archive_type(filename_no_ext) + f_type = archive_utils.detect_archive_type(filename_no_ext) assert f_type == expected_type diff --git a/tests/test_package.py b/tests/test_package.py index 027da83..cef055a 100644 --- a/tests/test_package.py +++ b/tests/test_package.py @@ -9,7 +9,7 @@ from zipfile import ZipFile import autobuild.autobuild_tool_package as package -from autobuild import common, configfile +from autobuild import common, configfile, archive_utils from tests.basetest import BaseTest, CaptureStdout, ExpectError, clean_dir, clean_file # **************************************************************************** @@ -76,7 +76,7 @@ def tearDown(self): def tar_has_expected(self,tar): if 'tar.zst' in tar: - tarball = common.ZstdTarFile(tar, 'r') + tarball = archive_utils.ZstdTarFile(tar, 'r') else: tarball = tarfile.open(tar, 'r') packaged_files=tarball.getnames()