diff --git a/docs/source/api_doc/index/index.rst b/docs/source/api_doc/index/index.rst
index b1fb93080b..bb273e1394 100644
--- a/docs/source/api_doc/index/index.rst
+++ b/docs/source/api_doc/index/index.rst
@@ -10,6 +10,7 @@ hfutils.index
     :maxdepth: 3
 
     fetch
+    local_fetch
     make
     validate
 
diff --git a/docs/source/api_doc/index/local_fetch.rst b/docs/source/api_doc/index/local_fetch.rst
new file mode 100644
index 0000000000..208dd71a52
--- /dev/null
+++ b/docs/source/api_doc/index/local_fetch.rst
@@ -0,0 +1,51 @@
+hfutils.index.local_fetch
+================================
+
+.. currentmodule:: hfutils.index.local_fetch
+
+.. automodule:: hfutils.index.local_fetch
+
+
+
+tar_get_index
+----------------------------------------------
+
+.. autofunction:: tar_get_index
+
+
+
+tar_list_files
+----------------------------------------------
+
+.. autofunction:: tar_list_files
+
+
+
+tar_file_exists
+----------------------------------------------
+
+.. autofunction:: tar_file_exists
+
+
+
+tar_file_size
+----------------------------------------------
+
+.. autofunction:: tar_file_size
+
+
+
+tar_file_info
+----------------------------------------------
+
+.. autofunction:: tar_file_info
+
+
+
+tar_file_download
+----------------------------------------------
+
+.. autofunction:: tar_file_download
+
+
+
diff --git a/hfutils/index/__init__.py b/hfutils/index/__init__.py
index 7631e21a3e..c01f5c74af 100644
--- a/hfutils/index/__init__.py
+++ b/hfutils/index/__init__.py
@@ -1,4 +1,5 @@
 from .fetch import hf_tar_list_files, hf_tar_file_download, hf_tar_get_index, hf_tar_file_exists, \
     ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch, hf_tar_file_size, hf_tar_file_info
+from .local_fetch import tar_get_index, tar_file_info, tar_file_download, tar_file_size, tar_file_exists, tar_list_files
 from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory
 from .validate import hf_tar_item_validate, hf_tar_validate
diff --git a/hfutils/index/fetch.py b/hfutils/index/fetch.py
index a33f95f3d8..f15b3f1a31 100644
--- a/hfutils/index/fetch.py
+++ b/hfutils/index/fetch.py
@@ -1,6 +1,6 @@
 import json
 import os.path
-from typing import Optional, Dict, Union
+from typing import Optional, Dict, Union, List
 
 from huggingface_hub.file_download import http_get, hf_hub_url
 from huggingface_hub.utils import build_hf_headers
@@ -96,7 +96,7 @@ def hf_tar_list_files(repo_id: str, archive_in_repo: str,
                       repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
                       idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
                       idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None,
-                      hf_token: Optional[str] = None):
+                      hf_token: Optional[str] = None) -> List[str]:
     """
     List files inside a tar archive file in a Hugging Face repository.
 
diff --git a/hfutils/index/local_fetch.py b/hfutils/index/local_fetch.py
new file mode 100644
index 0000000000..4d86fbb5b2
--- /dev/null
+++ b/hfutils/index/local_fetch.py
@@ -0,0 +1,248 @@
+"""
+This module provides utility functions for working with tar archives and their associated index files.
+It includes functions for retrieving archive indexes, listing files, checking file existence,
+getting file information, and downloading files from archives.
+
+The module relies on a JSON-based index file that contains metadata about the files within the archive,
+including their offsets, sizes, and optional SHA256 hashes.
+
+Functions in this module are designed to work with both local archive files and their corresponding
+index files, providing a convenient interface for archive manipulation and file extraction.
+"""
+
+import json
+import os
+from typing import Optional, List
+
+
+def tar_get_index(archive_file: str, idx_file: Optional[str] = None) -> dict:
+    """
+    Retrieve the index data for a given tar archive file.
+
+    This function reads the JSON index file associated with the archive,
+    which contains metadata about the files within the archive.
+
+    :param archive_file: Path to the tar archive file.
+    :type archive_file: str
+    :param idx_file: Optional path to the index file. If not provided,
+                     it will be inferred from the archive file name.
+    :type idx_file: Optional[str]
+
+    :return: The parsed JSON data from the index file.
+    :rtype: dict
+
+    :raises FileNotFoundError: If the index file is not found.
+    :raises json.JSONDecodeError: If the index file is not valid JSON.
+
+    :example:
+        >>> index_data = tar_get_index('my_archive.tar')
+    """
+    body, _ = os.path.splitext(archive_file)
+    default_index_file = f'{body}.json'
+    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
+    with open(idx_file or default_index_file, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def tar_list_files(archive_file: str, idx_file: Optional[str] = None) -> List[str]:
+    """
+    List all files contained within the specified tar archive.
+
+    This function uses the archive's index file to retrieve the list of files
+    without actually reading the tar archive itself.
+
+    :param archive_file: Path to the tar archive file.
+    :type archive_file: str
+    :param idx_file: Optional path to the index file. If not provided,
+                     it will be inferred from the archive file name.
+    :type idx_file: Optional[str]
+
+    :return: A list of file names contained in the archive.
+    :rtype: List[str]
+
+    :example:
+        >>> files = tar_list_files('my_archive.tar')
+        >>> for file in files:
+        ...     print(file)
+    """
+    index_data = tar_get_index(
+        archive_file=archive_file,
+        idx_file=idx_file,
+    )
+    return list(index_data['files'].keys())
+
+
+def tar_file_exists(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> bool:
+    """
+    Check if a specific file exists within the tar archive.
+
+    This function uses the archive's index to check for file existence
+    without reading the entire archive.
+
+    :param archive_file: Path to the tar archive file.
+    :type archive_file: str
+    :param file_in_archive: The name of the file to check for in the archive.
+    :type file_in_archive: str
+    :param idx_file: Optional path to the index file. If not provided,
+                     it will be inferred from the archive file name.
+    :type idx_file: Optional[str]
+
+    :return: True if the file exists in the archive, False otherwise.
+    :rtype: bool
+
+    :example:
+        >>> exists = tar_file_exists('my_archive.tar', 'path/to/file.txt')
+        >>> if exists:
+        ...     print("File exists in the archive")
+    """
+    from .fetch import _hf_files_process, _n_path
+    index = tar_get_index(
+        archive_file=archive_file,
+        idx_file=idx_file,
+    )
+    files = _hf_files_process(index['files'])
+    return _n_path(file_in_archive) in files
+
+
+def tar_file_info(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> dict:
+    """
+    Retrieve information about a specific file within the tar archive.
+
+    This function returns a dictionary containing metadata about the specified file,
+    such as its size and offset within the archive.
+
+    :param archive_file: Path to the tar archive file.
+    :type archive_file: str
+    :param file_in_archive: The name of the file to get information for.
+    :type file_in_archive: str
+    :param idx_file: Optional path to the index file. If not provided,
+                     it will be inferred from the archive file name.
+    :type idx_file: Optional[str]
+
+    :return: A dictionary containing file metadata.
+    :rtype: dict
+
+    :raises FileNotFoundError: If the specified file is not found in the archive.
+
+    :example:
+        >>> info = tar_file_info('my_archive.tar', 'path/to/file.txt')
+        >>> print(f"File size: {info['size']} bytes")
+    """
+    from .fetch import _hf_files_process, _n_path
+    index = tar_get_index(
+        archive_file=archive_file,
+        idx_file=idx_file,
+    )
+    files = _hf_files_process(index['files'])
+    if _n_path(file_in_archive) not in files:
+        raise FileNotFoundError(f'File {file_in_archive!r} not found '
+                                f'in local archive {archive_file!r}.')
+    else:
+        return files[_n_path(file_in_archive)]
+
+
+def tar_file_size(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> int:
+    """
+    Get the size of a specific file within the tar archive.
+
+    This function returns the size of the specified file in bytes.
+
+    :param archive_file: Path to the tar archive file.
+    :type archive_file: str
+    :param file_in_archive: The name of the file to get the size for.
+    :type file_in_archive: str
+    :param idx_file: Optional path to the index file. If not provided,
+                     it will be inferred from the archive file name.
+    :type idx_file: Optional[str]
+
+    :return: The size of the file in bytes.
+    :rtype: int
+
+    :raises FileNotFoundError: If the specified file is not found in the archive.
+
+    :example:
+        >>> size = tar_file_size('my_archive.tar', 'path/to/file.txt')
+        >>> print(f"File size: {size} bytes")
+    """
+    return tar_file_info(
+        archive_file=archive_file,
+        file_in_archive=file_in_archive,
+        idx_file=idx_file,
+    )['size']
+
+
+def tar_file_download(archive_file: str, file_in_archive: str, local_file: str,
+                      idx_file: Optional[str] = None, chunk_size: int = 1 << 20):
+    """
+    Extract and download a specific file from the tar archive to a local file.
+
+    This function reads the specified file from the archive and writes it to a local file.
+    It also performs integrity checks to ensure the downloaded file is complete and matches
+    the expected hash (if provided in the index).
+
+    :param archive_file: Path to the tar archive file.
+    :type archive_file: str
+    :param file_in_archive: The name of the file to extract from the archive.
+    :type file_in_archive: str
+    :param local_file: The path where the extracted file should be saved.
+    :type local_file: str
+    :param idx_file: Optional path to the index file. If not provided,
+                     it will be inferred from the archive file name.
+    :type idx_file: Optional[str]
+    :param chunk_size: The size of chunks to read and write, in bytes. Default is 1MB.
+    :type chunk_size: int
+
+    :raises FileNotFoundError: If the specified file is not found in the archive.
+    :raises ArchiveStandaloneFileIncompleteDownload: If the downloaded file size doesn't match the expected size.
+    :raises ArchiveStandaloneFileHashNotMatch: If the SHA256 hash of the downloaded file doesn't match the expected hash.
+
+    :example:
+        >>> tar_file_download('my_archive.tar', 'path/to/file.txt', 'local_file.txt')
+    """
+    from .fetch import _hf_files_process, _n_path, _f_sha256, \
+        ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch
+
+    index = tar_get_index(
+        archive_file=archive_file,
+        idx_file=idx_file,
+    )
+    files = _hf_files_process(index['files'])
+    if _n_path(file_in_archive) not in files:
+        raise FileNotFoundError(f'File {file_in_archive!r} not found '
+                                f'in local archive {archive_file!r}.')
+
+    info = files[_n_path(file_in_archive)]
+
+    if os.path.dirname(local_file):
+        os.makedirs(os.path.dirname(local_file), exist_ok=True)
+    try:
+        with open(local_file, 'wb') as wf:
+            if info['size'] > 0:
+                with open(archive_file, 'rb') as rf:
+                    rf.seek(info['offset'])
+                    remaining = info['size']
+                    while remaining > 0:
+                        chunk = rf.read(min(remaining, chunk_size))
+                        if not chunk:
+                            # The archive ends before offset + size (truncated file or stale index).
+                            # Stop here; the size check below reports the incomplete extraction.
+                            break
+                        wf.write(chunk)
+                        remaining -= len(chunk)
+
+        if os.path.getsize(local_file) != info['size']:
+            raise ArchiveStandaloneFileIncompleteDownload(
+                f'Expected size is {info["size"]}, but actually {os.path.getsize(local_file)} downloaded.'
+            )
+
+        if info.get('sha256'):
+            _sha256 = _f_sha256(local_file)
+            if _sha256 != info['sha256']:
+                raise ArchiveStandaloneFileHashNotMatch(
+                    f'Expected hash is {info["sha256"]!r}, but actually {_sha256!r} found.'
+                )
+
+    except Exception:
+        if os.path.exists(local_file):
+            os.remove(local_file)
+        raise
diff --git a/test/index/conftest.py b/test/index/conftest.py
new file mode 100644
index 0000000000..ee7265679d
--- /dev/null
+++ b/test/index/conftest.py
@@ -0,0 +1,15 @@
+import pytest
+from hbutils.system import TemporaryDirectory
+
+from hfutils.operate import download_directory_as_directory
+
+
+@pytest.fixture(scope='module')
+def local_narugo_test_cos5t_tars():
+    with TemporaryDirectory() as td:
+        download_directory_as_directory(
+            repo_id='narugo/test_cos5t_tars',
+            repo_type='dataset',
+            local_directory=td,
+        )
+        yield td
diff --git a/test/index/test_local_fetch.py b/test/index/test_local_fetch.py
new file mode 100644
index 0000000000..92f12a07d4
--- /dev/null
+++ b/test/index/test_local_fetch.py
@@ -0,0 +1,136 @@
+import os.path
+
+import pytest
+from hbutils.testing import isolated_directory
+from natsort import natsorted
+
+from hfutils.index import tar_list_files, tar_file_exists, tar_file_download, tar_file_info, \
+    tar_file_size
+from test.testings import get_testfile, file_compare
+
+
+@pytest.mark.unittest
+class TestIndexLocalFetch:
+    def test_tar_list_files(self, local_narugo_test_cos5t_tars):
+        files = tar_list_files(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+        )
+        assert len(files) == 17
+        assert natsorted(files) == [
+            '.meta.json', 'Bright_Voyager.png', 'Grail_League_1星.png', 'Grail_League_2星.png', 'Grail_League_3星.png',
+            'Grail_League_4星.png', 'Grail_League_5星.png', '奥特瑙斯.png', '奥特瑙斯_改建型.png', '常夏的泳装.png',
+            '常夏的泳装Ver_02.png', '愚人节.png', '愚人节_奥特瑙斯.png', '第1阶段.png', '第2阶段.png', '第3阶段.png',
+            '第4阶段.png'
+        ]
+
+    def test_tar_file_exists(self, local_narugo_test_cos5t_tars):
+        assert tar_file_exists(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='.meta.json'
+        )
+        assert tar_file_exists(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='愚人节_奥特瑙斯.png'
+        )
+        assert tar_file_exists(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='./愚人节_奥特瑙斯.png'
+        )
+        assert not tar_file_exists(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='愚人节奥特瑙斯.png'
+        )
+
+    def test_tar_file_info(self, local_narugo_test_cos5t_tars):
+        assert tar_file_info(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='.meta.json'
+        ) == {
+            'offset': 2725376,
+            'sha256': '4585b01c251a496b73cb231d29fc711cfb1d682a84334d95f6f5b6c1cc5b5222',
+            'size': 8968
+        }
+        assert tar_file_info(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='愚人节_奥特瑙斯.png'
+        ) == {
+            'offset': 3954176,
+            'sha256': '991497fa586f6f4529827e0f8f1f228c20ec9fb507c314ee9d20d47c46f26e89',
+            'size': 255276
+        }
+        assert tar_file_info(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='./愚人节_奥特瑙斯.png'
+        ) == {
+            'offset': 3954176,
+            'sha256': '991497fa586f6f4529827e0f8f1f228c20ec9fb507c314ee9d20d47c46f26e89',
+            'size': 255276
+        }
+        with pytest.raises(FileNotFoundError):
+            _ = tar_file_info(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='愚人节奥特瑙斯.png'
+            )
+
+    def test_tar_file_size(self, local_narugo_test_cos5t_tars):
+        assert tar_file_size(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='.meta.json'
+        ) == 8968
+        assert tar_file_size(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='愚人节_奥特瑙斯.png'
+        ) == 255276
+        assert tar_file_size(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='./愚人节_奥特瑙斯.png'
+        ) == 255276
+        with pytest.raises(FileNotFoundError):
+            _ = tar_file_size(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='愚人节奥特瑙斯.png'
+            )
+
+    def test_tar_file_download_small(self, local_narugo_test_cos5t_tars):
+        with isolated_directory():
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='.meta.json',
+                local_file='.meta.json'
+            )
+            file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json')
+
+    def test_tar_file_download_lfs(self, local_narugo_test_cos5t_tars):
+        with isolated_directory():
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='./愚人节_奥特瑙斯.png',
+                local_file='愚人节_奥特瑙斯.png'
+            )
+            file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png')
+
+    def test_tar_file_download_not_found(self, local_narugo_test_cos5t_tars):
+        with isolated_directory(), pytest.raises(FileNotFoundError):
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='./愚人节奥特瑙斯.png',
+                local_file='愚人节_奥特瑙斯.png'
+            )
+
+    def test_tar_file_download_subdir(self, local_narugo_test_cos5t_tars):
+        with isolated_directory():
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'ex3.tar'),
+                file_in_archive='artoria_caster_third_ascension_fate/sankaku_21305298.jpg',
+                local_file='f/ac.jpg'
+            )
+            file_compare(get_testfile('sankaku_21305298.jpg'), 'f/ac.jpg')
+
+    def test_tar_file_download_empty(self, local_narugo_test_cos5t_tars):
+        with isolated_directory():
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'empty_file.tar'),
+                file_in_archive='empty_file',
+                local_file='empty_file',
+            )
+            assert os.path.getsize('empty_file') == 0