diff --git a/docs/source/api_doc/index/make.rst b/docs/source/api_doc/index/make.rst
index e9888d4fb6..2a230b0f4d 100644
--- a/docs/source/api_doc/index/make.rst
+++ b/docs/source/api_doc/index/make.rst
@@ -13,6 +13,13 @@ tar_create_index
 
 
 
+tar_create_index_for_directory
+----------------------------------------------
+
+.. autofunction:: tar_create_index_for_directory
+
+
+
 hf_tar_create_index
 ----------------------------------------------
 
diff --git a/hfutils/index/__init__.py b/hfutils/index/__init__.py
index c01f5c74af..55fa3a5fec 100644
--- a/hfutils/index/__init__.py
+++ b/hfutils/index/__init__.py
@@ -1,5 +1,6 @@
 from .fetch import hf_tar_list_files, hf_tar_file_download, hf_tar_get_index, hf_tar_file_exists, \
     ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch, hf_tar_file_size, hf_tar_file_info
 from .local_fetch import tar_get_index, tar_file_info, tar_file_download, tar_file_size, tar_file_exists, tar_list_files
-from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory
+from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory, \
+    tar_create_index_for_directory
 from .validate import hf_tar_item_validate, hf_tar_validate
diff --git a/hfutils/index/make.py b/hfutils/index/make.py
index 41338e1352..c1d61bb297 100644
--- a/hfutils/index/make.py
+++ b/hfutils/index/make.py
@@ -1,3 +1,22 @@
+"""
+This module provides functionalities for handling and indexing TAR archive files, especially for use with
+the Hugging Face ecosystem. It includes functions to create and retrieve index information of TAR archives,
+which is crucial for efficient data retrieval and management in large datasets. The module also integrates
+with Hugging Face's repository system, allowing for operations like uploading and downloading TAR files
+and their indices.
+
+Key functionalities include:
+
+- Extracting index information from TAR files.
+- Creating index files for TAR archives locally or in a directory.
+- Integrating with Hugging Face repositories to manage TAR archives and their indices.
+
+The module utilizes cryptographic hash functions for data integrity checks and supports operations on both local
+and remote repositories. It is designed to work seamlessly with the Hugging Face platform, enabling users to
+handle large datasets efficiently.
+"""
+
+import glob
 import json
 import logging
 import os
@@ -47,7 +66,7 @@ def tar_get_index_info(src_tar_file, chunk_for_hash: int = 1 << 20, with_hash: b
     logging.info(f'Indexing tar file {src_tar_file!r} ...')
     files = {}
     with tarfile.open(src_tar_file, mode='r|') as tar:
-        for tarinfo in tqdm(tar, desc='Indexing tar file ...', silent=silent):
+        for tarinfo in tqdm(tar, desc=f'Indexing tar file {src_tar_file!r} ...', silent=silent):
             tarinfo: tarfile.TarInfo
             if tarinfo.isreg():
                 info = {
@@ -90,11 +109,50 @@ def tar_create_index(src_tar_file, dst_index_file: Optional[str] = None,
     """
     body, _ = os.path.splitext(src_tar_file)
     dst_index_file = dst_index_file or f'{body}.json'
+    if os.path.dirname(dst_index_file):
+        os.makedirs(os.path.dirname(dst_index_file), exist_ok=True)
     with open(dst_index_file, 'w') as f:
         json.dump(tar_get_index_info(src_tar_file, chunk_for_hash, with_hash, silent), f)
     return dst_index_file
 
 
+def tar_create_index_for_directory(src_tar_directory: str, dst_index_directory: Optional[str] = None,
+                                   chunk_for_hash: int = 1 << 20, with_hash: bool = True, silent: bool = False):
+    """
+    Create index files for all tar archives in a specified directory.
+
+    This function recursively scans the given directory for ``*.tar`` files, generates an index for each,
+    and saves these indices under the destination directory, mirroring the relative layout of the tar files.
+    If no destination directory is provided, indices are saved next to the tar files.
+
+    :param src_tar_directory: The path to the directory containing tar files.
+    :type src_tar_directory: str
+    :param dst_index_directory: The path to the directory where index files will be saved, defaults to the same as src_tar_directory.
+    :type dst_index_directory: str, optional
+    :param chunk_for_hash: The chunk size for hashing, defaults to 1 << 20 (1 MB).
+    :type chunk_for_hash: int, optional
+    :param with_hash: Whether to include file hashes in the index, defaults to True.
+    :type with_hash: bool, optional
+    :param silent: Whether to suppress progress bars and logging messages, defaults to False.
+    :type silent: bool, optional
+    :return: The path to the directory where index files are saved.
+    :rtype: str
+    """
+    dst_index_directory = dst_index_directory or src_tar_directory
+    for tar_file in tqdm(glob.glob(os.path.join(src_tar_directory, '**', '*.tar'), recursive=True), silent=silent):
+        p_idx_file = os.path.join(dst_index_directory, os.path.relpath(tar_file, src_tar_directory))
+        idx_body, _ = os.path.splitext(p_idx_file)
+        idx_file = f'{idx_body}.json'
+        tar_create_index(
+            src_tar_file=tar_file,
+            dst_index_file=idx_file,
+            chunk_for_hash=chunk_for_hash,
+            with_hash=with_hash,
+            silent=silent,
+        )
+    return dst_index_directory
+
+
 def hf_tar_create_index(repo_id: str, archive_in_repo: str,
                         repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
                         idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
diff --git a/test/index/test_make.py b/test/index/test_make.py
index 341c3304f3..298f104763 100644
--- a/test/index/test_make.py
+++ b/test/index/test_make.py
@@ -1,10 +1,11 @@
+import glob
 import json
 import os.path
 
 import pytest
 from hbutils.testing import isolated_directory
 
-from hfutils.index import tar_get_index_info, tar_create_index
+from hfutils.index import tar_get_index_info, tar_create_index, tar_create_index_for_directory
 from test.testings import get_testfile
 
 
@@ -93,6 +94,80 @@ def test_tar_create_index_subdir(self, raw_tar):
                     'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a'
                 }
 
+    def test_tar_create_index_for_directory(self, raw_tar):
+        with isolated_directory({
+            os.path.join('subdir', 'raw.tar'): raw_tar,
+            os.path.join('subdir', '1', 'raw.tar'): raw_tar,
+            os.path.join('raw.tar'): raw_tar,
+        }):
+            tar_create_index_for_directory('.')
+            idx_data = {
+                'files': {
+                    '1.txt': {
+                        'offset': 3584,
+                        'sha256': '57a67d463dde06dcf3bf3bd8382ebf5c8d6e0a854135914e215f09fc0e1080b9',
+                        'size': 13
+                    },
+                    'README.md': {
+                        'offset': 1536,
+                        'sha256': '75fae9f83087725e606ed7bf243a6655b1ddf583919529b3291980322b62af77',
+                        'size': 51
+                    },
+                    'subdir/script.py': {
+                        'offset': 5632,
+                        'sha256': '5c3086e72529e59e42002f11bbfabc40b084981daedb1a3d4a31623122fd8867',
+                        'size': 33
+                    }
+                },
+                'filesize': 10240,
+                'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1',
+                'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a'
+            }
+            assert len(glob.glob(os.path.join('**', '*.json'), recursive=True)) == 3
+            with open(os.path.join('raw.json'), 'r') as f:
+                assert json.load(f) == idx_data
+            with open(os.path.join('subdir', 'raw.json'), 'r') as f:
+                assert json.load(f) == idx_data
+            with open(os.path.join('subdir', '1', 'raw.json'), 'r') as f:
+                assert json.load(f) == idx_data
+
+    def test_tar_create_index_for_directory_with_dst_dir(self, raw_tar):
+        with isolated_directory({
+            os.path.join('subdir', 'raw.tar'): raw_tar,
+            os.path.join('subdir', '1', 'raw.tar'): raw_tar,
+            os.path.join('raw.tar'): raw_tar,
+        }):
+            tar_create_index_for_directory('.', 'idx_dir')
+            idx_data = {
+                'files': {
+                    '1.txt': {
+                        'offset': 3584,
+                        'sha256': '57a67d463dde06dcf3bf3bd8382ebf5c8d6e0a854135914e215f09fc0e1080b9',
+                        'size': 13
+                    },
+                    'README.md': {
+                        'offset': 1536,
+                        'sha256': '75fae9f83087725e606ed7bf243a6655b1ddf583919529b3291980322b62af77',
+                        'size': 51
+                    },
+                    'subdir/script.py': {
+                        'offset': 5632,
+                        'sha256': '5c3086e72529e59e42002f11bbfabc40b084981daedb1a3d4a31623122fd8867',
+                        'size': 33
+                    }
+                },
+                'filesize': 10240,
+                'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1',
+                'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a'
+            }
+            assert len(glob.glob(os.path.join('idx_dir', '**', '*.json'), recursive=True)) == 3
+            with open(os.path.join('idx_dir', 'raw.json'), 'r') as f:
+                assert json.load(f) == idx_data
+            with open(os.path.join('idx_dir', 'subdir', 'raw.json'), 'r') as f:
+                assert json.load(f) == idx_data
+            with open(os.path.join('idx_dir', 'subdir', '1', 'raw.json'), 'r') as f:
+                assert json.load(f) == idx_data
+
     def test_tar_create_index_subdir_no_hash(self, raw_tar):
         with isolated_directory({os.path.join('subdir', 'raw.tar'): raw_tar}):
             tar_create_index(os.path.join('subdir', 'raw.tar'), with_hash=False)