Skip to content

Commit

Permalink
dev(narugo): add idx for local directory
Browse files Browse the repository at this point in the history
  • Loading branch information
narugo1992 committed Aug 11, 2024
1 parent c6bfef1 commit 23c67bf
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 3 deletions.
7 changes: 7 additions & 0 deletions docs/source/api_doc/index/make.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ tar_create_index



tar_create_index_for_directory
----------------------------------------------

.. autofunction:: tar_create_index_for_directory



hf_tar_create_index
----------------------------------------------

Expand Down
3 changes: 2 additions & 1 deletion hfutils/index/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .fetch import hf_tar_list_files, hf_tar_file_download, hf_tar_get_index, hf_tar_file_exists, \
ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch, hf_tar_file_size, hf_tar_file_info
from .local_fetch import tar_get_index, tar_file_info, tar_file_download, tar_file_size, tar_file_exists, tar_list_files
from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory
from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory, \
tar_create_index_for_directory
from .validate import hf_tar_item_validate, hf_tar_validate
60 changes: 59 additions & 1 deletion hfutils/index/make.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,22 @@
"""
This module provides functionality for handling and indexing TAR archive files, especially for use with
the Hugging Face ecosystem. It includes functions to create and retrieve index information of TAR archives,
which is crucial for efficient data retrieval and management in large datasets. The module also integrates
with Hugging Face's repository system, allowing for operations like uploading and downloading TAR files
and their indices.

Key functionality includes:

- Extracting index information from TAR files.
- Creating index files for a single TAR archive or for every TAR archive in a directory.
- Integrating with Hugging Face repositories to manage TAR archives and their indices.

The module utilizes cryptographic hash functions for data integrity checks and supports operations on both local
and remote repositories. It is designed to work seamlessly with the Hugging Face platform, enabling users to
handle large datasets efficiently.
"""

import glob
import json
import logging
import os
Expand Down Expand Up @@ -47,7 +66,7 @@ def tar_get_index_info(src_tar_file, chunk_for_hash: int = 1 << 20, with_hash: b
logging.info(f'Indexing tar file {src_tar_file!r} ...')
files = {}
with tarfile.open(src_tar_file, mode='r|') as tar:
for tarinfo in tqdm(tar, desc='Indexing tar file ...', silent=silent):
for tarinfo in tqdm(tar, desc=f'Indexing tar file {src_tar_file!r} ...', silent=silent):
tarinfo: tarfile.TarInfo
if tarinfo.isreg():
info = {
Expand Down Expand Up @@ -90,11 +109,50 @@ def tar_create_index(src_tar_file, dst_index_file: Optional[str] = None,
"""
body, _ = os.path.splitext(src_tar_file)
dst_index_file = dst_index_file or f'{body}.json'
if os.path.dirname(dst_index_file):
os.makedirs(os.path.dirname(dst_index_file), exist_ok=True)
with open(dst_index_file, 'w') as f:
json.dump(tar_get_index_info(src_tar_file, chunk_for_hash, with_hash, silent), f)
return dst_index_file


def tar_create_index_for_directory(src_tar_directory: str, dst_index_directory: Optional[str] = None,
                                   chunk_for_hash: int = 1 << 20, with_hash: bool = True, silent: bool = False):
    """
    Build a JSON index for every ``.tar`` archive found under a directory.

    The source directory is searched recursively for ``*.tar`` files. Each archive is indexed via
    :func:`tar_create_index`, and the resulting index is written to the destination directory at the
    same relative path as the archive, with the extension replaced by ``.json``. When no destination
    directory is given, the indices are written into the source directory.

    :param src_tar_directory: Directory that is searched recursively for tar archives.
    :type src_tar_directory: str
    :param dst_index_directory: Root directory for the generated index files. Falls back to
        ``src_tar_directory`` when not provided.
    :type dst_index_directory: str, optional
    :param chunk_for_hash: Chunk size forwarded to :func:`tar_create_index`, defaults to ``1 << 20``.
    :type chunk_for_hash: int, optional
    :param with_hash: Forwarded to :func:`tar_create_index`, defaults to True.
    :type with_hash: bool, optional
    :param silent: Forwarded to the progress bar and to :func:`tar_create_index`, defaults to False.
    :type silent: bool, optional

    :return: The root directory the index files were written to.
    :rtype: str
    """
    if not dst_index_directory:
        dst_index_directory = src_tar_directory

    # '**' together with recursive=True also matches archives in nested sub-directories.
    tar_pattern = os.path.join(src_tar_directory, '**', '*.tar')
    archives = glob.glob(tar_pattern, recursive=True)

    for archive_path in tqdm(archives, silent=silent):
        # Mirror the archive's location relative to the source root under the destination root.
        relative_path = os.path.relpath(archive_path, src_tar_directory)
        mirrored_path = os.path.join(dst_index_directory, relative_path)
        index_path = f'{os.path.splitext(mirrored_path)[0]}.json'
        tar_create_index(
            src_tar_file=archive_path,
            dst_index_file=index_path,
            chunk_for_hash=chunk_for_hash,
            with_hash=with_hash,
            silent=silent,
        )

    return dst_index_directory


def hf_tar_create_index(repo_id: str, archive_in_repo: str,
repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
Expand Down
77 changes: 76 additions & 1 deletion test/index/test_make.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import glob
import json
import os.path

import pytest
from hbutils.testing import isolated_directory

from hfutils.index import tar_get_index_info, tar_create_index
from hfutils.index import tar_get_index_info, tar_create_index, tar_create_index_for_directory
from test.testings import get_testfile


Expand Down Expand Up @@ -93,6 +94,80 @@ def test_tar_create_index_subdir(self, raw_tar):
'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a'
}

def test_tar_create_index_for_directory(self, raw_tar):
    """Index files are written next to each tar when no destination directory is given."""
    # The same archive is placed at three nesting levels to exercise the recursive search.
    tar_layout = {
        os.path.join('subdir', 'raw.tar'): raw_tar,
        os.path.join('subdir', '1', 'raw.tar'): raw_tar,
        os.path.join('raw.tar'): raw_tar,
    }
    expected_index = {
        'files': {
            '1.txt': {
                'offset': 3584,
                'sha256': '57a67d463dde06dcf3bf3bd8382ebf5c8d6e0a854135914e215f09fc0e1080b9',
                'size': 13
            },
            'README.md': {
                'offset': 1536,
                'sha256': '75fae9f83087725e606ed7bf243a6655b1ddf583919529b3291980322b62af77',
                'size': 51
            },
            'subdir/script.py': {
                'offset': 5632,
                'sha256': '5c3086e72529e59e42002f11bbfabc40b084981daedb1a3d4a31623122fd8867',
                'size': 33
            }
        },
        'filesize': 10240,
        'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1',
        'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a'
    }
    expected_locations = (
        ('raw.json',),
        ('subdir', 'raw.json'),
        ('subdir', '1', 'raw.json'),
    )

    with isolated_directory(tar_layout):
        tar_create_index_for_directory('.')

        produced = glob.glob(os.path.join('**', '*.json'), recursive=True)
        assert len(produced) == 3
        for parts in expected_locations:
            with open(os.path.join(*parts), 'r') as f:
                assert json.load(f) == expected_index

def test_tar_create_index_for_directory_with_dst(self, raw_tar):
    """
    Index files are written under an explicit destination directory, mirroring the source layout.

    This test needs its own name: it previously shared ``test_tar_create_index_for_directory``
    with the in-place variant, and a second method of the same name in a class body replaces
    the first, so only one of the two tests was ever collected.
    """
    with isolated_directory({
        os.path.join('subdir', 'raw.tar'): raw_tar,
        os.path.join('subdir', '1', 'raw.tar'): raw_tar,
        os.path.join('raw.tar'): raw_tar,
    }):
        tar_create_index_for_directory('.', 'idx_dir')
        idx_data = {
            'files': {
                '1.txt': {
                    'offset': 3584,
                    'sha256': '57a67d463dde06dcf3bf3bd8382ebf5c8d6e0a854135914e215f09fc0e1080b9',
                    'size': 13
                },
                'README.md': {
                    'offset': 1536,
                    'sha256': '75fae9f83087725e606ed7bf243a6655b1ddf583919529b3291980322b62af77',
                    'size': 51
                },
                'subdir/script.py': {
                    'offset': 5632,
                    'sha256': '5c3086e72529e59e42002f11bbfabc40b084981daedb1a3d4a31623122fd8867',
                    'size': 33
                }
            },
            'filesize': 10240,
            'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1',
            'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a'
        }
        # Only indices under the destination root are counted; the sources stay untouched.
        assert len(glob.glob(os.path.join('idx_dir', '**', '*.json'), recursive=True)) == 3
        with open(os.path.join('idx_dir', 'raw.json'), 'r') as f:
            assert json.load(f) == idx_data
        with open(os.path.join('idx_dir', 'subdir', 'raw.json'), 'r') as f:
            assert json.load(f) == idx_data
        with open(os.path.join('idx_dir', 'subdir', '1', 'raw.json'), 'r') as f:
            assert json.load(f) == idx_data

def test_tar_create_index_subdir_no_hash(self, raw_tar):
with isolated_directory({os.path.join('subdir', 'raw.tar'): raw_tar}):
tar_create_index(os.path.join('subdir', 'raw.tar'), with_hash=False)
Expand Down

0 comments on commit 23c67bf

Please sign in to comment.