Skip to content

Commit

Permalink
dev(narugo): add docs for tar indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
narugo1992 committed Apr 20, 2024
1 parent a68ebea commit dc33b1b
Show file tree
Hide file tree
Showing 7 changed files with 320 additions and 2 deletions.
51 changes: 51 additions & 0 deletions docs/source/api_doc/index/fetch.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
hfutils.index.fetch
================================

.. currentmodule:: hfutils.index.fetch

.. automodule:: hfutils.index.fetch


ArchiveStandaloneFileIncompleteDownload
----------------------------------------------

.. autoclass:: ArchiveStandaloneFileIncompleteDownload



ArchiveStandaloneFileHashNotMatch
----------------------------------------------

.. autoclass:: ArchiveStandaloneFileHashNotMatch



hf_tar_list_files
----------------------------------------------

.. autofunction:: hf_tar_list_files



hf_tar_file_download
----------------------------------------------

.. autofunction:: hf_tar_file_download



hf_tar_get_index
----------------------------------------------

.. autofunction:: hf_tar_get_index



hf_tar_file_exists
----------------------------------------------

.. autofunction:: hf_tar_file_exists




14 changes: 14 additions & 0 deletions docs/source/api_doc/index/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
hfutils.index
================================

.. currentmodule:: hfutils.index

.. automodule:: hfutils.index


.. toctree::
    :maxdepth: 3

    fetch
    make

36 changes: 36 additions & 0 deletions docs/source/api_doc/index/make.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
hfutils.index.make
================================

.. currentmodule:: hfutils.index.make

.. automodule:: hfutils.index.make


tar_create_index
----------------------------------------------

.. autofunction:: tar_create_index



hf_tar_create_index
----------------------------------------------

.. autofunction:: hf_tar_create_index



tar_get_index_info
----------------------------------------------

.. autofunction:: tar_get_index_info



hf_tar_create_from_directory
----------------------------------------------

.. autofunction:: hf_tar_create_from_directory



1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Overview
api_doc/archive/index
api_doc/config/index
api_doc/entry/index
api_doc/index/index
api_doc/operate/index
api_doc/utils/index

Expand Down
132 changes: 130 additions & 2 deletions hfutils/index/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,46 @@


class ArchiveStandaloneFileIncompleteDownload(Exception):
    """
    Exception raised when a standalone file in an archive is incompletely downloaded.
    """


class ArchiveStandaloneFileHashNotMatch(Exception):
    """
    Exception raised when the hash of a standalone file in an archive does not match.
    """


def hf_tar_get_index(repo_id: str, archive_in_repo: str,
repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None,
hf_token: Optional[str] = None):
"""
Get the index of a tar archive file in a Hugging Face repository.
:param repo_id: The identifier of the repository.
:type repo_id: str
:param archive_in_repo: The path to the archive file in the repository.
:type archive_in_repo: str
:param repo_type: The type of the Hugging Face repository.
:type repo_type: RepoTypeTyping, optional
:param revision: The revision of the repository.
:type revision: str, optional
:param idx_repo_id: The identifier of the index repository.
:type idx_repo_id: str, optional
:param idx_file_in_repo: The path to the index file in the index repository.
:type idx_file_in_repo: str, optional
:param idx_repo_type: The type of the index repository.
:type idx_repo_type: RepoTypeTyping, optional
:param idx_revision: The revision of the index repository.
:type idx_revision: str, optional
:param hf_token: The Hugging Face access token.
:type hf_token: str, optional
:return: The index of the tar archive file.
:rtype: Dict
"""
hf_client = get_hf_client(hf_token)
body, _ = os.path.splitext(archive_in_repo)
default_index_file = f'{body}.json'
Expand All @@ -39,6 +67,30 @@ def hf_tar_list_files(repo_id: str, archive_in_repo: str,
idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None,
hf_token: Optional[str] = None):
"""
List files inside a tar archive file in a Hugging Face repository.
:param repo_id: The identifier of the repository.
:type repo_id: str
:param archive_in_repo: The path to the archive file in the repository.
:type archive_in_repo: str
:param repo_type: The type of the Hugging Face repository.
:type repo_type: RepoTypeTyping, optional
:param revision: The revision of the repository.
:type revision: str, optional
:param idx_repo_id: The identifier of the index repository.
:type idx_repo_id: str, optional
:param idx_file_in_repo: The path to the index file in the index repository.
:type idx_file_in_repo: str, optional
:param idx_repo_type: The type of the index repository.
:type idx_repo_type: RepoTypeTyping, optional
:param idx_revision: The revision of the index repository.
:type idx_revision: str, optional
:param hf_token: The Hugging Face access token.
:type hf_token: str, optional
:return: The list of files inside the tar archive.
:rtype: List[str]
"""
index_data = hf_tar_get_index(
repo_id=repo_id,
archive_in_repo=archive_in_repo,
Expand All @@ -61,6 +113,32 @@ def hf_tar_file_exists(repo_id: str, archive_in_repo: str, file_in_archive: str,
idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None,
hf_token: Optional[str] = None):
"""
Check if a file exists inside a tar archive file in a Hugging Face repository.
:param repo_id: The identifier of the repository.
:type repo_id: str
:param archive_in_repo: The path to the archive file in the repository.
:type archive_in_repo: str
:param file_in_archive: The path to the file inside the archive.
:type file_in_archive: str
:param repo_type: The type of the Hugging Face repository.
:type repo_type: RepoTypeTyping, optional
:param revision: The revision of the repository.
:type revision: str, optional
:param idx_repo_id: The identifier of the index repository.
:type idx_repo_id: str, optional
:param idx_file_in_repo: The path to the index file in the index repository.
:type idx_file_in_repo: str, optional
:param idx_repo_type: The type of the index repository.
:type idx_repo_type: RepoTypeTyping, optional
:param idx_revision: The revision of the index repository.
:type idx_revision: str, optional
:param hf_token: The Hugging Face access token.
:type hf_token: str, optional
:return: True if the file exists, False otherwise.
:rtype: bool
"""
index = hf_tar_get_index(
repo_id=repo_id,
archive_in_repo=archive_in_repo,
Expand All @@ -79,10 +157,26 @@ def hf_tar_file_exists(repo_id: str, archive_in_repo: str, file_in_archive: str,


def _n_path(path):
"""
Normalize a file path.
:param path: The file path to normalize.
:type path: str
:return: The normalized file path.
:rtype: str
"""
return os.path.normpath(os.path.join('/', path))


def _hf_files_process(files: Dict[str, dict]):
    """
    Re-key a dictionary of files so that every key is a normalized path.

    Each key is passed through :func:`_n_path`; the values are carried over
    untouched. If several keys normalize to the same path, the value of the
    last one iterated wins.

    :param files: The dictionary of files, keyed by file path.
    :type files: Dict[str, dict]

    :return: A new dictionary with the same values and normalized path keys.
    :rtype: Dict[str, dict]
    """
    normalized = {}
    for name, info in files.items():
        normalized[_n_path(name)] = info
    return normalized


Expand All @@ -93,6 +187,40 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st
proxies: Optional[Dict] = None, user_agent: Union[Dict, str, None] = None,
headers: Optional[Dict[str, str]] = None, endpoint: Optional[str] = None,
hf_token: Optional[str] = None):
"""
Download a file from a tar archive file in a Hugging Face repository.
:param repo_id: The identifier of the repository.
:type repo_id: str
:param archive_in_repo: The path to the archive file in the repository.
:type archive_in_repo: str
:param file_in_archive: The path to the file inside the archive.
:type file_in_archive: str
:param local_file: The path to save the downloaded file locally.
:type local_file: str
:param repo_type: The type of the Hugging Face repository.
:type repo_type: RepoTypeTyping, optional
:param revision: The revision of the repository.
:type revision: str, optional
:param idx_repo_id: The identifier of the index repository.
:type idx_repo_id: str, optional
:param idx_file_in_repo: The path to the index file in the index repository.
:type idx_file_in_repo: str, optional
:param idx_repo_type: The type of the index repository.
:type idx_repo_type: RepoTypeTyping, optional
:param idx_revision: The revision of the index repository.
:type idx_revision: str, optional
:param proxies: The proxies to be used for the HTTP request.
:type proxies: Dict, optional
:param user_agent: The user agent for the HTTP request.
:type user_agent: Union[Dict, str, None], optional
:param headers: The additional headers for the HTTP request.
:type headers: Dict[str, str], optional
:param endpoint: The Hugging Face API endpoint.
:type endpoint: str, optional
:param hf_token: The Hugging Face access token.
:type hf_token: str, optional
"""
index = hf_tar_get_index(
repo_id=repo_id,
archive_in_repo=archive_in_repo,
Expand Down
10 changes: 10 additions & 0 deletions hfutils/index/hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@


def _f_sha256(file, chunk_for_hash: int = 1 << 20):
"""
Calculate the SHA-256 hash of a file.
:param file: The path to the file.
:type file: str
:param chunk_for_hash: The chunk size for hashing, defaults to 1 << 20 (1 MB).
:type chunk_for_hash: int, optional
:return: The SHA-256 hash of the file.
:rtype: str
"""
file_sha = sha256()
with open(file, 'rb') as f:
while True:
Expand Down
Loading

0 comments on commit dc33b1b

Please sign in to comment.