Skip to content

Commit

Permalink
Merge pull request #43 from deepghs/dev/download
Browse files Browse the repository at this point in the history
dev(narugo): add lazy download mode
  • Loading branch information
narugo1992 authored Sep 1, 2024
2 parents c6786f4 + 9ee1475 commit 76d9879
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 2 deletions.
13 changes: 12 additions & 1 deletion hfutils/index/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st
idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None,
proxies: Optional[Dict] = None, user_agent: Union[Dict, str, None] = None,
headers: Optional[Dict[str, str]] = None, endpoint: Optional[str] = None,
hf_token: Optional[str] = None):
force_download: bool = False, hf_token: Optional[str] = None):
"""
Download a file from a tar archive file in a Hugging Face repository.
Expand Down Expand Up @@ -470,6 +470,10 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st
:type headers: Dict[str, str], optional
:param endpoint: The Hugging Face API endpoint.
:type endpoint: str, optional
:param force_download: Force download the file to destination path.
Defaults to `False`, in which case downloading is skipped if the local file
already matches the expected file (same size and, when recorded in the index, same SHA-256).
:type force_download: bool
:param hf_token: The Hugging Face access token.
:type hf_token: str, optional
:raises FileNotFoundError: Raise this when file not exist in tar archive.
Expand Down Expand Up @@ -533,6 +537,13 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st
end_bytes = info['offset'] + info['size'] - 1
headers['Range'] = f'bytes={start_bytes}-{end_bytes}'

if not force_download and os.path.exists(local_file) and \
os.path.isfile(local_file) and os.path.getsize(local_file) == info['size']:
_expected_sha256 = info.get('sha256')
if not _expected_sha256 or _f_sha256(local_file) == _expected_sha256:
# file already ready, no need to download it again
return

if os.path.dirname(local_file):
os.makedirs(os.path.dirname(local_file), exist_ok=True)
try:
Expand Down
13 changes: 12 additions & 1 deletion hfutils/index/local_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def tar_file_size(archive_file: str, file_in_archive: str, idx_file: Optional[st


def tar_file_download(archive_file: str, file_in_archive: str, local_file: str,
idx_file: Optional[str] = None, chunk_size: int = 1 << 20):
idx_file: Optional[str] = None, chunk_size: int = 1 << 20, force_download: bool = False):
"""
Extract and download a specific file from the tar archive to a local file.
Expand All @@ -190,6 +190,10 @@ def tar_file_download(archive_file: str, file_in_archive: str, local_file: str,
:type idx_file: Optional[str]
:param chunk_size: The size of chunks to read and write, in bytes. Default is 1MB.
:type chunk_size: int
:param force_download: Force download the file to destination path.
Defaults to `False`, in which case downloading is skipped if the local file
already matches the expected file (same size and, when recorded in the index, same SHA-256).
:type force_download: bool
:raises FileNotFoundError: If the specified file is not found in the archive.
:raises ArchiveStandaloneFileIncompleteDownload: If the downloaded file size doesn't match the expected size.
Expand All @@ -212,6 +216,13 @@ def tar_file_download(archive_file: str, file_in_archive: str, local_file: str,

info = files[_n_path(file_in_archive)]

if not force_download and os.path.exists(local_file) and \
os.path.isfile(local_file) and os.path.getsize(local_file) == info['size']:
_expected_sha256 = info.get('sha256')
if not _expected_sha256 or _f_sha256(local_file) == _expected_sha256:
# file already ready, no need to download it again
return

if os.path.dirname(local_file):
os.makedirs(os.path.dirname(local_file), exist_ok=True)
try:
Expand Down
48 changes: 48 additions & 0 deletions test/index/test_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,30 @@ def test_hf_tar_file_download_small(self):
)
file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json')

def test_hf_tar_file_download_small_exist(self):
# "Already exists" case: the destination name is seeded with the same test file
# that the result is compared against below, and `force_download` is left at its default.
# NOTE(review): presumably `isolated_directory` places the mapped source file into a
# temporary working directory under the given name -- confirm against the helper.
with isolated_directory({
'.meta.json': get_testfile('skin_mashu', '.meta.json')
}):
hf_tar_file_download(
repo_id='narugo/test_cos5t_tars',
archive_in_repo='mashu_skins.tar',
file_in_archive='.meta.json',
local_file='.meta.json'
)
# The local file must still equal the reference file after the call.
file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json')

def test_hf_tar_file_download_small_replace(self):
# "Replace" case: the destination name '.meta.json' is seeded with a different test
# file (the PNG), so the pre-existing local content does not match the archive entry.
# NOTE(review): presumably `isolated_directory` places the mapped source file into a
# temporary working directory under the given name -- confirm against the helper.
with isolated_directory({
'.meta.json': get_testfile('skin_mashu', '愚人节_奥特瑙斯.png')
}):
hf_tar_file_download(
repo_id='narugo/test_cos5t_tars',
archive_in_repo='mashu_skins.tar',
file_in_archive='.meta.json',
local_file='.meta.json'
)
# After the call the stale content must have been replaced by the reference file.
file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json')

def test_hf_tar_file_download_lfs(self):
with isolated_directory():
hf_tar_file_download(
Expand All @@ -124,6 +148,30 @@ def test_hf_tar_file_download_lfs(self):
)
file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png')

def test_hf_tar_file_download_lfs_exist(self):
# "Already exists" case for the PNG entry: the destination is seeded with the same
# test file that the result is compared against, and `force_download` is left at its default.
# NOTE(review): presumably `isolated_directory` places the mapped source file into a
# temporary working directory under the given name -- confirm against the helper.
with isolated_directory({
'愚人节_奥特瑙斯.png': get_testfile('skin_mashu', '愚人节_奥特瑙斯.png')
}):
hf_tar_file_download(
repo_id='narugo/test_cos5t_tars',
archive_in_repo='mashu_skins.tar',
file_in_archive='./愚人节_奥特瑙斯.png',
local_file='愚人节_奥特瑙斯.png'
)
# The local file must still equal the reference file after the call.
file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png')

def test_hf_tar_file_download_lfs_replace(self):
# "Replace" case for the PNG entry: the destination name is seeded with a different
# test file ('.meta.json'), so the pre-existing local content does not match the archive entry.
# NOTE(review): presumably `isolated_directory` places the mapped source file into a
# temporary working directory under the given name -- confirm against the helper.
with isolated_directory({
'愚人节_奥特瑙斯.png': get_testfile('skin_mashu', '.meta.json')
}):
hf_tar_file_download(
repo_id='narugo/test_cos5t_tars',
archive_in_repo='mashu_skins.tar',
file_in_archive='./愚人节_奥特瑙斯.png',
local_file='愚人节_奥特瑙斯.png'
)
# After the call the stale content must have been replaced by the reference file.
file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png')

def test_hf_tar_file_download_not_found(self):
with isolated_directory(), pytest.raises(FileNotFoundError):
hf_tar_file_download(
Expand Down
44 changes: 44 additions & 0 deletions test/index/test_local_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,28 @@ def test_tar_file_download_small(self, local_narugo_test_cos5t_tars):
)
file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json')

def test_tar_file_download_small_exist(self, local_narugo_test_cos5t_tars):
# "Already exists" case against a local archive: the destination is seeded with the
# same test file that the result is compared against, and `force_download` is left at its default.
# NOTE(review): `local_narugo_test_cos5t_tars` is used as a directory containing
# 'mashu_skins.tar'; presumably a fixture providing a local copy of the repo -- confirm.
with isolated_directory({
'.meta.json': get_testfile('skin_mashu', '.meta.json')
}):
tar_file_download(
archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
file_in_archive='.meta.json',
local_file='.meta.json'
)
# The local file must still equal the reference file after the call.
file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json')

def test_tar_file_download_small_replace(self, local_narugo_test_cos5t_tars):
# "Replace" case against a local archive: the destination name '.meta.json' is seeded
# with a different test file (the PNG), so the pre-existing content does not match the entry.
# NOTE(review): `local_narugo_test_cos5t_tars` is used as a directory containing
# 'mashu_skins.tar'; presumably a fixture providing a local copy of the repo -- confirm.
with isolated_directory({
'.meta.json': get_testfile('skin_mashu', '愚人节_奥特瑙斯.png')
}):
tar_file_download(
archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
file_in_archive='.meta.json',
local_file='.meta.json'
)
# After the call the stale content must have been replaced by the reference file.
file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json')

def test_tar_file_download_lfs(self, local_narugo_test_cos5t_tars):
with isolated_directory():
tar_file_download(
Expand All @@ -109,6 +131,28 @@ def test_tar_file_download_lfs(self, local_narugo_test_cos5t_tars):
)
file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png')

def test_tar_file_download_lfs_exist(self, local_narugo_test_cos5t_tars):
# "Already exists" case for the PNG entry of a local archive: the destination is seeded
# with the same test file that the result is compared against; `force_download` stays default.
# NOTE(review): `local_narugo_test_cos5t_tars` is used as a directory containing
# 'mashu_skins.tar'; presumably a fixture providing a local copy of the repo -- confirm.
with isolated_directory({
'愚人节_奥特瑙斯.png': get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'),
}):
tar_file_download(
archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
file_in_archive='./愚人节_奥特瑙斯.png',
local_file='愚人节_奥特瑙斯.png',
)
# The local file must still equal the reference file after the call.
file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png')

def test_tar_file_download_lfs_replace(self, local_narugo_test_cos5t_tars):
# "Replace" case for the PNG entry of a local archive: the destination name is seeded
# with a different test file ('.meta.json'), so the pre-existing content does not match the entry.
# NOTE(review): `local_narugo_test_cos5t_tars` is used as a directory containing
# 'mashu_skins.tar'; presumably a fixture providing a local copy of the repo -- confirm.
with isolated_directory({
'愚人节_奥特瑙斯.png': get_testfile('skin_mashu', '.meta.json'),
}):
tar_file_download(
archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
file_in_archive='./愚人节_奥特瑙斯.png',
local_file='愚人节_奥特瑙斯.png',
)
# After the call the stale content must have been replaced by the reference file.
file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png')

def test_tar_file_download_not_found(self, local_narugo_test_cos5t_tars):
with isolated_directory(), pytest.raises(FileNotFoundError):
tar_file_download(
Expand Down

0 comments on commit 76d9879

Please sign in to comment.