Skip to content

Commit

Permalink
[HfApi] prevent empty commits when copying files (#2730)
Browse files Browse the repository at this point in the history
* detect no-op operations when copying files

* Add comments

* fixes post-review
  • Loading branch information
hanouticelina authored Jan 14, 2025
1 parent 70f3a7a commit b2c9a14
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 12 deletions.
35 changes: 32 additions & 3 deletions src/huggingface_hub/_commit_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ class CommitOperationCopy:
src_path_in_repo: str
path_in_repo: str
src_revision: Optional[str] = None
# set to the OID of the file to be copied if it has already been uploaded
# useful to determine if a commit will be empty or not.
_src_oid: Optional[str] = None
# set to the OID of the file to copy to if it has already been uploaded
# useful to determine if a commit will be empty or not.
_dest_oid: Optional[str] = None

def __post_init__(self):
self.src_path_in_repo = _validate_path_in_repo(self.src_path_in_repo)
Expand Down Expand Up @@ -595,19 +601,38 @@ def _fetch_files_to_copy(

hf_api = HfApi(endpoint=endpoint, headers=headers)
files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
# Store (path, revision) -> oid mapping
oid_info: Dict[Tuple[str, Optional[str]], Optional[str]] = {}
# 1. Fetch OIDs for destination paths in batches.
dest_paths = [op.path_in_repo for op in copies]
for offset in range(0, len(dest_paths), FETCH_LFS_BATCH_SIZE):
dest_repo_files = hf_api.get_paths_info(
repo_id=repo_id,
paths=dest_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
revision=revision,
repo_type=repo_type,
)
for file in dest_repo_files:
if not isinstance(file, RepoFolder):
oid_info[(file.path, revision)] = file.blob_id

# 2. Group by source revision and fetch source file info in batches.
for src_revision, operations in groupby(copies, key=lambda op: op.src_revision):
operations = list(operations) # type: ignore
paths = [op.src_path_in_repo for op in operations]
for offset in range(0, len(paths), FETCH_LFS_BATCH_SIZE):
src_paths = [op.src_path_in_repo for op in operations]
for offset in range(0, len(src_paths), FETCH_LFS_BATCH_SIZE):
src_repo_files = hf_api.get_paths_info(
repo_id=repo_id,
paths=paths[offset : offset + FETCH_LFS_BATCH_SIZE],
paths=src_paths[offset : offset + FETCH_LFS_BATCH_SIZE],
revision=src_revision or revision,
repo_type=repo_type,
)

for src_repo_file in src_repo_files:
if isinstance(src_repo_file, RepoFolder):
raise NotImplementedError("Copying a folder is not implemented.")
oid_info[(src_repo_file.path, src_revision)] = src_repo_file.blob_id
# If it's an LFS file, store the RepoFile object. Otherwise, download raw bytes.
if src_repo_file.lfs:
files_to_copy[(src_repo_file.path, src_revision)] = src_repo_file
else:
Expand All @@ -622,12 +647,16 @@ def _fetch_files_to_copy(
response = get_session().get(url, headers=headers)
hf_raise_for_status(response)
files_to_copy[(src_repo_file.path, src_revision)] = response.content
# 3. Ensure all operations found a corresponding file in the Hub
# and track src/dest OIDs for each operation.
for operation in operations:
if (operation.src_path_in_repo, src_revision) not in files_to_copy:
raise EntryNotFoundError(
f"Cannot copy {operation.src_path_in_repo} at revision "
f"{src_revision or revision}: file is missing on repo."
)
operation._src_oid = oid_info.get((operation.src_path_in_repo, operation.src_revision))
operation._dest_oid = oid_info.get((operation.path_in_repo, revision))
return files_to_copy


Expand Down
26 changes: 18 additions & 8 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4019,6 +4019,14 @@ def create_commit(
free_memory=False, # do not remove `CommitOperationAdd.path_or_fileobj` on LFS files for "normal" users
)

files_to_copy = _fetch_files_to_copy(
copies=copies,
repo_type=repo_type,
repo_id=repo_id,
headers=headers,
revision=unquoted_revision,
endpoint=self.endpoint,
)
# Remove no-op operations (files that have not changed)
operations_without_no_op = []
for operation in operations:
Expand All @@ -4030,6 +4038,16 @@ def create_commit(
# File already exists on the Hub and has not changed: we can skip it.
logger.debug(f"Skipping upload for '{operation.path_in_repo}' as the file has not changed.")
continue
if (
isinstance(operation, CommitOperationCopy)
and operation._dest_oid is not None
and operation._dest_oid == operation._src_oid
):
# Source and destination files are identical - skip
logger.debug(
f"Skipping copy for '{operation.src_path_in_repo}' -> '{operation.path_in_repo}' as the content of the source file is the same as the destination file."
)
continue
operations_without_no_op.append(operation)
if len(operations) != len(operations_without_no_op):
logger.info(
Expand Down Expand Up @@ -4058,14 +4076,6 @@ def create_commit(
oid=info.sha, # type: ignore[arg-type]
)

files_to_copy = _fetch_files_to_copy(
copies=copies,
repo_type=repo_type,
repo_id=repo_id,
headers=headers,
revision=unquoted_revision,
endpoint=self.endpoint,
)
commit_payload = _prepare_commit_payload(
operations=operations,
files_to_copy=files_to_copy,
Expand Down
112 changes: 111 additions & 1 deletion tests/test_hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,37 @@ def test_prevent_empty_commit_if_no_new_addition(self, repo_url: RepoUrl) -> Non
)
assert logs.records[1].levelname == "WARNING"

@use_tmp_repo()
def test_prevent_empty_commit_if_no_new_copy(self, repo_url: RepoUrl) -> None:
# Add 2 regular identical files and 2 LFS identical files
self._api.create_commit(
repo_id=repo_url.repo_id,
commit_message="initial commit",
operations=[
CommitOperationAdd(path_or_fileobj=b"Regular file content", path_in_repo="file.txt"),
CommitOperationAdd(path_or_fileobj=b"Regular file content", path_in_repo="file_copy.txt"),
CommitOperationAdd(path_or_fileobj=b"LFS content", path_in_repo="lfs.bin"),
CommitOperationAdd(path_or_fileobj=b"LFS content", path_in_repo="lfs_copy.bin"),
],
)
with self.assertLogs("huggingface_hub", level="INFO") as logs:
self._api.create_commit(
repo_id=repo_url.repo_id,
commit_message="Empty commit",
operations=[
CommitOperationCopy(src_path_in_repo="file.txt", path_in_repo="file_copy.txt"),
CommitOperationCopy(src_path_in_repo="lfs.bin", path_in_repo="lfs_copy.bin"),
],
)
assert logs.records[0].message == "Removing 2 file(s) from commit that have not changed."
assert logs.records[0].levelname == "INFO"

assert (
logs.records[1].message
== "No files have been modified since last commit. Skipping to prevent empty commit."
)
assert logs.records[1].levelname == "WARNING"

@use_tmp_repo()
def test_empty_commit_on_pr(self, repo_url: RepoUrl) -> None:
"""
Expand Down Expand Up @@ -1191,13 +1222,86 @@ def test_continue_commit_without_existing_files(self, repo_url: RepoUrl) -> None
assert paths_info["lfs2.bin"].title == "second commit"
assert paths_info["lfs3.bin"].title == "second commit"

@use_tmp_repo()
def test_continue_commit_if_copy_is_identical(self, repo_url: RepoUrl) -> None:
self._api.create_commit(
repo_id=repo_url.repo_id,
commit_message="initial commit",
operations=[
CommitOperationAdd(path_or_fileobj=b"content 1.0", path_in_repo="file.txt"),
CommitOperationAdd(path_or_fileobj=b"content 1.0", path_in_repo="file_copy.txt"),
CommitOperationAdd(path_or_fileobj=b"content 2.0", path_in_repo="file2.txt"),
CommitOperationAdd(path_or_fileobj=b"LFS content 1.0", path_in_repo="lfs.bin"),
CommitOperationAdd(path_or_fileobj=b"LFS content 1.0", path_in_repo="lfs_copy.bin"),
CommitOperationAdd(path_or_fileobj=b"LFS content 2.0", path_in_repo="lfs2.bin"),
],
)
with self.assertLogs("huggingface_hub", level="DEBUG") as logs:
self._api.create_commit(
repo_id=repo_url.repo_id,
commit_message="second commit",
operations=[
# Did not change => will be removed from commit
CommitOperationCopy(src_path_in_repo="file.txt", path_in_repo="file_copy.txt"),
# Change => will be kept
CommitOperationCopy(src_path_in_repo="file2.txt", path_in_repo="file.txt"),
# New file => will be kept
CommitOperationCopy(src_path_in_repo="file2.txt", path_in_repo="file3.txt"),
# Did not change => will be removed from commit
CommitOperationCopy(src_path_in_repo="lfs.bin", path_in_repo="lfs_copy.bin"),
# Change => will be kept
CommitOperationCopy(src_path_in_repo="lfs2.bin", path_in_repo="lfs.bin"),
# New file => will be kept
CommitOperationCopy(src_path_in_repo="lfs2.bin", path_in_repo="lfs3.bin"),
],
)
debug_logs = [log.message for log in logs.records if log.levelname == "DEBUG"]
info_logs = [log.message for log in logs.records if log.levelname == "INFO"]
warning_logs = [log.message for log in logs.records if log.levelname == "WARNING"]

assert (
"Skipping copy for 'file.txt' -> 'file_copy.txt' as the content of the source file is the same as the destination file."
in debug_logs
)
assert (
"Skipping copy for 'lfs.bin' -> 'lfs_copy.bin' as the content of the source file is the same as the destination file."
in debug_logs
)
assert "Removing 2 file(s) from commit that have not changed." in info_logs
assert len(warning_logs) == 0 # no warnings since the commit is not empty

paths_info = {
item.path: item.last_commit
for item in self._api.get_paths_info(
repo_id=repo_url.repo_id,
paths=[
"file.txt",
"file_copy.txt",
"file3.txt",
"lfs.bin",
"lfs_copy.bin",
"lfs3.bin",
],
expand=True,
)
}

# Check which files are in the last commit
assert paths_info["file.txt"].title == "second commit"
assert paths_info["file_copy.txt"].title == "initial commit"
assert paths_info["file3.txt"].title == "second commit"
assert paths_info["lfs.bin"].title == "second commit"
assert paths_info["lfs_copy.bin"].title == "initial commit"
assert paths_info["lfs3.bin"].title == "second commit"

@use_tmp_repo()
def test_continue_commit_if_only_deletion(self, repo_url: RepoUrl) -> None:
self._api.create_commit(
repo_id=repo_url.repo_id,
commit_message="initial commit",
operations=[
CommitOperationAdd(path_or_fileobj=b"content 1.0", path_in_repo="file.txt"),
CommitOperationAdd(path_or_fileobj=b"content 1.0", path_in_repo="file_copy.txt"),
CommitOperationAdd(path_or_fileobj=b"content 2.0", path_in_repo="file2.txt"),
],
)
Expand All @@ -1208,6 +1312,8 @@ def test_continue_commit_if_only_deletion(self, repo_url: RepoUrl) -> None:
operations=[
# Did not change => will be removed from commit
CommitOperationAdd(path_or_fileobj=b"content 1.0", path_in_repo="file.txt"),
# identical to file.txt => will be removed from commit
CommitOperationCopy(src_path_in_repo="file.txt", path_in_repo="file_copy.txt"),
# Delete operation => kept in any case
CommitOperationDelete(path_in_repo="file2.txt"),
],
Expand All @@ -1217,7 +1323,11 @@ def test_continue_commit_if_only_deletion(self, repo_url: RepoUrl) -> None:
warning_logs = [log.message for log in logs.records if log.levelname == "WARNING"]

assert "Skipping upload for 'file.txt' as the file has not changed." in debug_logs
assert "Removing 1 file(s) from commit that have not changed." in info_logs
assert (
"Skipping copy for 'file.txt' -> 'file_copy.txt' as the content of the source file is the same as the destination file."
in debug_logs
)
assert "Removing 2 file(s) from commit that have not changed." in info_logs
assert len(warning_logs) == 0 # no warnings since the commit is not empty

remote_files = self._api.list_repo_files(repo_id=repo_url.repo_id)
Expand Down

0 comments on commit b2c9a14

Please sign in to comment.