diff --git a/cronjobs/src/commands/_git_export_git_tools.py b/cronjobs/src/commands/_git_export_git_tools.py index 986aabb9..a95b060a 100644 --- a/cronjobs/src/commands/_git_export_git_tools.py +++ b/cronjobs/src/commands/_git_export_git_tools.py @@ -32,7 +32,8 @@ def clone_or_fetch( if not repo.raw_listall_references(): print("No branches or tags found in the repository.") else: - print("Head was at", repo.head.target) + if not repo.head_is_unborn: + print("Head was at", repo.head.target) print(f"Fetching from {repo_url}...") remote.fetch(callbacks=callbacks, prune=True) else: diff --git a/cronjobs/src/commands/git_export.py b/cronjobs/src/commands/git_export.py index 8c378183..c194198d 100644 --- a/cronjobs/src/commands/git_export.py +++ b/cronjobs/src/commands/git_export.py @@ -132,7 +132,8 @@ def git_export(): if not repo.raw_listall_references(): print("No branches or tags found in the repository.") else: - print("Head is now at", repo.head.target) + if not repo.head_is_unborn: + print("Head is now at", repo.head.target) try: changed_attachments, changed_branches, created_tags = asyncio.run( @@ -506,7 +507,9 @@ def process_attachments( return changed_attachments, common_content -def changeset_to_branch_folder(changeset: dict[str, Any]) -> list[tuple[str, bytes]]: +def changeset_to_branch_folder( + branch_tree: pygit2.Tree | None, changeset: dict[str, Any] +) -> list[tuple[str, bytes]]: """ Convert a changeset to a list of files to be stored in the corresponding branch folder. """ @@ -516,6 +519,15 @@ def changeset_to_branch_folder(changeset: dict[str, Any]) -> list[tuple[str, byt records = sorted(changeset["changes"], key=lambda r: r["id"]) for record in records: branch_content.append((f"{cid}/{record['id']}.json", json_dumpb(record))) + + # Delete any records that were removed in this changeset. + # (branch_tree is None on first run, and `cid` folder may not exist yet) + if branch_tree is not None and cid in branch_tree: + for entry in branch_tree[cid]: + basename = entry.name.rsplit(".json", 1)[0] + if basename != "metadata" and basename not in {r["id"] for r in records}: + branch_content.append((f"{cid}/{entry.name}", None)) + return branch_content @@ -536,7 +548,9 @@ def initialize_bucket_branches( for bid, bucket_changesets in changesets_by_bucket.items(): branch_content: list[tuple[str, bytes]] = [] for changeset in bucket_changesets: - branch_content += changeset_to_branch_folder(changeset) + branch_content += changeset_to_branch_folder( + branch_tree=None, changeset=changeset + ) # Bucket branch does not exist yet, create it as an empty branch. empty_tree_id = repo.TreeBuilder().write() @@ -605,7 +619,7 @@ def update_bucket_branches( dtcollection = ts2dt(timestamp).isoformat() commit_message = f"{bid}/{cid}@{timestamp} ({dtcollection})" - branch_content = changeset_to_branch_folder(changeset) + branch_content = changeset_to_branch_folder(branch_tree, changeset) files_tree_id = tree_upsert_blobs( repo, branch_content, base_tree=branch_tree ) diff --git a/cronjobs/tests/commands/test_git_export.py b/cronjobs/tests/commands/test_git_export.py index 7055ea5f..a21800f3 100644 --- a/cronjobs/tests/commands/test_git_export.py +++ b/cronjobs/tests/commands/test_git_export.py @@ -205,10 +205,16 @@ def mock_rs_server_content(): ) -def read_file(repo, branch, filepath): - ref = f"refs/heads/{branch}" - branch_ref = repo.lookup_reference(ref) - commit = repo[branch_ref.target] +def read_file(repo, ref_or_branch_name, filepath): + if not ref_or_branch_name.startswith("refs/"): + ref_name = f"refs/heads/{ref_or_branch_name}" + else: + ref_name = ref_or_branch_name + ref = repo.lookup_reference(ref_name) + commit = repo[ref.target] + # If it's a tag, peel to commit + if commit.type == pygit2.GIT_OBJECT_TAG: + commit = commit.peel(pygit2.GIT_OBJECT_COMMIT) node = commit.tree for part in filepath.split("/"): entry = node[part] @@ -588,6 +594,65 @@ def test_repo_sync_stores_collections_records_in_buckets_branches_with_tags( assert '"attachment":{' in rid2.decode() +@responses.activate +def test_repo_sync_deletes_records_from_past_runs( + repo, + mock_git_fetch, + mock_ls_remotes, + mock_rs_server_content, + mock_github_lfs, + mock_git_push, +): + git_export.git_export() + simulate_pushed(repo, mock_ls_remotes) + + # File exists before next run (not raising). + read_file( + repo, "refs/tags/v1/timestamps/bid2/cid2/1600000000000", "cid2/rid2-1.json" + ) + + # Now simulate that cid2 deleted its record. + responses.replace( + responses.GET, + "http://testserver:9999/v1/buckets/monitor/collections/changes/changeset", + json={ + "timestamp": 1800000000000, + "changes": [ + { + "last_modified": 1800000000000, + "bucket": "bid2", + "collection": "cid2", + } + ], + }, + ) + responses.add( + responses.GET, + "http://testserver:9999/v1/buckets/bid2/collections/cid2/changeset", + json={ + "timestamp": 1800000000000, + "metadata": { + "bucket": "bid2", + "id": "cid2", + "signature": { + "x5u": "https://autograph.example.com/keys/123", + }, + "last_modified": 1888888888000, + }, + # Record was deleted (we don't use `_since`, so no tombstone) + "changes": [], + }, + ) + + git_export.git_export() + + # File not there anymore. + with pytest.raises(KeyError): + read_file( + repo, "refs/tags/v1/timestamps/bid2/cid2/1800000000000", "cid2/rid2-1.json" + ) + + @responses.activate def test_repo_sync_stores_attachments_as_lfs_pointers( repo,