From e8be7414a66c25a8443252a45448bdc478458689 Mon Sep 17 00:00:00 2001
From: Mathieu Leplatre
Date: Wed, 14 Jan 2026 19:51:57 +0100
Subject: [PATCH 1/3] Fix display of referenced attachments

---
 bin/check-workspace-attachments.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bin/check-workspace-attachments.py b/bin/check-workspace-attachments.py
index 196a0cf6..27692090 100644
--- a/bin/check-workspace-attachments.py
+++ b/bin/check-workspace-attachments.py
@@ -166,14 +166,14 @@ async def main():
     print(f"{len(marked_for_deletion)} attachments are marked for deletion in GCS.")
 
     # Now check which of the only_workspace_attachments are marked for deletion.
-    to_postpone_deletion = []
+    to_postpone_deletion = set()
     for live_attachments in (
         all_workspace_attachments,
         all_preview_attachments,
         all_main_attachments,
     ):
         if marked := marked_for_deletion & live_attachments:
-            to_postpone_deletion.extend(marked)
+            to_postpone_deletion.update(marked)
 
     if to_postpone_deletion:
         print(
@@ -182,11 +182,13 @@ async def main():
         with storage_client.batch():
             for blob_name in to_postpone_deletion:
                 blob = bucket.blob(blob_name)
+                blob.reload()
                 # Once set, custom_time cannot be removed. We set it to a date in the future.
                 blob.custom_time = datetime.datetime.now(
                     datetime.timezone.utc
                 ) + datetime.timedelta(days=POSTPONE_DELETION_MARK_DAYS)
                 blob.patch()
+                blob.reload()
                 print(
                     f"Postponed deletion mark of gs://{STORAGE_BUCKET_NAME}/{blob.name} to {blob.custom_time}"
                 )
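Note: the patch above leans on two documented GCS behaviours: once `custom_time` is set on an object it cannot be removed, and its value cannot be decreased. Postponing the deletion mark therefore works, but has to be repeated before every expiry. A minimal sketch of the setup it assumes, i.e. a bucket lifecycle rule keyed on `days_since_custom_time` (bucket name, object name, and day counts below are illustrative, not taken from the series):

import datetime

from google.cloud import storage

client = storage.Client()
bucket = client.bucket("example-attachments-bucket")  # illustrative name

# A rule of this shape turns custom_time into a deletion mark:
# objects are deleted N days after their custom_time timestamp.
bucket.add_lifecycle_delete_rule(days_since_custom_time=30)
bucket.patch()

# Postponing the mark: custom_time can be moved forward, never back or away.
blob = bucket.blob("bundles/example.bin")  # illustrative name
blob.reload()
blob.custom_time = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(
    days=30
)
blob.patch()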
From 4b0f2a04d8097cbf7c6dcb5b4289f62116a89927 Mon Sep 17 00:00:00 2001
From: Mathieu Leplatre
Date: Tue, 20 Jan 2026 13:02:55 +0100
Subject: [PATCH 2/3] Rewrite objects completely from scratch

---
 bin/check-workspace-attachments.py | 57 ++++++++++++++++++++----------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/bin/check-workspace-attachments.py b/bin/check-workspace-attachments.py
index 27692090..50e3df13 100644
--- a/bin/check-workspace-attachments.py
+++ b/bin/check-workspace-attachments.py
@@ -7,9 +7,9 @@
 """
 
 import asyncio
-import datetime
 import itertools
 import os
+import tempfile
 
 import kinto_http
 import requests
@@ -56,6 +56,37 @@ async def fetch(url):
     return [url for url, success in zip(urls, results) if not success]
 
 
+def rewrite_from_scratch(bucket, blob_name):
+    """
+    Since GCS does not let us remove the `custom_time` field, which
+    is used in the retention policy rule, we rewrite the object from
+    scratch.
+
+    Note: `custom_time` is not overridable during `rewrite()`.
+    """
+    with tempfile.NamedTemporaryFile(dir="/tmp", delete=False) as tmp:
+        tmp_path = tmp.name
+        print(f"Rewrite {blob_name} using backup at {tmp_path}", end="")
+        # Download
+        blob = bucket.blob(blob_name)
+        blob.download_to_filename(tmp_path)
+        print(".", end="")
+        # Delete all generations
+        bucket.delete_blobs(bucket.list_blobs(prefix=blob_name, versions=True))
+        print(".", end="")
+        # Re-upload (same object name, new generation)
+        new_blob = bucket.blob(blob_name)
+        new_blob.metadata = blob.metadata
+        new_blob.content_type = blob.content_type
+        new_blob.upload_from_filename(tmp_path)
+        print(".", end="")
+        new_blob.reload()
+        assert new_blob.custom_time is None, (
+            f"{blob_name} has custom_time set to {new_blob.custom_time}"
+        )
+        print(". Done.")
+
+
 async def main():
     client = kinto_http.AsyncClient(server_url=SERVER_URL, auth=AUTH)
 
@@ -166,32 +197,22 @@ async def main():
     print(f"{len(marked_for_deletion)} attachments are marked for deletion in GCS.")
 
     # Now check which of the only_workspace_attachments are marked for deletion.
-    to_postpone_deletion = set()
+    to_reset = set()
     for live_attachments in (
         all_workspace_attachments,
         all_preview_attachments,
         all_main_attachments,
     ):
         if marked := marked_for_deletion & live_attachments:
-            to_postpone_deletion.update(marked)
+            to_reset.update(marked)
 
-    if to_postpone_deletion:
+    if to_reset:
         print(
-            f"⚠️ {len(to_postpone_deletion)} attachments referenced in workspace/preview/main buckets are marked for deletion in GCS."
+            f"⚠️ {len(to_reset)} attachments referenced in workspace/preview/main buckets are marked for deletion in GCS."
         )
-        with storage_client.batch():
-            for blob_name in to_postpone_deletion:
-                blob = bucket.blob(blob_name)
-                blob.reload()
-                # Once set, custom_time cannot be removed. We set it to a date in the future.
-                blob.custom_time = datetime.datetime.now(
-                    datetime.timezone.utc
-                ) + datetime.timedelta(days=POSTPONE_DELETION_MARK_DAYS)
-                blob.patch()
-                blob.reload()
-                print(
-                    f"Postponed deletion mark of gs://{STORAGE_BUCKET_NAME}/{blob.name} to {blob.custom_time}"
-                )
+        for blob_name in to_reset:
+            rewrite_from_scratch(bucket, blob_name)
+            print(f"Removed deletion mark of gs://{STORAGE_BUCKET_NAME}/{blob_name}")
     else:
         print(
             "✅ No attachment referenced in workspace/preview/main buckets is marked for deletion in GCS."
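Note: `rewrite_from_scratch()` downloads the object, deletes every generation, and re-uploads under the same name precisely because, as its docstring says, a server-side `Blob.rewrite()` carries `custom_time` over and GCS offers no way to unset it; only a brand-new generation starts without it. One caveat of the approach: the helper only carries `metadata` and `content_type` over, so other user-settable fields are lost if they were ever set. A sketch of a more conservative copy, under the assumption that the same google-cloud-storage `Blob` objects are in play (the field list is illustrative, not from the series):

def copy_blob_settings(src, dst):
    # Carry over the user-settable fields that a plain re-upload does
    # not reconstruct by itself. `custom_time` is deliberately excluded:
    # leaving it unset is the whole point of the rewrite.
    dst.metadata = src.metadata
    dst.content_type = src.content_type
    dst.cache_control = src.cache_control
    dst.content_encoding = src.content_encoding
    dst.content_disposition = src.content_disposition
    dst.content_language = src.content_language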
From 218feafef88049b797ed8e5906b0c3df068e665a Mon Sep 17 00:00:00 2001
From: Mathieu Leplatre
Date: Tue, 20 Jan 2026 13:15:06 +0100
Subject: [PATCH 3/3] Simplify slightly

---
 bin/check-workspace-attachments.py         | 80 ++++++++--------
 cronjobs/src/commands/git_export.py        |  4 +-
 cronjobs/tests/commands/test_git_export.py |  2 +-
 3 files changed, 31 insertions(+), 55 deletions(-)

diff --git a/bin/check-workspace-attachments.py b/bin/check-workspace-attachments.py
index 50e3df13..b7eb42b3 100644
--- a/bin/check-workspace-attachments.py
+++ b/bin/check-workspace-attachments.py
@@ -66,13 +66,15 @@ def rewrite_from_scratch(bucket, blob_name):
     """
     with tempfile.NamedTemporaryFile(dir="/tmp", delete=False) as tmp:
         tmp_path = tmp.name
-        print(f"Rewrite {blob_name} using backup at {tmp_path}", end="")
+        print(f"Rewrite gs://{STORAGE_BUCKET_NAME}/{blob_name} using backup at {tmp_path}", end=" ")
         # Download
         blob = bucket.blob(blob_name)
         blob.download_to_filename(tmp_path)
         print(".", end="")
         # Delete all generations
-        bucket.delete_blobs(bucket.list_blobs(prefix=blob_name, versions=True))
+        versions = bucket.list_blobs(prefix=blob_name, versions=True)
+        print(".", end="")
+        bucket.delete_blobs(list(versions))
         print(".", end="")
         # Re-upload (same object name, new generation)
         new_blob = bucket.blob(blob_name)
@@ -87,6 +89,20 @@ def rewrite_from_scratch(bucket, blob_name):
         print(". Done.")
 
 
+async def list_all_attachments(client, collections):
+    results = await asyncio.gather(
+        *(client.get_records(bucket=bid, collection=cid) for bid, cid in collections)
+    )
+    return set(
+        [
+            r["attachment"]["location"]
+            for records in results
+            for r in records
+            if "attachment" in r
+        ]
+    )
+
+
 async def main():
     client = kinto_http.AsyncClient(server_url=SERVER_URL, auth=AUTH)
 
@@ -98,21 +114,7 @@ async def main():
     all_collections = list(itertools.chain.from_iterable(results))
     print(len(all_collections), "collections to analyze")
 
-    # List all attachments in workspace buckets.
-    all_workspace_records = await asyncio.gather(
-        *(
-            client.get_records(bucket=bid, collection=cid)
-            for bid, cid in all_collections
-        )
-    )
-    all_workspace_attachments = set(
-        [
-            r["attachment"]["location"]
-            for records in all_workspace_records
-            for r in records
-            if "attachment" in r
-        ]
-    )
+    all_workspace_attachments = await list_all_attachments(client, all_collections)
     print(f"Found {len(all_workspace_attachments)} draft attachments in total")
 
     # Now list all attachments in preview buckets.
@@ -128,19 +130,8 @@ async def main():
         for bid, cid in all_collections
         if cid not in without_preview_collection
     ]
-    all_preview_records = await asyncio.gather(
-        *(
-            client.get_records(bucket=bid, collection=cid)
-            for bid, cid in all_preview_collections
-        )
-    )
-    all_preview_attachments = set(
-        [
-            r["attachment"]["location"]
-            for records in all_preview_records
-            for r in records
-            if "attachment" in r
-        ]
+    all_preview_attachments = await list_all_attachments(
+        client, all_preview_collections
     )
 
     # Now list attachments in main buckets.
@@ -149,34 +140,17 @@ async def main():
         "security-state-staging": "security-state",
         "staging": "blocklists",
     }
-    all_main_collections = [
-        (main_buckets[bid], cid)
-        for bid, cid in all_collections
-        if cid not in without_preview_collection
-    ]
-    all_main_records = await asyncio.gather(
-        *(
-            client.get_records(bucket=bid, collection=cid)
-            for bid, cid in all_main_collections
-        )
-    )
-    all_main_attachments = set(
-        [
-            r["attachment"]["location"]
-            for records in all_main_records
-            for r in records
-            if "attachment" in r
-        ]
-    )
+    all_main_collections = [(main_buckets[bid], cid) for bid, cid in all_collections]
+    all_main_attachments = await list_all_attachments(client, all_main_collections)
 
+    # Check which of the only_workspace_attachments are missing from the server.
+    # We only check these since they are not covered by our Telescope attachments checks.
     only_workspace_attachments = (
         all_workspace_attachments - all_preview_attachments - all_main_attachments
    )
     print(
         f"{len(only_workspace_attachments)} attachments are only referenced in workspace buckets"
     )
-
-    # Check which of the only_workspace_attachments are missing from the server.
     server_info = await client.server_info()
     base_url = server_info["capabilities"]["attachments"]["base_url"]
     missing = await check_urls(
@@ -186,7 +160,7 @@ async def main():
         print("\n".join(missing))
 
     # Now check which GCS attachments are marked for deletion. And make sure that we do not
-    # have attachments referenced in preview and main buckets that are marked for deletion.
+    # have attachments referenced in buckets that are marked for deletion.
     print("Checking GCS for deletion marks...")
     storage_client = storage.Client()
     bucket = storage_client.bucket(STORAGE_BUCKET_NAME)
@@ -203,7 +177,7 @@ async def main():
         all_preview_attachments,
         all_main_attachments,
     ):
-        if marked := marked_for_deletion & live_attachments:
+        if marked := (marked_for_deletion & live_attachments):
             to_reset.update(marked)
 
     if to_reset:
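Note: `list_all_attachments()` fans out one `get_records()` call per collection through `asyncio.gather()`, so all requests are in flight at once. That is fine for tens of collections; if the number ever grows large, the same helper can be throttled with a semaphore. A sketch under that assumption (the cap of 10 is an illustrative value, and `client` is the script's kinto_http AsyncClient):

import asyncio


async def list_all_attachments(client, collections, max_concurrency=10):
    semaphore = asyncio.Semaphore(max_concurrency)

    # At most `max_concurrency` get_records() calls run concurrently.
    async def fetch(bid, cid):
        async with semaphore:
            return await client.get_records(bucket=bid, collection=cid)

    results = await asyncio.gather(*(fetch(bid, cid) for bid, cid in collections))
    return {
        r["attachment"]["location"]
        for records in results
        for r in records
        if "attachment" in r
    }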
print("Checking GCS for deletion marks...") storage_client = storage.Client() bucket = storage_client.bucket(STORAGE_BUCKET_NAME) @@ -203,7 +177,7 @@ async def main(): all_preview_attachments, all_main_attachments, ): - if marked := marked_for_deletion & live_attachments: + if marked := (marked_for_deletion & live_attachments): to_reset.update(marked) if to_reset: diff --git a/cronjobs/src/commands/git_export.py b/cronjobs/src/commands/git_export.py index 9d90ae9b..603e44a2 100644 --- a/cronjobs/src/commands/git_export.py +++ b/cronjobs/src/commands/git_export.py @@ -499,7 +499,9 @@ def process_attachments( if existing := existing_attachments.get(location): existing_hash, existing_size = existing if existing_hash != hash or existing_size != size: - print(f"Bundle {path} {'is new' if existing_hash is None else 'has changed'}") + print( + f"Bundle {path} {'is new' if existing_hash is None else 'has changed'}" + ) changed_attachments.append((hash, size, url)) return changed_attachments, common_content diff --git a/cronjobs/tests/commands/test_git_export.py b/cronjobs/tests/commands/test_git_export.py index 50bd0eb5..7dbfde28 100644 --- a/cronjobs/tests/commands/test_git_export.py +++ b/cronjobs/tests/commands/test_git_export.py @@ -84,7 +84,7 @@ def mock_rs_server_content(): }, "config": { "modified": "2024-01-01T00:00:00Z", - } + }, }, )