-
-
Notifications
You must be signed in to change notification settings - Fork 43
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add WACZ filename, depth, favIconUrl, isSeed to pages (#2352)
Adds `filename` to pages, pointing to the WACZ file each page comes from, as well as `depth`, `favIconUrl`, and `isSeed`. Also adds an idempotent migration to backfill this information for existing pages, and increases the backend container's startupProbe time to 24 hours to give the migration sufficient time to finish. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
- Loading branch information
Showing
8 changed files
with
131 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
50 changes: 50 additions & 0 deletions
50
backend/btrixcloud/migrations/migration_0042_page_filenames.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
""" | ||
Migration 0042 - Add filename to pages | ||
""" | ||
|
||
from btrixcloud.migrations import BaseMigration | ||
|
||
|
||
MIGRATION_VERSION = "0042" | ||
|
||
|
||
class Migration(BaseMigration):
    """Migration 0042: backfill the WACZ filename and related fields on pages."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        """Register the migration version and pick up the optional ``page_ops``.

        ``page_ops`` is read from the keyword arguments; when it is not
        supplied it is stored as ``None`` and ``migrate_up`` does nothing
        beyond logging a message.
        """
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Perform migration up.

        Collect the distinct ``crawl_id`` values of pages matched by the
        ``{"filename": None}`` query, then ask ``page_ops`` to add the WACZ
        filename (and other fields) to the pages of each such archived item.

        A failure on one archived item is printed and does not stop the
        remaining items from being processed.
        """
        pages_collection = self.mdb["pages"]

        # Without page_ops there is nothing that can perform the backfill.
        if self.page_ops is None:
            print(
                "Unable to add filename and other fields to pages, missing page_ops",
                flush=True,
            )
            return

        pending_crawl_ids = await pages_collection.distinct(
            "crawl_id", {"filename": None}
        )
        total_items = len(pending_crawl_ids)

        # Progress is reported 1-based: "1/N" through "N/N".
        for position, crawl_id in enumerate(pending_crawl_ids, start=1):
            print(f"Migrating archived item {position}/{total_items}", flush=True)
            try:
                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
            # Best effort: log the failing item and move on to the next one.
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
                    flush=True,
                )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters