Add WACZ filename, depth, favIconUrl, isSeed to pages (#2352)
Adds `filename` to pages, pointing to the WACZ file each page comes
from, as well as `depth`, `favIconUrl`, and `isSeed`. Also adds an
idempotent migration to backfill this information for existing pages,
and increases the backend container's startupProbe time to 24 hours to
give it sufficient time to finish the migration.
---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
tw4l and ikreymer authored Feb 5, 2025
1 parent 8cfa287 commit 0e9e70f
Showing 8 changed files with 131 additions and 4 deletions.
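For context, a minimal sketch (not part of this commit) of reading the new page fields back from the pages list API; the endpoint path, base URL, and environment variables here are assumptions modeled on the backend tests below, not taken from this change:

import os

import requests

api_url = os.environ.get("BTRIX_API_URL", "http://localhost:30870/api")  # assumed base URL
org_id = os.environ["BTRIX_ORG_ID"]      # assumed env var holding the org UUID
crawl_id = os.environ["BTRIX_CRAWL_ID"]  # assumed env var holding the crawl id
headers = {"Authorization": f"Bearer {os.environ['BTRIX_TOKEN']}"}  # assumed access token

# Assumed endpoint shape, modeled on test_run_crawl.py below
r = requests.get(f"{api_url}/orgs/{org_id}/crawls/{crawl_id}/pages", headers=headers)
r.raise_for_status()

for page in r.json()["items"]:
    # New fields from this change; older pages are backfilled by migration 0042
    print(page["url"], page.get("filename"), page.get("depth"),
          page.get("isSeed"), page.get("favIconUrl"))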
2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
@@ -17,7 +17,7 @@
from .migrations import BaseMigration


-CURR_DB_VERSION = "0041"
+CURR_DB_VERSION = "0042"


# ============================================================================
50 changes: 50 additions & 0 deletions backend/btrixcloud/migrations/migration_0042_page_filenames.py
@@ -0,0 +1,50 @@
"""
Migration 0042 - Add filename to pages
"""

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0042"


class Migration(BaseMigration):
"""Migration class."""

# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)

self.page_ops = kwargs.get("page_ops")

async def migrate_up(self):
"""Perform migration up.
Add filename to all pages that don't currently have it stored,
iterating through each archived item and its WACZ files as necessary
"""
pages_mdb = self.mdb["pages"]

if self.page_ops is None:
print(
"Unable to add filename and other fields to pages, missing page_ops",
flush=True,
)
return

crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})

crawl_count = len(crawl_ids_to_update)
current_index = 1

for crawl_id in crawl_ids_to_update:
print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
try:
await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
flush=True,
)
current_index += 1
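The {"filename": None} filter keeps the migration idempotent: re-running it only revisits crawls whose pages still lack a filename. A minimal verification sketch (not part of this commit; the connection string and database name are assumptions) for checking progress after the migration has run:

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed connection string
pages = client["browsertrix"]["pages"]  # assumed database name; "pages" matches self.mdb["pages"] above

remaining = pages.count_documents({"filename": None})
print(f"pages still missing filename: {remaining}")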
4 changes: 4 additions & 0 deletions backend/btrixcloud/models.py
@@ -2493,6 +2493,10 @@ class Page(BaseMongoModel):
    loadState: Optional[int] = None
    status: Optional[int] = None
    mime: Optional[str] = None
    filename: Optional[str] = None
    depth: Optional[int] = None
    favIconUrl: Optional[AnyHttpUrl] = None
    isSeed: Optional[bool] = False

    # manual review
    userid: Optional[UUID] = None
53 changes: 53 additions & 0 deletions backend/btrixcloud/pages.py
@@ -1,6 +1,7 @@
"""crawl pages"""

import asyncio
import os
import traceback
from datetime import datetime
from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
@@ -83,6 +84,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):

                if len(pages_buffer) > batch_size:
                    await self._add_pages_to_db(crawl_id, pages_buffer)
                    pages_buffer = []

                pages_buffer.append(
                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -100,6 +102,53 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
            traceback.print_exc()
            print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)

    async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
        """Add WACZ filename and additional fields to existing pages in crawl if not already set"""
        try:
            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
            if not crawl.resources:
                return

            for wacz_file in crawl.resources:
                # Strip oid directory from filename
                filename = os.path.basename(wacz_file.name)

                stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
                for page_dict in stream:
                    if not page_dict.get("url"):
                        continue

                    page_id = page_dict.get("id")

                    if not page_id:
                        continue

                    if page_id:
                        try:
                            page_id = UUID(page_id)
                        # pylint: disable=broad-exception-caught
                        except Exception:
                            continue

                    await self.pages.find_one_and_update(
                        {"_id": page_id},
                        {
                            "$set": {
                                "filename": filename,
                                "depth": page_dict.get("depth"),
                                "isSeed": page_dict.get("seed", False),
                                "favIconUrl": page_dict.get("favIconUrl"),
                            }
                        },
                    )
        # pylint: disable=broad-exception-caught, raise-missing-from
        except Exception as err:
            traceback.print_exc()
            print(
                f"Error adding filename to pages from item {crawl_id} to db: {err}",
                flush=True,
            )

    def _get_page_from_dict(
        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
    ) -> Page:
@@ -127,6 +176,10 @@ def _get_page_from_dict(
            loadState=page_dict.get("loadState"),
            status=status,
            mime=page_dict.get("mime", "text/html"),
            filename=page_dict.get("filename"),
            depth=page_dict.get("depth"),
            isSeed=page_dict.get("seed", False),
            favIconUrl=page_dict.get("favIconUrl"),
            ts=(str_to_date(ts) if ts else dt_now()),
        )
        p.compute_page_type()
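A small illustration of the basename stripping in add_crawl_wacz_filename_to_pages, using an assumed example value: wacz_file.name includes the org (oid) directory in storage, while pages record only the bare WACZ filename.

import os

# Assumed example value for wacz_file.name
name = "f2a9c6de-1b2c-4d3e-8f90-0a1b2c3d4e5f/my-crawl-20250205.wacz"
print(os.path.basename(name))  # -> my-crawl-20250205.wacz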
4 changes: 3 additions & 1 deletion backend/btrixcloud/storages.py
@@ -619,7 +619,9 @@ def stream_page_lines(

            line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
            for line in line_iter:
-               yield _parse_json(line.decode("utf-8", errors="ignore"))
+               page_json = _parse_json(line.decode("utf-8", errors="ignore"))
+               page_json["filename"] = os.path.basename(wacz_filename)
+               yield page_json

        page_generators: List[Iterator[Dict[Any, Any]]] = []

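A standalone sketch of the behavior added to stream_page_lines (simplified; the helper name and sample data are assumptions): every page record parsed from a WACZ's pages JSONL is tagged with the basename of the WACZ it came from before being yielded.

import json
import os
from typing import Any, Dict, Iterator


def pages_with_filename(lines: Iterator[bytes], wacz_filename: str) -> Iterator[Dict[str, Any]]:
    """Yield parsed page records, each tagged with the source WACZ filename."""
    for line in lines:
        page_json: Dict[str, Any] = json.loads(line.decode("utf-8", errors="ignore"))
        page_json["filename"] = os.path.basename(wacz_filename)
        yield page_json


sample = [b'{"url": "https://example.com/", "seed": true, "depth": 0}']
for page in pages_with_filename(iter(sample), "org-id/my-crawl.wacz"):
    print(page["filename"], page["url"])  # -> my-crawl.wacz https://example.com/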
16 changes: 16 additions & 0 deletions backend/test/test_run_crawl.py
@@ -673,6 +673,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)

@@ -694,6 +698,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)

@@ -794,6 +802,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)

@@ -876,6 +888,10 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)

2 changes: 2 additions & 0 deletions backend/test/test_uploads.py
@@ -252,6 +252,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
assert page["crawl_id"] == upload_id
assert page["url"]
assert page["ts"]
assert page["filename"]
assert page.get("title") or page.get("title") is None

page_id = pages[0]["id"]
@@ -267,6 +268,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
assert page["crawl_id"]
assert page["url"]
assert page["ts"]
assert page["filename"]
assert page.get("title") or page.get("title") is None

assert page["notes"] == []
4 changes: 2 additions & 2 deletions chart/templates/backend.yaml
@@ -123,8 +123,8 @@ spec:
            httpGet:
              path: /healthzStartup
              port: 8000
-           periodSeconds: 5
-           failureThreshold: 60
+           periodSeconds: 10
+           failureThreshold: 8640
            successThreshold: 1

          readinessProbe:
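With these values the startupProbe gives the backend container up to periodSeconds × failureThreshold = 10 s × 8640 = 86,400 s (24 hours) to pass its startup check, matching the migration window described in the commit message; the previous settings allowed only 5 s × 60 = 300 s (5 minutes).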
