From 0e9e70f3a39b415e5726c8ed2b8c9ccc1f974bca Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 5 Feb 2025 15:50:04 -0500
Subject: [PATCH] Add WACZ filename, depth, favIconUrl, isSeed to pages (#2352)

Adds `filename` to pages, pointing to the WACZ file those pages come from,
as well as depth, favIconUrl, and isSeed. Also adds an idempotent migration
to backfill this information for existing pages, and increases the backend
container's startupProbe time to 24 hours to give it sufficient time to
finish the migration.

---------

Co-authored-by: Ilya Kreymer
---
 backend/btrixcloud/db.py                      |  2 +-
 .../migration_0042_page_filenames.py          | 50 +++++++++++++++++
 backend/btrixcloud/models.py                  |  4 ++
 backend/btrixcloud/pages.py                   | 53 +++++++++++++++++++
 backend/btrixcloud/storages.py                |  4 +-
 backend/test/test_run_crawl.py                | 16 ++++++
 backend/test/test_uploads.py                  |  2 +
 chart/templates/backend.yaml                  |  4 +-
 8 files changed, 131 insertions(+), 4 deletions(-)
 create mode 100644 backend/btrixcloud/migrations/migration_0042_page_filenames.py

diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index a16964626f..9645f56307 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -17,7 +17,7 @@
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0041"
+CURR_DB_VERSION = "0042"
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/migrations/migration_0042_page_filenames.py b/backend/btrixcloud/migrations/migration_0042_page_filenames.py
new file mode 100644
index 0000000000..5410d4b593
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0042_page_filenames.py
@@ -0,0 +1,50 @@
+"""
+Migration 0042 - Add filename to pages
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0042"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add filename to all pages that don't currently have it stored,
+        iterating through each archived item and its WACZ files as necessary
+        """
+        pages_mdb = self.mdb["pages"]
+
+        if self.page_ops is None:
+            print(
+                "Unable to add filename and other fields to pages, missing page_ops",
+                flush=True,
+            )
+            return
+
+        crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})
+
+        crawl_count = len(crawl_ids_to_update)
+        current_index = 1
+
+        for crawl_id in crawl_ids_to_update:
+            print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
+            try:
+                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
+                    flush=True,
+                )
+            current_index += 1
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e0e57f200a..38734d7915 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -2493,6 +2493,10 @@ class Page(BaseMongoModel):
     loadState: Optional[int] = None
     status: Optional[int] = None
     mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False
 
     # manual review
     userid: Optional[UUID] = None
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 4149a3e9d3..4b53b5b9b5 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -1,6 +1,7 @@
 """crawl pages"""
 
 import asyncio
+import os
 import traceback
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
@@ -83,6 +84,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
 
                 if len(pages_buffer) > batch_size:
                     await self._add_pages_to_db(crawl_id, pages_buffer)
+                    pages_buffer = []
 
                 pages_buffer.append(
                     self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -100,6 +102,53 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
             traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
+    async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
+        """Add WACZ filename and additional fields to existing pages in crawl if not already set"""
+        try:
+            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
+            if not crawl.resources:
+                return
+
+            for wacz_file in crawl.resources:
+                # Strip oid directory from filename
+                filename = os.path.basename(wacz_file.name)
+
+                stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
+                for page_dict in stream:
+                    if not page_dict.get("url"):
+                        continue
+
+                    page_id = page_dict.get("id")
+
+                    if not page_id:
+                        continue
+
+                    if page_id:
+                        try:
+                            page_id = UUID(page_id)
+                        # pylint: disable=broad-exception-caught
+                        except Exception:
+                            continue
+
+                    await self.pages.find_one_and_update(
+                        {"_id": page_id},
+                        {
+                            "$set": {
+                                "filename": filename,
+                                "depth": page_dict.get("depth"),
+                                "isSeed": page_dict.get("seed", False),
+                                "favIconUrl": page_dict.get("favIconUrl"),
+                            }
+                        },
+                    )
+        # pylint: disable=broad-exception-caught, raise-missing-from
+        except Exception as err:
+            traceback.print_exc()
+            print(
+                f"Error adding filename to pages from item {crawl_id} to db: {err}",
+                flush=True,
+            )
+
     def _get_page_from_dict(
         self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
     ) -> Page:
@@ -127,6 +176,10 @@ def _get_page_from_dict(
             loadState=page_dict.get("loadState"),
             status=status,
             mime=page_dict.get("mime", "text/html"),
+            filename=page_dict.get("filename"),
+            depth=page_dict.get("depth"),
+            isSeed=page_dict.get("seed", False),
+            favIconUrl=page_dict.get("favIconUrl"),
             ts=(str_to_date(ts) if ts else dt_now()),
         )
         p.compute_page_type()
diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index e167449eb5..d03497484e 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -619,7 +619,9 @@ def stream_page_lines(
 
             line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
             for line in line_iter:
-                yield _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json = _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json["filename"] = os.path.basename(wacz_filename)
+                yield page_json
 
         page_generators: List[Iterator[Dict[Any, Any]]] = []
 
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index f40f5ba8ba..511c4c6c1e 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -673,6 +673,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
 
@@ -694,6 +698,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page.get("title") or page.get("title") is None
         assert page["loadState"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
 
@@ -794,6 +802,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page.get("title") or page.get("title") is None
         assert page["loadState"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
 
@@ -876,6 +888,10 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
 
diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index 719a247e1c..56e5c1d978 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -252,6 +252,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
         assert page["crawl_id"] == upload_id
         assert page["url"]
         assert page["ts"]
+        assert page["filename"]
         assert page.get("title") or page.get("title") is None
 
     page_id = pages[0]["id"]
@@ -267,6 +268,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
     assert page["crawl_id"]
     assert page["url"]
     assert page["ts"]
+    assert page["filename"]
     assert page.get("title") or page.get("title") is None
 
     assert page["notes"] == []
diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml
index 8f96fdd24f..3ce6dec50b 100644
--- a/chart/templates/backend.yaml
+++ b/chart/templates/backend.yaml
@@ -123,8 +123,8 @@ spec:
             httpGet:
               path: /healthzStartup
               port: 8000
-            periodSeconds: 5
-            failureThreshold: 60
+            periodSeconds: 10
+            failureThreshold: 8640
             successThreshold: 1
 
           readinessProbe:
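
Note on the startupProbe change: with periodSeconds: 10 and failureThreshold: 8640,
Kubernetes will wait up to 10 s x 8640 = 86,400 s (24 hours) for the backend to pass
its startup check, up from the previous 5 s x 60 = 300 s (5 minutes). This matches the
24-hour window called out in the commit message and leaves room for migration 0042 to
stream pages.jsonl out of every WACZ file on large deployments before the container
gets restarted.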
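For reference, a rough sketch of how a single pages.jsonl record maps onto the fields
written by add_crawl_wacz_filename_to_pages. Every concrete value below is invented for
illustration; only the key names (pages.jsonl uses "seed" where the Page model uses
isSeed) and the basename handling mirror the patch.

# Illustration only: one hypothetical pages.jsonl record and the resulting update.
import os

wacz_name = "6fe0a8b2-example-oid/crawl-20250205120000-0.wacz"  # hypothetical WACZ path
page_dict = {
    "id": "f6b1c9ab-1234-4cde-8f00-0123456789ab",  # hypothetical page id
    "url": "https://example.com/",
    "seed": True,          # pages.jsonl key; stored on the Page model as isSeed
    "depth": 0,
    "favIconUrl": "https://example.com/favicon.ico",
}

update = {
    "$set": {
        "filename": os.path.basename(wacz_name),  # strips the oid directory
        "depth": page_dict.get("depth"),
        "isSeed": page_dict.get("seed", False),
        "favIconUrl": page_dict.get("favIconUrl"),
    }
}
print(update["$set"]["filename"])  # -> crawl-20250205120000-0.wacz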
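And a minimal sketch for spot-checking the backfill once the migration has run, reusing
the same {"filename": None} filter the migration uses to find unmigrated pages (in
MongoDB this matches both missing and null values). The connection URL and database name
are assumptions to adjust per deployment; the backend itself reaches the pages collection
through its async driver, so this standalone pymongo script is illustrative only.

# Hypothetical post-migration spot check; not part of the patch.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed connection URL
pages = client["btrix"]["pages"]                   # assumed database name

remaining = pages.count_documents({"filename": None})
print(f"pages still missing filename: {remaining}")

# Per-crawl breakdown, mirroring the migration's distinct() query
for crawl_id in pages.distinct("crawl_id", {"filename": None}):
    count = pages.count_documents({"crawl_id": crawl_id, "filename": None})
    print(f"  {crawl_id}: {count} pages left to backfill")

Because the migration selects crawls the same way, re-running it only revisits archived
items that still have pages without a filename.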