From baa629cfb0feb8d49c5bb20a2110a53f65c98170 Mon Sep 17 00:00:00 2001
From: palewire
Date: Wed, 24 Jul 2024 09:38:46 -0400
Subject: [PATCH] We can't use the cache

---
 newshomepages/extract/hyperlinks.py | 11 ++++++++---
 newshomepages/extract/lighthouse.py |  4 +++-
 newshomepages/utils.py              | 12 ++++++++----
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/newshomepages/extract/hyperlinks.py b/newshomepages/extract/hyperlinks.py
index b056f4c0f8c..1aeb75237a3 100644
--- a/newshomepages/extract/hyperlinks.py
+++ b/newshomepages/extract/hyperlinks.py
@@ -32,7 +32,9 @@ def hyperlinks(
 ):
     """Download and parse the provided site's hyperlinks files."""
     # Get all hyperlink files
-    hyperlink_df = utils.get_hyperlink_df(verbose=True).sort_values(["handle", "date"])
+    hyperlink_df = utils.get_hyperlink_df(use_cache=False, verbose=True).sort_values(
+        ["handle", "date"]
+    )
 
     # Get the data we want
     if site:
@@ -52,9 +54,12 @@ def hyperlinks(
     filtered_df = hyperlink_df[hyperlink_df.handle.isin(handle_list)].copy()
 
     if days:
-        cutoff_date = filtered_df["date"].max() - pd.Timedelta(days=int(days))
+        max_date = filtered_df["date"].max()
+        cutoff_date = max_date - pd.Timedelta(days=int(days))
         filtered_df = filtered_df[filtered_df["date"] > cutoff_date].copy()
-        print(f"Trimming to last {days} days")
+        print(
+            f"Trimming to last {days} days from {cutoff_date:%Y-%m-%d} to {max_date:%Y-%m-%d}"
+        )
 
     # See how many files there are
     archived_files = set(filtered_df.url.unique())
diff --git a/newshomepages/extract/lighthouse.py b/newshomepages/extract/lighthouse.py
index 5a405041303..5cdadab1eff 100644
--- a/newshomepages/extract/lighthouse.py
+++ b/newshomepages/extract/lighthouse.py
@@ -32,7 +32,9 @@ def lighthouse(
 ):
     """Download and parse the provided site's Lighthouse files."""
     # Get all lighthouse files
-    lighthouse_df = utils.get_lighthouse_df().sort_values(["handle", "date"])
+    lighthouse_df = utils.get_lighthouse_df(use_cache=False, verbose=True).sort_values(
+        ["handle", "date"]
+    )
 
     # Get the data we want
     if site:
diff --git a/newshomepages/utils.py b/newshomepages/utils.py
index ad2d9c3042f..eb322eb3173 100644
--- a/newshomepages/utils.py
+++ b/newshomepages/utils.py
@@ -544,12 +544,14 @@ def get_hyperlink_list() -> list[dict[str, typing.Any]]:
     return get_hyperlink_df().to_dict(orient="records")
 
 
-def get_hyperlink_df(verbose: bool = False) -> pd.DataFrame:
+def get_hyperlink_df(use_cache: bool = True, verbose: bool = False) -> pd.DataFrame:
     """Get the full list of hyperlink files from our extracts.
 
     Returns a DataFrame.
     """
-    return _get_extract_files_df("hyperlink-files.csv", verbose=verbose)
+    return _get_extract_files_df(
+        "hyperlink-files.csv", use_cache=use_cache, verbose=verbose
+    )
 
 
 def get_lighthouse_list() -> list[dict[str, typing.Any]]:
@@ -560,12 +562,14 @@ def get_lighthouse_list() -> list[dict[str, typing.Any]]:
     return get_lighthouse_df().to_dict(orient="records")
 
 
-def get_lighthouse_df() -> pd.DataFrame:
+def get_lighthouse_df(use_cache: bool = True, verbose: bool = False) -> pd.DataFrame:
    """Get the full list of Lighthouse files from our extracts.
 
     Returns a DataFrame.
     """
-    return _get_extract_files_df("lighthouse-files.csv")
+    return _get_extract_files_df(
+        "lighthouse-files.csv", use_cache=use_cache, verbose=verbose
+    )
 
 
 def get_robotstxt_df(use_cache: bool = True, verbose: bool = False) -> pd.DataFrame: