From baa629cfb0feb8d49c5bb20a2110a53f65c98170 Mon Sep 17 00:00:00 2001
From: palewire
Date: Wed, 24 Jul 2024 09:38:46 -0400
Subject: [PATCH] We can't use the cache

---
 newshomepages/extract/hyperlinks.py | 11 ++++++++---
 newshomepages/extract/lighthouse.py |  4 +++-
 newshomepages/utils.py              | 12 ++++++++----
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/newshomepages/extract/hyperlinks.py b/newshomepages/extract/hyperlinks.py
index b056f4c0f8c..1aeb75237a3 100644
--- a/newshomepages/extract/hyperlinks.py
+++ b/newshomepages/extract/hyperlinks.py
@@ -32,7 +32,9 @@ def hyperlinks(
 ):
     """Download and parse the provided site's hyperlinks files."""
     # Get all hyperlink files
-    hyperlink_df = utils.get_hyperlink_df(verbose=True).sort_values(["handle", "date"])
+    hyperlink_df = utils.get_hyperlink_df(use_cache=False, verbose=True).sort_values(
+        ["handle", "date"]
+    )
 
     # Get the data we want
     if site:
@@ -52,9 +54,12 @@ def hyperlinks(
     filtered_df = hyperlink_df[hyperlink_df.handle.isin(handle_list)].copy()
 
     if days:
-        cutoff_date = filtered_df["date"].max() - pd.Timedelta(days=int(days))
+        max_date = filtered_df["date"].max()
+        cutoff_date = max_date - pd.Timedelta(days=int(days))
         filtered_df = filtered_df[filtered_df["date"] > cutoff_date].copy()
-        print(f"Trimming to last {days} days")
+        print(
+            f"Trimming to last {days} days from {cutoff_date:%Y-%m-%d} to {max_date:%Y-%m-%d}"
+        )
 
     # See how many files there are
     archived_files = set(filtered_df.url.unique())
diff --git a/newshomepages/extract/lighthouse.py b/newshomepages/extract/lighthouse.py
index 5a405041303..5cdadab1eff 100644
--- a/newshomepages/extract/lighthouse.py
+++ b/newshomepages/extract/lighthouse.py
@@ -32,7 +32,9 @@ def lighthouse(
 ):
     """Download and parse the provided site's Lighthouse files."""
     # Get all lighthouse files
-    lighthouse_df = utils.get_lighthouse_df().sort_values(["handle", "date"])
+    lighthouse_df = utils.get_lighthouse_df(use_cache=False, verbose=True).sort_values(
+        ["handle", "date"]
+    )
 
     # Get the data we want
     if site:
diff --git a/newshomepages/utils.py b/newshomepages/utils.py
index ad2d9c3042f..eb322eb3173 100644
--- a/newshomepages/utils.py
+++ b/newshomepages/utils.py
@@ -544,12 +544,14 @@ def get_hyperlink_list() -> list[dict[str, typing.Any]]:
     return get_hyperlink_df().to_dict(orient="records")
 
 
-def get_hyperlink_df(verbose: bool = False) -> pd.DataFrame:
+def get_hyperlink_df(use_cache: bool = True, verbose: bool = False) -> pd.DataFrame:
     """Get the full list of hyperlink files from our extracts.
 
     Returns a DataFrame.
     """
-    return _get_extract_files_df("hyperlink-files.csv", verbose=verbose)
+    return _get_extract_files_df(
+        "hyperlink-files.csv", use_cache=use_cache, verbose=verbose
+    )
 
 
 def get_lighthouse_list() -> list[dict[str, typing.Any]]:
@@ -560,12 +562,14 @@ def get_lighthouse_list() -> list[dict[str, typing.Any]]:
     return get_lighthouse_df().to_dict(orient="records")
 
 
-def get_lighthouse_df() -> pd.DataFrame:
+def get_lighthouse_df(use_cache: bool = True, verbose: bool = False) -> pd.DataFrame:
    """Get the full list of Lighthouse files from our extracts.
 
     Returns a DataFrame.
     """
-    return _get_extract_files_df("lighthouse-files.csv")
+    return _get_extract_files_df(
+        "lighthouse-files.csv", use_cache=use_cache, verbose=verbose
+    )
 
 
 def get_robotstxt_df(use_cache: bool = True, verbose: bool = False) -> pd.DataFrame: