Skip to content

Commit

Permalink
We can't use the cache
Browse files Browse the repository at this point in the history
  • Loading branch information
palewire committed Jul 24, 2024
1 parent 696a332 commit baa629c
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 8 deletions.
11 changes: 8 additions & 3 deletions newshomepages/extract/hyperlinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def hyperlinks(
):
"""Download and parse the provided site's hyperlinks files."""
# Get all hyperlink files
hyperlink_df = utils.get_hyperlink_df(verbose=True).sort_values(["handle", "date"])
hyperlink_df = utils.get_hyperlink_df(use_cache=False, verbose=True).sort_values(
["handle", "date"]
)

# Get the data we want
if site:
Expand All @@ -52,9 +54,12 @@ def hyperlinks(
filtered_df = hyperlink_df[hyperlink_df.handle.isin(handle_list)].copy()

if days:
cutoff_date = filtered_df["date"].max() - pd.Timedelta(days=int(days))
max_date = filtered_df["date"].max()
cutoff_date = max_date - pd.Timedelta(days=int(days))
filtered_df = filtered_df[filtered_df["date"] > cutoff_date].copy()
print(f"Trimming to last {days} days")
print(
f"Trimming to last {days} days from {cutoff_date:%Y-%m-%d} to {max_date:%Y-%m-%d}"
)

# See how many files there are
archived_files = set(filtered_df.url.unique())
Expand Down
4 changes: 3 additions & 1 deletion newshomepages/extract/lighthouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def lighthouse(
):
"""Download and parse the provided site's Lighthouse files."""
# Get all lighthouse files
lighthouse_df = utils.get_lighthouse_df().sort_values(["handle", "date"])
lighthouse_df = utils.get_lighthouse_df(use_cache=False, verbose=True).sort_values(
["handle", "date"]
)

# Get the data we want
if site:
Expand Down
12 changes: 8 additions & 4 deletions newshomepages/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,12 +544,14 @@ def get_hyperlink_list() -> list[dict[str, typing.Any]]:
return get_hyperlink_df().to_dict(orient="records")


def get_hyperlink_df(use_cache: bool = True, verbose: bool = False) -> pd.DataFrame:
    """Get the full list of hyperlink files from our extracts.

    Args:
        use_cache: When True, reuse a previously downloaded copy of the
            file list; when False, force a fresh download.
        verbose: When True, print progress while fetching.

    Returns a DataFrame.
    """
    # Delegate to the shared extract-file helper, forwarding both flags
    # so callers (e.g. the hyperlinks CLI) can bypass a stale cache.
    return _get_extract_files_df(
        "hyperlink-files.csv", use_cache=use_cache, verbose=verbose
    )


def get_lighthouse_list() -> list[dict[str, typing.Any]]:
Expand All @@ -560,12 +562,14 @@ def get_lighthouse_list() -> list[dict[str, typing.Any]]:
return get_lighthouse_df().to_dict(orient="records")


def get_lighthouse_df(use_cache: bool = True, verbose: bool = False) -> pd.DataFrame:
    """Get the full list of Lighthouse files from our extracts.

    Args:
        use_cache: When True, reuse a previously downloaded copy of the
            file list; when False, force a fresh download.
        verbose: When True, print progress while fetching.

    Returns a DataFrame.
    """
    # Mirror get_hyperlink_df: forward both flags to the shared helper so
    # callers can opt out of the cache (the old version dropped them).
    return _get_extract_files_df(
        "lighthouse-files.csv", use_cache=use_cache, verbose=verbose
    )


def get_robotstxt_df(use_cache: bool = True, verbose: bool = False) -> pd.DataFrame:
Expand Down

0 comments on commit baa629c

Please sign in to comment.