Skip to content

Commit

Permalink
More logging
Browse files: browse the repository at this point in the history
  • Loading branch information
palewire committed Jul 19, 2024
1 parent cd3199a commit 6be5d16
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions newshomepages/analyze/drudge.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,9 +285,11 @@ def drudge_hyperlinks(output_dir: str = "./"):
],
dtype=str,
parse_dates=["date"],
low_memory=True,
)

# Trim the strings
print("Trimming strings")
df["text"] = (
df.text.str.strip()
.str.replace(r"\s{2,}", " ", regex=True)
Expand All @@ -296,6 +298,7 @@ def drudge_hyperlinks(output_dir: str = "./"):
df["url"] = df.url.str.strip()

# Guess links with `storysniffer`
print("Sniffing out stories")
sniffer = storysniffer.StorySniffer()
links_df = (
df.sort_values("date")
Expand All @@ -310,6 +313,7 @@ def drudge_hyperlinks(output_dir: str = "./"):
)

# Make some corrections
print("Applying our corrections")
blacklist = [
"/privacy/",
]
Expand Down Expand Up @@ -345,6 +349,7 @@ def drudge_hyperlinks(output_dir: str = "./"):
)

# Write the result
print("Writing out the results")
links_df.sort_values(
["domain", "earliest_date", "text"], ascending=[True, False, True]
).to_csv(output_path / "drudge-hyperlinks-analysis.csv", index=False)

0 comments on commit 6be5d16

Please sign in to comment.