From 0230275be84a5fc79b59b181f6e6ed894e95e02a Mon Sep 17 00:00:00 2001 From: palewire Date: Sat, 26 Aug 2023 19:50:32 -0400 Subject: [PATCH] Try to clean up drudge analysis and cut blanks --- newshomepages/analyze/drudge.py | 5 ++++- newshomepages/site.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/newshomepages/analyze/drudge.py b/newshomepages/analyze/drudge.py index a68ebb6093a..1abc0e41d86 100644 --- a/newshomepages/analyze/drudge.py +++ b/newshomepages/analyze/drudge.py @@ -58,7 +58,7 @@ def get_lemma(headline: str): doc = nlp(headline) # Parse out all the words - token_list = [token for token in doc] + token_list = [token.strip() for token in doc if token.strip()] # Remove stop words token_list = [t for t in token_list if not t.is_stop] @@ -121,6 +121,8 @@ def get_lemma(headline: str): "RISE", "DEAD", "SET", + "HOUSE", # This usually refers to the White House + "\n", ] qualified_df = word_df[ (~word_df.part_of_speech.isin(["SYM", "VERB"])) @@ -156,6 +158,7 @@ def get_top_verb(lemma: str) -> str: "HAVE", "MELONI", "ZERO", + "'", # This is a weird one ] if lemma == "COVID": stop_verbs += ["TESTS", "TEST"] diff --git a/newshomepages/site.py b/newshomepages/site.py index a3136c685d2..6fa038040f5 100644 --- a/newshomepages/site.py +++ b/newshomepages/site.py @@ -165,6 +165,7 @@ def drudge(): out_dir = CHARTS_DIR / "drudge" / "top-words" out_dir.mkdir(parents=True, exist_ok=True) for d in track(dict_list): + print(d) _write_template( "drudge-top-words.svg", dict(obj=d), out_dir / f"{d['lemma'].lower()}.svg" )