Skip to content

Commit

Permalink
Try to clean up drudge analysis and cut blanks
Browse files — browse the repository at this point in the history
  • Branch information
palewire committed Aug 26, 2023
1 parent 125b345 commit 0230275
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 1 deletion.
5 changes: 4 additions & 1 deletion newshomepages/analyze/drudge.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def get_lemma(headline: str):
doc = nlp(headline)

# Parse out all the words
token_list = [token for token in doc]
token_list = [token.strip() for token in doc if token.strip()]

# Remove stop words
token_list = [t for t in token_list if not t.is_stop]
Expand Down Expand Up @@ -121,6 +121,8 @@ def get_lemma(headline: str):
"RISE",
"DEAD",
"SET",
"HOUSE", # This usually refers to the White House
"\n",
]
qualified_df = word_df[
(~word_df.part_of_speech.isin(["SYM", "VERB"]))
Expand Down Expand Up @@ -156,6 +158,7 @@ def get_top_verb(lemma: str) -> str:
"HAVE",
"MELONI",
"ZERO",
"'", # This is a weird one
]
if lemma == "COVID":
stop_verbs += ["TESTS", "TEST"]
Expand Down
1 change: 1 addition & 0 deletions newshomepages/site.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def drudge():
out_dir = CHARTS_DIR / "drudge" / "top-words"
out_dir.mkdir(parents=True, exist_ok=True)
for d in track(dict_list):
print(d)
_write_template(
"drudge-top-words.svg", dict(obj=d), out_dir / f"{d['lemma'].lower()}.svg"
)
Expand Down

0 comments on commit 0230275

Please sign in to comment.