-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnews_utils.py
94 lines (80 loc) · 2.68 KB
/
news_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import logging as log
import os
from datetime import datetime, time
from time import sleep
import bs4
import pymongo
import requests
from newsapi import NewsApiClient
# Configure the root logger at import time: INFO level, with each record
# rendered as "<timestamp> - <level name> - <message>".
log.basicConfig(level=log.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
def get_news_api_key():
    """Read the News API key stored under the current user's home directory.

    The key is read from ``~/keys/google-news-api/news_api_key``.

    Returns:
        The file's contents with trailing newline characters removed
        (other whitespace is left untouched).
    """
    key_path = f"{os.path.expanduser('~')}/keys/google-news-api/news_api_key"
    with open(key_path, "r") as key_file:
        raw_key = key_file.read()
    return raw_key.rstrip("\n")
def get_google_news_sources():
    """Load the configured news sources string.

    Reads ``./configs/google_news_sources`` relative to the current working
    directory.

    Returns:
        The file's contents with leading and trailing whitespace stripped.
    """
    with open("./configs/google_news_sources", "r") as config_file:
        return config_file.read().strip()
def get_google_news(sources, from_param=None):
    """Fetch English-language articles for the given sources via NewsApiClient.

    Args:
        sources: Passed through unchanged as the ``sources`` argument of
            ``client.get_everything``.
        from_param: Lower bound passed through as ``from_param``. When
            ``None``, midnight of the current local day is used, formatted
            as ``YYYY-MM-DDTHH:MM:SS``.

    Returns:
        The object returned by ``client.get_everything`` with an added
        ``"_id"`` entry: the current local time as the integer
        ``YYYYMMDDHHMM``.

    Raises:
        requests.exceptions.ReadTimeout: if the single retry also times out.
    """
    client = NewsApiClient(get_news_api_key())

    if from_param is None:
        midnight_today = datetime.combine(datetime.today(), time.min)
        from_param = midnight_today.strftime("%Y-%m-%dT%H:%M:%S")

    # The same query is issued for the first attempt and for the retry.
    query = {"sources": sources, "from_param": from_param, "language": "en"}
    try:
        news = client.get_everything(**query)
    except requests.exceptions.ReadTimeout:
        # One retry only, after a fixed 300-second pause; a second timeout
        # propagates to the caller.
        log.warning("requests.exceptions.ReadTimeout - retrying after 5 minutes")
        sleep(300)
        news = client.get_everything(**query)

    news["_id"] = int(datetime.now().strftime("%Y%m%d%H%M"))
    return news
def get_wiki_current_events():
    """Scrape today's entry from Wikipedia's "Current events" portal.

    Downloads the mobile portal page, unwraps every ``<a>`` element (keeping
    its children), and looks up the element whose ``id`` is today's date in
    the form ``YYYY_MonthName_D`` (day not zero-padded).

    Returns:
        ``None`` when the page has no element with today's id; otherwise a
        dict with:

        * ``_id``  - today's date as the integer ``YYYYMMDD`` (zero-padded,
          the same key format ``query_wiki_current_events`` looks up),
        * ``date`` - display date such as ``5 Jan 2024``,
        * ``text`` - rendered inner HTML of the last child of today's element.
    """
    now = datetime.now()
    # Unpadded day built from ``now.day`` rather than the glibc-only "%-d"
    # strftime directive, so this also works on platforms without it.
    today_anchor = f"{now:%Y_%B}_{now.day}"

    response = requests.get("https://en.m.wikipedia.org/wiki/Portal:Current_events")
    soup = bs4.BeautifulSoup(response.content, "html.parser")

    # Drop the link markup but keep the linked text in place.
    for anchor in soup.find_all("a"):
        anchor.replace_with_children()

    today_block = soup.find(id=today_anchor)
    if today_block is None:
        return None

    return {
        # Zero-padded day: the previous "%Y%m%-d" produced 7-digit ids for
        # days 1-9, which never matched the YYYYMMDD key used when querying
        # and could collide (e.g. 2024-01-11 vs 2024-11-01 -> 2024111).
        "_id": int(now.strftime("%Y%m%d")),
        "date": f"{now.day} {now:%b %Y}",
        "text": today_block.contents[-1].renderContents().decode(),
    }
def query_wiki_current_events(ts=None):
    """Look up the stored Wikipedia current-events document for one day.

    Args:
        ts: A ``datetime`` identifying the day to look up. Defaults to the
            current local time.

    Returns:
        The result of ``find_one`` on the ``currentEvents`` collection of
        the ``wiki`` database for ``_id`` equal to the date as the integer
        ``YYYYMMDD`` (PyMongo returns ``None`` when nothing matches).
    """
    if ts is None:
        ts = datetime.now()
    # "2024-01-05" -> 20240105
    doc_id = int(str(ts.date()).replace("-", ""))

    # A fresh client is created per call; the context manager closes it so
    # its connections are not leaked.
    with pymongo.MongoClient() as client:
        return client["wiki"]["currentEvents"].find_one({"_id": doc_id})
def query_news_articles(**config):
    """Return the most recently stored news articles.

    Documents in the ``articles`` collection of the ``googlenews`` database
    are read in descending ``_id`` order and their ``"articles"`` lists are
    flattened until the requested number has been collected.

    Args:
        **config: Must contain ``articles``, the maximum number of articles
            to return.

    Returns:
        A list with at most ``config["articles"]`` entries. The list is
        shorter when fewer articles are stored, and empty when the limit is
        not positive. (Previously both of those cases returned ``None``.)

    Raises:
        KeyError: if ``config`` has no ``"articles"`` entry.
    """
    limit = config["articles"]
    articles = []
    if limit <= 0:
        return articles

    # A fresh client is created per call; the context manager closes it so
    # its connections are not leaked. The cursor is fully consumed inside
    # the block, before the client is closed.
    with pymongo.MongoClient() as client:
        cursor = client["googlenews"]["articles"].find(sort=[("_id", -1)])
        for doc in cursor:
            for article in doc["articles"]:
                articles.append(article)
                if len(articles) >= limit:
                    return articles

    # Fewer than ``limit`` articles exist: return what was found.
    return articles