-
Notifications
You must be signed in to change notification settings - Fork 6
/
news_scraper.py
65 lines (55 loc) · 2.09 KB
/
news_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import requests
from bs4 import BeautifulSoup
import datetime
from AltException import InvalidQuery
class scraper():
    """Collect recent stock-news headlines for a keyword from Google News.

    Constructing an instance performs the HTTP request immediately.  When the
    response status is 200 the page is parsed and the matching headlines are
    stored in ``results`` as dicts with the keys ``"title"``, ``"link"`` and
    ``"time"``.  On any other status ``results`` stays an empty list.
    """

    # Seconds to wait for the server before giving up; without a timeout
    # ``requests.get`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, keyword: str, limit=10, time_=5):
        """Build the search URL, fetch it and extract the headlines.

        Args:
            keyword: Search phrase.  " stocks" is appended unless the phrase
                already contains "stocks".
            limit: Maximum number of headlines to keep in ``results``.
            time_: Maximum age of a headline, in whole days.

        Raises:
            AssertionError: If ``keyword`` is the empty string.
            InvalidQuery: If the page was fetched but no headline qualified.
        """
        from urllib.parse import quote

        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if keyword == "":
            raise AssertionError("keyword must not be empty")
        if "stocks" not in keyword:
            keyword += " stocks"

        self.limit = limit
        self.time_ = time_
        self.count = 0
        # Defined up front so get_title() works even when the request fails.
        self.results = []

        self.url = "https://news.google.com/search?q={0}&hl=en-IN&gl=IN&ceid=IN:en"
        # Percent-encode the whole phrase: whitespace runs collapse to a
        # single %20 and characters such as "&" or "#" cannot break the query.
        self.query = self.url.format(quote(" ".join(keyword.split()), safe=""))
        self.request_news = requests.get(self.query, timeout=self.REQUEST_TIMEOUT)
        if self.request_news.status_code == 200:
            self.link_extract()

    def link_extract(self):
        """Parse the fetched page and fill ``results`` with recent headlines.

        At most ``limit`` items are kept, and items older than ``time_`` days
        are skipped.  Cards lacking a link, a heading or a usable timestamp
        are ignored.

        Raises:
            InvalidQuery: If no headline was collected.
        """
        self.soup_news = BeautifulSoup(self.request_news.content, "html.parser")
        self.results = []
        self.count = 0  # reset so repeated calls do not carry over old counts

        # Timestamps on the page end in "Z" (UTC), so the reference "now"
        # must be UTC as well; kept naive to match strptime's naive result.
        now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

        for card in self.soup_news.find_all('div', class_="NiLAwe"):
            if self.count >= self.limit:
                break

            anchor = card.find('a')
            heading = card.find('h3')
            stamp = card.find('time')
            if anchor is None or heading is None or stamp is None:
                continue  # pragma: no cover

            href = anchor.get('href')
            time_value = stamp.get('datetime')
            if not href or not time_value:
                continue  # pragma: no cover

            try:
                published = datetime.datetime.strptime(
                    time_value, '%Y-%m-%dT%H:%M:%SZ')
            except ValueError:
                # Unexpected timestamp layout: skip the card, keep the rest.
                continue  # pragma: no cover

            if (now - published).days > self.time_:
                continue  # pragma: no cover
                # depends on news

            self.results.append({
                "title": heading.text,
                "link": "https://news.google.com/" + href,
                "time": time_value,
            })
            self.count += 1

        if self.count == 0:
            raise InvalidQuery

    def get_title(self):
        """Return all collected titles as one string, each prefixed by ". "."""
        return "".join(". " + item['title'] for item in self.results)
# def pretty_print(self):
# data = json.dumps(self.results)
# f = open("data.json", "w")
# f.write(data)
# f.close()