-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
132 lines (104 loc) · 3.48 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
Author:
Amin Dziri
"""
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from typing import Type
from bs4 import BeautifulSoup
from pymongo import MongoClient
import datetime
class Scraper():
    """
    Scrapes article titles and links from a news listing page (e.g. Yahoo
    Finance) with Selenium, parses them with BeautifulSoup, and persists
    each new article to a MongoDB collection.
    """

    def scrape(self, url: str) -> None:
        """
        Scrapes titles and links from articles on the given url webpage and
        saves them to a NoSQL DB.

        Args:
            url (str): URL to a webpage

        Returns:
            None

        Raises:
            /
        """
        driver = self.__initialize_driver()
        try:
            driver.get(url)
            time.sleep(3)
            # Check if cookie pop up needs to be accepted
            try:
                driver.find_element(By.NAME, "agree").click()
            except Exception:
                # Best-effort: not every locale shows the consent dialog.
                print("No Cookies to accept")
            driver.maximize_window()
            # Simulate continuous scrolling so lazily-loaded articles render.
            for _ in range(400):
                driver.execute_script("window.scrollBy(0,40)")
                time.sleep(0.5)
            page_source = driver.page_source
        finally:
            # quit() ends the whole browser session; close() would only close
            # the tab and leak the driver process if an exception occurred.
            driver.quit()

        soup = BeautifulSoup(page_source, "html.parser")
        # Heuristic: article headlines are <h3> tags with reasonably long text.
        filtered_divs = [div for div in soup.find_all("div")
                         if div.h3 and len(div.h3.text) > 20]
        for div in filtered_divs:
            # The filter above guarantees div.h3 exists, so no guard is needed.
            title = div.h3.text
            try:
                link = div.a["href"]
            except (TypeError, KeyError):
                # TypeError: no <a> child (div.a is None); KeyError: no href.
                link = "/"
                print("The crawled entry doesn't have a link")
            self._save_to_db(title, link)

    def _save_to_db(self, title: str, link: str) -> None:
        """
        Saves a datapoint consisting of article title and article link
        to a MongoDB Database (NoSQL).

        Args:
            title (str): Title of the article
            link (str): Link of the article

        Returns:
            None

        Raises:
            /
        """
        entry = {
            "title": title,
            "link": link,
            "date": datetime.datetime.now(),
        }
        # Change this to your own Database & Collection
        client = MongoClient()
        try:
            articles_collection = client.YahooFinanceDB.Articles
            # Insert only if an article with the same title is not yet stored.
            if articles_collection.find_one({"title": entry["title"]}) is None:
                articles_collection.insert_one(entry)
        finally:
            client.close()

    def __initialize_driver(self) -> webdriver.Chrome:
        """
        Initializes the chrome webdriver for selenium.

        Args:
            /

        Returns:
            driver (webdriver.Chrome): The initialised webdriver

        Raises:
            /
        """
        options = Options()
        # add_argument only records the flag string; it cannot fail, so no
        # exception handling is needed here.
        options.add_argument("--disable-search-engine-choice-screen")
        options.add_argument("--no-sandbox")
        options.add_argument("disable-infobars")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-dev-shm-usage")
        return webdriver.Chrome(options=options)