spider.py
import requests
from bs4 import BeautifulSoup

import crawlCbsNews
import crawlAbcNews
import crawlBloomberg
import crawlCnbcNews
import filter
from fileHandler import createIgnoredByDefaultList

# baseUrls: list(str) base urls of the webpages one is interested in
# interestedIn: list(str) topics / keywords which MUST be in the url (ignored if empty)
# notInterestedIn: list(str) topics / keywords which MUST NOT be in the url (ignored if empty)
# return: set(str) filtered url list
def crawlAllRelevantUrls(baseUrls, interestedIn, notInterestedIn):
    soups = getSoups(baseUrls)
    urlsPerBaseUrls = list(map(lambda soup: crawlRelevantUrlsPerSoup(soup, interestedIn, notInterestedIn), soups))
    return flattenUrlList(urlsPerBaseUrls, baseUrls)
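
# Illustrative usage (the url and keywords below are made-up placeholders, not part of this module):
#   crawlAllRelevantUrls(["https://www.cbsnews.com"], ["politics"], ["video"])
# collects every <a href> link on that page, keeps only urls containing "politics",
# drops urls containing "video", and resolves relative links against the base url.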


def webpageNotSupported(url):
    print("We are sorry, this webpage is not supported: " + url)


# Merge the per-base-url results into one set, resolving relative links ("/...")
# against the base url they were found on.
def flattenUrlList(urlsPerBaseUrls, baseUrls):
    flatList = set()
    index = 0
    for urls in urlsPerBaseUrls:
        for url in urls:
            if url.startswith("/"):
                flatList.add(baseUrls[index] + url)
            else:
                flatList.add(url)
        index += 1
    return flatList


def getSoups(urls):
    return list(map(lambda url: getSoupPerUrl(url), urls))


# Download one page and parse it into a BeautifulSoup tree.
def getSoupPerUrl(url):
    r = requests.get(url)
    return BeautifulSoup(r.content, 'html.parser')


# soup: BeautifulSoup() source of urls
# interestedIn: list(str) topics / keywords which MUST be in the url (ignored if empty)
# notInterestedIn: list(str) topics / keywords which MUST NOT be in the url (ignored if empty)
# return: filtered urls
def crawlRelevantUrlsPerSoup(soup, interestedIn, notInterestedIn):
    urls = set(map(lambda x: x['href'], soup.find_all('a', href=True)))
    return filterUrls(urls, interestedIn, notInterestedIn)


# Apply the default ignore list first, then the caller's include / exclude keywords.
def filterUrls(urls, interestedIn, notInterestedIn):
    urls = filter.findStringsNotContainingKeywords(urls, createIgnoredByDefaultList())
    if interestedIn != []:
        urls = filter.findStringsContainingKeyword(urls, interestedIn)
    if notInterestedIn != []:
        urls = filter.findStringsNotContainingKeywords(urls, notInterestedIn)
    return urls


# Dispatch the url to the crawler module that handles its news site.
def readNewsPage(url):
    if url.startswith("https://www.cbsnews.com"):
        # crawlCbsNews.printArticle(url)
        crawlCbsNews.printRelatedArticles(url)
    elif url.startswith("https://abcnews.go.com"):
        # crawlAbcNews.printArticle(url)
        crawlAbcNews.printRelatedArticles(url)
    elif url.startswith("https://www.bloomberg.com"):
        # crawlBloomberg.printArticle(url)
        crawlBloomberg.printRelatedArticles(url)
    elif url.startswith("https://www.cnbc.com"):
        # crawlCnbcNews.printArticle(url)
        crawlCnbcNews.printRelatedArticles(url)
    else:
        webpageNotSupported(url)
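

# A minimal driver sketch, not part of the original module: it uses the four front pages
# that readNewsPage supports as base urls, with made-up placeholder keywords, then hands
# every crawled url to readNewsPage.
if __name__ == "__main__":
    baseUrls = [
        "https://www.cbsnews.com",
        "https://abcnews.go.com",
        "https://www.bloomberg.com",
        "https://www.cnbc.com",
    ]
    interestedIn = ["news"]       # placeholder keyword, adjust to taste
    notInterestedIn = ["video"]   # placeholder keyword, adjust to taste
    for url in crawlAllRelevantUrls(baseUrls, interestedIn, notInterestedIn):
        readNewsPage(url)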