scrape_data.py
# this file is not needed if the scraped data is already saved in a file
from bs4 import BeautifulSoup
from urllib.request import urlopen
from spell import Spell
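
# A minimal sketch of what the local `spell` module is assumed to provide;
# the real Spell class may differ, but this script only needs a name plus
# the cleaned-up HTML body:
#
#     class Spell:
#         def __init__(self, name, html):
#             self.name = name  # spell title taken from the page-title div
#             self.html = html  # cleaned HTML of the page-content div
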
def scrape_spell(url):
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    # get the spell name from the div with class "page-title"
    div = soup.find("div", class_="page-title")
    name = div.text.strip()
    print(name)
    # get the div with id "page-content"
    content = soup.find("div", {"id": "page-content"})
    # remove the last <p> tag
    content.find_all("p")[-1].decompose()
    # make all hyperlinks into normal text (no longer clickable)
    for link in content.find_all("a"):
        link.unwrap()
    # decrease the bottom margin of the first two <p> tags
    content.find_all("p")[0]["style"] = "margin-bottom: 8px;"
    content.find_all("p")[1]["style"] = "margin-bottom: 8px;"
    # add a line with the url as a hyperlink
    url_line = soup.new_tag("a", href=url)
    url_line.string = url
    # make the hyperlink less blue
    url_line["style"] = "color: #555555;"
    # add it to the end of the content
    content.append(url_line)
    return Spell(name, str(content))
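
# Illustrative call (the fireball URL follows the site's apparent
# "spell:<name>" scheme and is an assumption, not taken from this file):
#     fireball = scrape_spell("http://dnd5e.wikidot.com/spell:fireball")
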
def scrape_class(class_name):
    url = f"http://dnd5e.wikidot.com/spells:{class_name}"
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    spell_urls = []
    # each spell-level tab on the class page is a div with id "wiki-tab-0-<n>"
    tab_count = len(soup.find_all("div", {"id": lambda x: x and "wiki-tab-0-" in x}))
    for tab in range(tab_count):
        spell_list = soup.find("div", {"id": f"wiki-tab-0-{tab}"})
        # skip the table's header row, then collect each spell's url
        for row in spell_list.find_all("tr")[1:]:
            spell_urls.append("http://dnd5e.wikidot.com" + row.find("a")["href"])
    return spell_urls
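
# Example driver, a sketch only: scrape every spell for one class and pickle
# the Spell objects so later runs can skip scraping (per the note at the top
# of this file). The class name "wizard" and the output filename are
# illustrative assumptions, not taken from the original script.
if __name__ == "__main__":
    import pickle

    spells = [scrape_spell(spell_url) for spell_url in scrape_class("wizard")]
    with open("spells.pickle", "wb") as f:
        pickle.dump(spells, f)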