-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_scrape_words.py
executable file
·53 lines (41 loc) · 1.65 KB
/
web_scrape_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/python3
import os
import shutil
import time
import requests
from bs4 import BeautifulSoup
# Themed word lists are scraped from pages hosted under
# https://www.enchantedlearning.com/wordlist/
def _reset_words_dir(path: str) -> None:
    """Ensure *path* exists as an empty directory.

    The directory itself is kept and only its contents are removed, so there
    is no delete-then-recreate step that would need a pause in between.
    """
    os.makedirs(path, exist_ok=True)
    # Snapshot the entries first so nothing is deleted while still iterating.
    with os.scandir(path) as entries:
        stale = list(entries)
    for entry in stale:
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.remove(entry.path)


def get_words_create_files() -> dict:
    """Scrape themed word lists and store each theme in its own text file.

    For every page name listed below, the matching ``<name>.shtml`` page
    under https://www.enchantedlearning.com/wordlist/ is downloaded, the text
    of each ``<div class="wordlist-item">`` is lower-cased, and the words are
    written one per line to ``words/<name>.txt`` (relative to the current
    working directory). The ``words/`` directory is emptied before any file
    is written.

    Returns:
        A dictionary whose keys are the themes (page names) and whose values
        are the lists of lower-cased words scraped for that theme.

    Raises:
        requests.RequestException: if a page cannot be fetched, the request
            times out, or the server answers with an HTTP error status.
        OSError: if the ``words/`` directory or one of its files cannot be
            cleared, created or written.
    """
    base_url = "https://www.enchantedlearning.com/wordlist/"
    words_dir = "words"
    request_timeout = 10  # seconds; without it a stalled server blocks forever

    words_page_names = ["astronomy", "positivewords", "languages", "metals", "usstates", "vegetables"]
    synonym_words_page_names = ["big", "happy", "said"]

    _reset_words_dir(words_dir)

    words_dict = {}
    for page_name in words_page_names + synonym_words_page_names:
        page = requests.get(f"{base_url}{page_name}.shtml", timeout=request_timeout)
        # Fail loudly on 4xx/5xx instead of silently storing an empty list.
        page.raise_for_status()

        soup = BeautifulSoup(page.content, "html.parser")
        # Lower-case every word for game purposes.
        words_list = [
            item.text.lower()
            for item in soup.find_all("div", class_="wordlist-item")
        ]
        words_dict[page_name] = words_list

        # One file per theme, one word per line. The encoding is explicit so
        # non-ASCII words do not depend on the platform's default locale.
        file_path = os.path.join(words_dir, f"{page_name}.txt")
        with open(file_path, "w", encoding="utf-8") as words_file:
            words_file.writelines(f"{word}\n" for word in words_list)

    return words_dict