-
Notifications
You must be signed in to change notification settings - Fork 2
/
webscraper.py
38 lines (28 loc) · 1.32 KB
/
webscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import requests
from bs4 import BeautifulSoup
def _split_course_heading(heading_text):
    """Split a course heading into ``(course_number, course_name)``.

    Only the first line of ``heading_text`` is considered. It is split on
    the first hyphen only, so hyphens inside the course name are kept.

    Returns:
        A ``(course_number, course_name)`` tuple of stripped strings, or
        ``None`` when the heading is empty or contains no hyphen.
    """
    lines = heading_text.strip().splitlines()
    if not lines:
        return None
    number, separator, title = lines[0].partition('-')
    if not separator:
        return None
    return number.strip(), title.strip()


def create_course_list():
    """Scrape the apps.ualberta.ca catalogue for course numbers and names.

    Fetches the catalogue index page, follows every link whose href starts
    with ``/catalogue/course/``, and reads each ``<h4 class="flex-grow-1">``
    heading on the linked page as a ``"<number> - <name>"`` pair.

    Returns:
        A tuple ``(course_number_list, course_name_list)`` of two parallel
        lists of strings. Each course number appears once, in the order it
        was first encountered; the name at the same index is the name from
        that first occurrence.

    Raises:
        requests.RequestException: if a page cannot be fetched (including
            when a request exceeds the timeout).
    """
    base_url = "https://apps.ualberta.ca"
    course_prefix = "/catalogue/course/"
    # Without a timeout a stalled connection would block forever.
    timeout_seconds = 30

    course_set = set()
    course_number_list = []
    course_name_list = []

    # One session reuses the connection across the many same-host requests.
    with requests.Session() as session:
        catalog = session.get(base_url + "/catalogue/course",
                              timeout=timeout_seconds)
        catalog_soup = BeautifulSoup(catalog.text, "html.parser")

        # All urls for courses are in <li><a href=url>CMPUT courses</a></li>
        for li in catalog_soup.find_all('li'):
            anchor = li.a
            if anchor is None:
                # Plain list items (no link) cannot point at a course page.
                continue
            url_end = anchor.get('href')
            if not url_end or not url_end.startswith(course_prefix):
                continue

            courses = session.get(base_url + url_end, timeout=timeout_seconds)
            course_soup = BeautifulSoup(courses.text, "html.parser")

            for name in course_soup.find_all('h4', class_='flex-grow-1'):
                parsed = _split_course_heading(name.text)
                if parsed is None:
                    # Heading does not look like "<number> - <name>".
                    continue
                course_number, course_name = parsed
                if course_number not in course_set:
                    course_set.add(course_number)
                    course_number_list.append(course_number)
                    course_name_list.append(course_name)

    return (course_number_list, course_name_list)
if __name__ == "__main__":
    # Script entry point: scrape the catalogue and print the resulting
    # (course numbers, course names) tuple.
    scraped_courses = create_course_list()
    print(scraped_courses)