Skip to content

Commit

Permalink
overhaul scrapers/math_dept.py (#91)
Browse files Browse the repository at this point in the history
This is a major overhaul of `scrapers/math_dept.py`. Even though I know it
isn't actually being used in production, I still think it's good to make
all the code as clean and modular as possible for posterity.

More specifically, the main code in `math_dept.py` (all the random stuff
that used to be non-indented) is now divided into:

* `run()`, the main entry point, which uses three other helper
functions:
  * `get_rows()` for scraping
  * `parse_subject(subject)` to parse the title specifically
  * `parse_row(row)` to parse an individual row
* `test_parse_when()` as a sanity check that `parse_when(when)` works
properly
  • Loading branch information
duck-master authored Dec 7, 2024
1 parent acf26ba commit 018c5d9
Showing 1 changed file with 91 additions and 23 deletions.
114 changes: 91 additions & 23 deletions scrapers/math_dept.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,23 @@
"""
This isn't run automatically, but it is a temporary workaround to the math classes being wrong.
Used to generate the math overrides in package.py.
There is a large amount of main code.
Was used to generate the math overrides in package.py; currently unnecessary.
Functions:
* parse_when(when)
* test_parse_when()
* parse_many_timeslots(days, times)
* make_raw_sections(days, times, room)
* make_section_override(timeslots, room)
* get_rows()
* parse_subject(subject)
* parse_row(row)
* run()
"""

from bs4 import BeautifulSoup
from fireroad import parse_timeslot, parse_section
from pprint import pprint
from bs4 import BeautifulSoup
import requests

# TODO: move this huge wall of code into its own function

response = requests.get("https://math.mit.edu/academics/classes.html")
soup = BeautifulSoup(response.text, features="lxml")
course_list = soup.find("ul", {"class": "course-list"})
rows = course_list.findAll("li", recursive=False)
from fireroad import parse_timeslot, parse_section


def parse_when(when):
Expand Down Expand Up @@ -48,9 +45,17 @@ def parse_when(when):
return days, times


assert parse_when("F10:30-12") == ("F", "10.30-12")
assert parse_when("MW1") == ("MW", "1")
assert parse_when("MWF11") == ("MWF", "11")
def test_parse_when():
    """
    Sanity checks for parse_when

    Each input string must parse into the expected (days, times) pair;
    the first mismatch raises AssertionError.

    Args: none

    Returns: none
    """
    expected_by_input = {
        "F10:30-12": ("F", "10.30-12"),
        "MW1": ("MW", "1"),
        "MWF11": ("MWF", "11"),
    }
    for when, expected in expected_by_input.items():
        assert parse_when(when) == expected


def parse_many_timeslots(days, times):
Expand All @@ -59,13 +64,13 @@ def parse_many_timeslots(days, times):
Args:
* days (str): The days, one letter per day
* times (str): The timesloot
* times (str): The timeslot
Returns:
* list[list[int]]: All of the parsed timeslots, as a list
"""
# parse timeslot wants only one letter
return [parse_timeslot(day, times) for day in days]
return [parse_timeslot(day, times, False) for day in days]


def make_raw_sections(days, times, room):
Expand All @@ -88,7 +93,7 @@ def make_section_override(timeslots, room):
Makes a section override
Args:
* timeslots
* timeslots (list[list[int]]): The timeslots of the section
* room (str): The room
Returns:
Expand All @@ -99,10 +104,32 @@ def make_section_override(timeslots, room):
# return [[section, room] for section in timeslots]


overrides = {}
def get_rows():
    """
    Scrapes rows from https://math.mit.edu/academics/classes.html

    Args: none

    Returns:
    * bs4.element.ResultSet: The rows of the table listing classes

    Raises:
    * requests.HTTPError: If the page responds with an error status
    * ValueError: If the page has no <ul class="course-list"> element
    """
    response = requests.get("https://math.mit.edu/academics/classes.html", timeout=1)
    # Fail loudly on a 4xx/5xx response instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features="lxml")
    course_list = soup.find("ul", {"class": "course-list"})
    if course_list is None:
        # soup.find returns None when nothing matches; report that explicitly
        # rather than failing with an AttributeError on the next line.
        raise ValueError("could not find the course list on the math classes page")

    # recursive=False restricts the search to direct children of the list.
    return course_list.find_all("li", recursive=False)


def parse_subject(subject):
"""
Parses the subject
Args:
* subject (str): The subject name to parse
Returns:
* subjects (list[str]): A clean list of subjects corresponding to that subject.
"""
# remove "J" from joint subjects
subject = subject.replace("J", "")

Expand All @@ -115,23 +142,64 @@ def make_section_override(timeslots, room):
subjects = [subject]
assert ["/" not in subject for subject in subjects]

return subjects


def parse_row(row):
    """
    Parses the provided row

    Args:
    * row (bs4.element.Tag): The row that needs to be parsed.

    Returns:
    * dict[str, dict[str, list[Union[list[list[int]], str]]]]: The parsed row,
      keyed by subject. Empty when the row's time text contains ";".
    """
    subject_text = row.find("div", {"class": "subject"}).text
    subjects = parse_subject(subject_text)

    where_when = row.find("div", {"class": "where-when"})
    # Unpacking requires exactly two direct child divs: the time, then the room.
    when_div, where_div = where_when.find_all("div", recursive=False)
    when = when_div.text
    where = where_div.text

    if ";" in when:
        # Don't want to handle special case - calculus, already right
        return {}

    days, times = parse_when(when)
    timeslots = parse_many_timeslots(days, times)

    result = {}
    for subject in subjects:
        lecture_raw_sections = make_raw_sections(days, times, where)
        lecture_sections = make_section_override(timeslots, where)
        result[subject] = {
            "lectureRawSections": lecture_raw_sections,
            "lectureSections": lecture_sections,
        }
        # Cross-check: parsing the raw section string must reproduce the
        # override that was built directly from the timeslots.
        assert parse_section(lecture_raw_sections) == lecture_sections[0]
    return result


def run():
    """
    Main entry point: scrape every row and merge the parsed results

    Args: none

    Returns:
    * dict[str, dict[str, list[Union[list[list[int]], str]]]]: All the schedules
    """
    schedules = {}
    for row in get_rows():
        # parse_row returns a dict per row; later rows overwrite earlier
        # entries that share a subject key.
        schedules.update(parse_row(row))
    return schedules


pprint(overrides)
if __name__ == "__main__":
    # Run the parser sanity checks first, then scrape and print the result.
    test_parse_when()
    schedules = run()
    pprint(schedules)

0 comments on commit 018c5d9

Please sign in to comment.