Skip to content

Commit a1c50bf

Browse files
psvenk and dtemkin1 authored
Include classes that lack schedules + split off IAP/summer (#100)
Fixes #27 and #7. In the scraper, replace the check that a class has a schedule with a check that a class is offered in the current term (with the fall/IAP/spring check done in `catalog.py` while the academic year check is done in `fireroad.py`), and add proper support for IAP and summer terms. --------- Co-authored-by: Diego Temkin <65834932+dtemkin1@users.noreply.github.com>
1 parent 4e7ee3f commit a1c50bf

File tree

13 files changed

+245
-97
lines changed

13 files changed

+245
-97
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@ dist-ssr
2626

2727
# artifacts
2828
scrapers/catalog.json
29-
scrapers/fireroad.json
29+
scrapers/fireroad-sem.json
30+
scrapers/fireroad-presem.json
3031
public/latest.json
32+
public/i25.json
3133

3234
# python
3335
__pycache__

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@ Let's say you're updating from e.g. Spring 2023 to Fall 2023.
3838

3939
First, archive the old semester. Make sure you have updated schedule files. Then run `mv public/latest.json public/s23.json`.
4040

41-
Then, update the new semester. Open `public/latestTerm.json`, change `urlName` to `f23`, and update the dates per [Registrar](https://registrar.mit.edu/calendar).
41+
Then, update the new semester. Open `public/latestTerm.json`, change the `urlName` under `preSemester` to `m23` (for the "pre-semester" term, summer 2023) and the `urlName` under `semester` to `f23` (for the semester, fall 2023), and update the dates in both sections per the [Registrar](https://registrar.mit.edu/calendar).
42+
43+
Next, update the `.gitignore` to ignore `public/m23.json` rather than `public/i23.json`.
4244

4345
Finally, run the normal update process and commit the results to the repo.
4446

public/latestTerm.json

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,27 @@
11
{
2-
"urlName": "s25",
3-
"startDate": "2025-02-03",
4-
"h1EndDate": "2025-03-21",
5-
"h2StartDate": "2025-03-31",
6-
"endDate": "2025-05-13",
7-
"mondayScheduleDate": "2025-02-18",
8-
"holidayDates": [
9-
"2025-02-17",
10-
"2025-03-24",
11-
"2025-03-25",
12-
"2025-03-26",
13-
"2025-03-27",
14-
"2025-03-28",
15-
"2025-04-21"
16-
]
2+
"preSemester": {
3+
"urlName": "i25",
4+
"startDate": "2025-01-06",
5+
"endDate": "2025-01-31",
6+
"holidayDates": [
7+
"2025-01-20"
8+
]
9+
},
10+
"semester": {
11+
"urlName": "s25",
12+
"startDate": "2025-02-03",
13+
"h1EndDate": "2025-03-21",
14+
"h2StartDate": "2025-03-31",
15+
"endDate": "2025-05-13",
16+
"mondayScheduleDate": "2025-02-18",
17+
"holidayDates": [
18+
"2025-02-17",
19+
"2025-03-24",
20+
"2025-03-25",
21+
"2025-03-26",
22+
"2025-03-27",
23+
"2025-03-28",
24+
"2025-04-21"
25+
]
26+
}
1727
}

scrapers/catalog.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,19 @@
2323
BASE_URL = "http://student.mit.edu/catalog"
2424

2525

26+
def is_not_offered_this_year(html):
27+
"""
28+
Args:
29+
* html (BeautifulSoup): the input webpage
30+
31+
Returns:
32+
* bool: True if the class is not offered this year
33+
"""
34+
if html.find(attrs={"src": "/icns/nooffer.gif"}):
35+
return True
36+
return False
37+
38+
2639
def is_not_offered_next_year(html):
2740
"""
2841
Args:
@@ -228,8 +241,9 @@ def scrape_courses_from_page(courses, href):
228241
filtered_html = BeautifulSoup()
229242
filtered_html.extend(content)
230243
course_data = get_course_data(filtered_html)
231-
for course_num in course_nums:
232-
courses[course_num] = course_data
244+
if not is_not_offered_this_year(filtered_html):
245+
for course_num in course_nums:
246+
courses[course_num] = course_data
233247

234248

235249
def run():

scrapers/fireroad.py

Lines changed: 78 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import json
2323
import requests
2424
import utils
25+
from utils import Term
2526

2627
URL = "https://fireroad.mit.edu/courses/all?full=true"
2728

@@ -88,20 +89,19 @@ def parse_section(section):
8889
return [slots, place]
8990

9091

91-
def parse_schedule(course):
92+
def parse_schedule(schedule):
9293
"""
9394
Parses the schedule string, which looks like:
9495
"Lecture,32-123/TR/0/11/F/0/2;Recitation,2-147/MW/0/10,2-142/MW/0/11"
9596
9697
Args:
97-
* course (dict[str, Union[bool, float, int, list[str], str]]): The course object.
98+
* schedule (str): The schedule string.
9899
99100
Returns:
100101
* dict[str, Union[list, bool]]: The parsed schedule
101102
102103
Raises AssertionError or KeyError if parse_section does.
103104
"""
104-
schedule = course["schedule"]
105105
section_tba = False
106106
result = {}
107107

@@ -205,18 +205,19 @@ def parse_prereqs(course):
205205
return {"prereqs": prereqs}
206206

207207

208-
def get_course_data(courses, course):
208+
def get_course_data(courses, course, term):
209209
"""
210210
Parses a course from the Fireroad API, and puts it in courses. Skips the
211-
courses Fireroad doesn't have schedule info for. Returns False if skipped,
211+
courses that are not offered in the current term. Returns False if skipped,
212212
True otherwise. The `courses` variable is modified in place.
213213
214214
Args:
215215
* courses (list[dict[str, Union[bool, float, int, list[str], str]]]): The list of courses.
216216
* course (dict[str, Union[bool, float, int, list[str], str]]): The course in particular.
217+
* term (Term): The current term (fall, IAP, or spring).
217218
218219
Returns:
219-
* bool: Whether Fireroad has schedule information for this course.
220+
* bool: Whether the course was entered into courses.
220221
"""
221222
course_code = course["subject_id"]
222223
course_num, course_class = course_code.split(".")
@@ -226,41 +227,72 @@ def get_course_data(courses, course):
226227
"subject": course_class,
227228
}
228229

229-
if "schedule" not in course:
230-
# TODO: Do something else with this?
231-
return False
230+
# terms, prereqs
231+
raw_class.update(parse_terms(course))
232+
raw_class.update(parse_prereqs(course))
232233

233-
# tb, s, l, r, b, lr, rr, br
234-
try:
235-
raw_class.update(parse_schedule(course))
236-
except Exception as e:
237-
# if we can't parse the schedule, warn
238-
print(f"Can't parse schedule {course_code}: {e!r}")
234+
if term.name not in raw_class["terms"]:
239235
return False
240236

241-
# hh, ha, hs, he, ci, cw, re, la, pl
237+
has_schedule = "schedule" in course
238+
239+
# tba, sectionKinds, lectureSections, recitationSections, labSections,
240+
# designSections, lectureRawSections, recitationRawSections, labRawSections,
241+
# designRawSections
242+
if has_schedule:
243+
try:
244+
if term == Term.FA and "scheduleFall" in course:
245+
raw_class.update(parse_schedule(course["scheduleFall"]))
246+
elif term == Term.JA and "scheduleIAP" in course:
247+
raw_class.update(parse_schedule(course["scheduleIAP"]))
248+
elif term == Term.SP and "scheduleSpring" in course:
249+
raw_class.update(parse_schedule(course["scheduleSpring"]))
250+
else:
251+
raw_class.update(parse_schedule(course["schedule"]))
252+
except Exception as e:
253+
# if we can't parse the schedule, warn
254+
print(f"Can't parse schedule {course_code}: {e!r}")
255+
has_schedule = False
256+
if not has_schedule:
257+
raw_class.update(
258+
{
259+
"tba": False,
260+
"sectionKinds": [],
261+
"lectureSections": [],
262+
"recitationSections": [],
263+
"labSections": [],
264+
"designSections": [],
265+
"lectureRawSections": [],
266+
"recitationRawSections": [],
267+
"labRawSections": [],
268+
"designRawSections": [],
269+
}
270+
)
271+
272+
# hassH, hassA, hassS, hassE, cih, cihw, rest, lab, partLab
242273
raw_class.update(parse_attributes(course))
243-
raw_class.update(
244-
{
245-
"lectureUnits": course["lecture_units"],
246-
"labUnits": course["lab_units"],
247-
"preparationUnits": course["preparation_units"],
248-
"level": course["level"],
249-
"isVariableUnits": course["is_variable_units"],
250-
"same": ", ".join(course.get("joint_subjects", [])),
251-
"meets": ", ".join(course.get("meets_with_subjects", [])),
252-
}
253-
)
254-
# This should be the case with variable-units classes, but just to make sure.
274+
try:
275+
raw_class.update(
276+
{
277+
"lectureUnits": course["lecture_units"],
278+
"labUnits": course["lab_units"],
279+
"preparationUnits": course["preparation_units"],
280+
"level": course["level"],
281+
"isVariableUnits": course["is_variable_units"],
282+
"same": ", ".join(course.get("joint_subjects", [])),
283+
"meets": ", ".join(course.get("meets_with_subjects", [])),
284+
}
285+
)
286+
except KeyError as e:
287+
print(f"Can't parse {course_code}: {e!r}")
288+
return False
289+
# This should be the case with variable-units classes, but just to make
290+
# sure.
255291
if raw_class["isVariableUnits"]:
256292
assert raw_class["lectureUnits"] == 0
257293
assert raw_class["labUnits"] == 0
258294
assert raw_class["preparationUnits"] == 0
259295

260-
# t, pr
261-
raw_class.update(parse_terms(course))
262-
raw_class.update(parse_prereqs(course))
263-
264296
raw_class.update(
265297
{
266298
"description": course.get("description", ""),
@@ -271,7 +303,7 @@ def get_course_data(courses, course):
271303
}
272304
)
273305

274-
# nx, rp, u, f, hf, lm are from catalog.json, not here
306+
# nonext, repeat, url, final, half, limited are from catalog.json, not here
275307

276308
if "old_id" in course:
277309
raw_class["oldNumber"] = course["old_id"]
@@ -289,27 +321,33 @@ def get_course_data(courses, course):
289321
return True
290322

291323

292-
def run():
324+
def run(is_semester_term):
293325
"""
294326
The main entry point. All data is written to `fireroad-sem.json` or `fireroad-presem.json`, depending on `is_semester_term`.
295327
296-
There are no arguments and there is no return value.
328+
Args:
329+
* is_semester_term (bool): whether to look at the semester term (fall/spring) or the pre-semester term (summer/IAP).
330+
331+
Returns: none
297332
"""
298333
text = requests.get(URL).text
299334
data = json.loads(text)
300335
courses = dict()
336+
term = utils.url_name_to_term(utils.get_term_info(is_semester_term)["urlName"])
337+
fname = "fireroad-sem.json" if is_semester_term else "fireroad-presem.json"
301338
missing = 0
302339

303340
for course in data:
304-
has_schedule = get_course_data(courses, course)
305-
if not has_schedule:
341+
included = get_course_data(courses, course, term)
342+
if not included:
306343
missing += 1
307344

308-
with open("fireroad.json", "w") as f:
345+
with open(fname, "w") as f:
309346
json.dump(courses, f)
310347
print(f"Got {len (courses)} courses")
311-
print(f"Skipped {missing} courses due to missing schedules")
348+
print(f"Skipped {missing} courses that are not offered in the {term.value} term")
312349

313350

314351
if __name__ == "__main__":
315-
run()
352+
run(False)
353+
run(True)

scrapers/package.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,29 +58,46 @@ def run():
5858
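Takes data from fireroad-presem.json, fireroad-sem.json, catalog.json, and overrides.json; outputs latest.json (semester) and a file named after the pre-semester term's urlName (e.g. i25.json).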
5959
There are no arguments and no return value.
6060
"""
61-
fireroad = load_json_data("fireroad.json")
61+
fireroad_presem = load_json_data("fireroad-presem.json")
62+
fireroad_sem = load_json_data("fireroad-sem.json")
6263
catalog = load_json_data("catalog.json")
6364
overrides = load_json_data("overrides.json")
6465

6566
# The key needs to be in BOTH fireroad and catalog to make it:
66-
# If it's not in Fireroad, we don't have its schedule.
67-
# If it's not in catalog, it's not offered this semester.
68-
courses = merge_data(
69-
datasets=[fireroad, catalog, overrides],
70-
keys_to_keep=set(fireroad) & set(catalog),
67+
# If it's not in Fireroad, it's not offered in this semester (fall, etc.).
68+
# If it's not in catalog, it's not offered this year.
69+
courses_presem = merge_data(
70+
datasets=[fireroad_presem, catalog, overrides],
71+
keys_to_keep=set(fireroad_presem) & set(catalog),
72+
)
73+
courses_sem = merge_data(
74+
datasets=[fireroad_sem, catalog, overrides],
75+
keys_to_keep=set(fireroad_sem) & set(catalog),
7176
)
7277

73-
term_info = utils.get_term_info()
78+
term_info_presem = utils.get_term_info(False)
79+
url_name_presem = term_info_presem["urlName"]
80+
term_info_sem = utils.get_term_info(True)
81+
url_name_sem = term_info_sem["urlName"]
7482
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
75-
obj = {
76-
"termInfo": term_info,
83+
84+
obj_presem = {
85+
"termInfo": term_info_presem,
86+
"lastUpdated": now,
87+
"classes": courses_presem,
88+
}
89+
obj_sem = {
90+
"termInfo": term_info_sem,
7791
"lastUpdated": now,
78-
"classes": courses,
92+
"classes": courses_sem,
7993
}
8094

95+
with open(f"../public/{url_name_presem}.json", mode="w", encoding="utf-8") as f:
96+
json.dump(obj_presem, f, separators=(",", ":"))
8197
with open("../public/latest.json", mode="w", encoding="utf-8") as f:
82-
json.dump(obj, f, separators=(",", ":"))
83-
print(f"Got {len(courses)} courses")
98+
json.dump(obj_sem, f, separators=(",", ":"))
99+
print(f"{url_name_presem}: got {len(courses_presem)} courses")
100+
print(f"{url_name_sem}: got {len(courses_sem)} courses")
84101

85102

86103
if __name__ == "__main__":

scrapers/update.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@ def run():
1616
"""
1717
This function is the entry point. There are no arguments.
1818
"""
19-
print("=== Update fireroad data ===")
20-
fireroad.run()
19+
print("=== Update fireroad data (pre-semester) ===")
20+
fireroad.run(False)
21+
print("=== Update fireroad data (semester) ===")
22+
fireroad.run(True)
2123
print("=== Update catalog data ===")
2224
catalog.run()
2325
print("=== Packaging ===")

0 commit comments

Comments
 (0)