-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstudienplan_to_json.py
441 lines (398 loc) · 17.1 KB
/
studienplan_to_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
import enum
import json
import re
import subprocess
import sys
import dateutil.parser
class GermanParserInfo(dateutil.parser.parserinfo):
    """Parser info that provides German month names to ``dateutil``.

    An instance is passed to ``dateutil.parser.parse`` by
    ``parse_studienplan`` when it reads the dates on the title page.
    """

    # One tuple per month, January through December. Each tuple lists the
    # accepted spellings: abbreviations and full names, including the
    # Austrian "Jänner" for January.
    MONTHS = [
        ("Jan", "Januar", "Jänner"),
        ("Feb", "Februar"),
        ("Mär", "Mrz", "März"),
        ("Apr", "April"),
        ("Mai",),
        ("Jun", "Juni"),
        ("Jul", "Juli"),
        ("Aug", "August"),
        ("Sep", "Sept", "September"),
        ("Okt", "Oktober"),
        ("Nov", "November"),
        ("Dez", "Dezember"),
    ]
class State(enum.Enum):
    """States of the line-based state machine in ``parse_studienplan``.

    The numeric values are only identifiers; the parser compares members
    for equality and never relies on their order.
    """

    # Title page: kind of study, its name, its number and the two dates.
    PREAMBLE = 1
    STUDIUM_TYPE = 2
    STUDIUM_NAME = 3
    STUDIUM_KENNZAHL = 4
    BESCHLUSS_DATUM = 5
    GUELTIG_DATUM = 6
    # Everything up to the heading "A. Modulbeschreibungen".
    INHALTSVERZEICHNIS = 7
    # List of Prüfungsfächer with their Module (near the end of the text).
    PRUEFUNGSFACH_MODUL_LVA = 12
    PRUEFUNGSFAECHER = 13
    PRUEFUNGSFACH_NAME = 14
    PRUEFUNGSFACH_MODUL = 15
    KURZBESCHREIBUNG_MODULE = 16
    # Module descriptions: name, ECTS, learning outcomes and list of LVAs.
    MODULBESCHREIBUNGEN = 17
    MODUL_NAME = 18
    MODUL_REGELARBEITSAUFWAND = 19
    MODUL_LERNERGEBNISSE = 20
    MODUL_LVAS = 21
    # Entered at the heading "B. Lehrveranstaltungstypen".
    LEHRVERANSTALTUNGSTYPEN = 23
    # Assignment of LVAs to semesters.
    SEMESTEREINTEILUNG = 24
    SEMESTEREINTEILUNG_SEMESTER = 25
    SEMESTEREINTEILUNG_LVA = 26
    SEMESTEREMPFEHLUNG_SCHIEFEINSTEIGEND = 27
    # Terminal state: the parser loop stops.
    END = 99
def next_line(lines, skip_empty=True, strip=True):
    """Fetch the next line from the iterator *lines*.

    Args:
        lines: Iterator yielding strings; it is advanced by this call.
        skip_empty: When true, lines that are empty (after optional
            stripping) are skipped instead of being returned.
        strip: When true, surrounding whitespace is removed from the line
            before it is checked and returned.

    Returns:
        The next (optionally stripped) line.

    Raises:
        StopIteration: When *lines* is exhausted.
    """
    while True:
        candidate = next(lines)
        candidate = candidate.strip() if strip else candidate
        if skip_empty and not candidate:
            # Nothing useful on this line, keep looking.
            continue
        return candidate
def parse_studienplan(text):
    """Parse the cleaned-up text of a curriculum PDF into a dictionary.

    The text is processed line by line with a state machine (see ``State``).
    A branch that does not fetch a new line hands the current line over to
    the state it switches to.

    Args:
        text: Text of the curriculum, as produced by ``cleanup_text``.

    Returns:
        A dict that, depending on how far parsing got, contains:

        * ``studium_type``, ``studium_name``, ``studienkennzahl``: strings
          taken from the title page.
        * ``beschluss_datum``, ``gueltig_datum``: ``datetime.date`` objects.
        * ``modulbeschreibungen``: list of dicts with ``name``, ``lvas``,
          ``regelarbeitsaufwand`` (``{"ects": str or None}``) and
          ``lernergebnisse``.
        * ``semestereinteilung``: dict mapping a semester heading to a list
          of LVA dicts (``ects``, ``lva_typ``, ``name``,
          ``not_steop_constrained``).
        * ``pruefungsfaecher``: list of dicts with ``name`` and ``module``
          (each module has ``name``, ``ects`` and ``wahl``).

        Running out of input is not an error: whatever has been collected
        up to that point is returned.

    Raises:
        AssertionError: If the line after the "mit Wirksamkeit" line does
            not start with "Gültig ab".
        AttributeError: If a line that must match one of the regular
            expressions does not match.
    """
    state = State.PREAMBLE
    lines = iter(text.splitlines())
    studienplan = {}
    try:
        line = next_line(lines)
        while True:
            if state == State.PREAMBLE:
                if line.startswith(("Bachelorstudium", "Masterstudium")):
                    state = State.STUDIUM_TYPE
                else:
                    line = next_line(lines)
            elif state == State.STUDIUM_TYPE:
                studienplan["studium_type"] = line
                state = State.STUDIUM_NAME
                line = next_line(lines)
            elif state == State.STUDIUM_NAME:
                studienplan["studium_name"] = line
                state = State.STUDIUM_KENNZAHL
                line = next_line(lines)
            elif state == State.STUDIUM_KENNZAHL:
                studienplan["studienkennzahl"] = line.replace(" ", "")
                state = State.BESCHLUSS_DATUM
                line = next_line(lines)
            elif state == State.BESCHLUSS_DATUM:
                # Skip lines until the one carrying the date is found.
                if line.startswith("mit Wirksamkeit"):
                    studienplan["beschluss_datum"] = dateutil.parser.parse(
                        line.replace("mit Wirksamkeit ", ""), GermanParserInfo()
                    ).date()
                    state = State.GUELTIG_DATUM
                line = next_line(lines)
            elif state == State.GUELTIG_DATUM:
                assert line.startswith("Gültig ab")
                studienplan["gueltig_datum"] = dateutil.parser.parse(
                    line.replace("Gültig ab ", ""), GermanParserInfo()
                ).date()
                state = State.INHALTSVERZEICHNIS
                line = next_line(lines)
            elif state == State.INHALTSVERZEICHNIS:
                # A lot of text in between is skipped.
                if line.startswith("A. Modulbeschreibungen"):
                    state = State.MODULBESCHREIBUNGEN
                line = next_line(lines)
            elif state == State.MODULBESCHREIBUNGEN:
                # The first module follows the line ending with this sentence.
                if line.endswith("ist in Anhang B im Detail erläutert."):
                    studienplan["modulbeschreibungen"] = []
                    modulbeschreibungen = studienplan["modulbeschreibungen"]
                    state = State.MODUL_NAME
                line = next_line(lines)
            elif state == State.MODUL_NAME:
                if line.startswith("B. Lehrveranstaltungstypen"):
                    state = State.LEHRVERANSTALTUNGSTYPEN
                else:
                    modul = {
                        "name": line.strip(),
                        "lvas": [],
                        "regelarbeitsaufwand": {"ects": None},
                        "lernergebnisse": [],
                    }
                    modulbeschreibungen.append(modul)
                    state = State.MODUL_REGELARBEITSAUFWAND
                    line = next_line(lines)
            elif state == State.MODUL_REGELARBEITSAUFWAND:
                # The ECTS line is optional: without it "ects" stays None and
                # the current line is handed on to the next state.
                if line.startswith("Regelarbeitsaufwand:"):
                    modul["regelarbeitsaufwand"]["ects"] = line.replace(
                        "Regelarbeitsaufwand: ", ""
                    ).replace(" ECTS", "")
                    line = next_line(lines)
                state = State.MODUL_LERNERGEBNISSE
            elif state == State.MODUL_LERNERGEBNISSE:
                if line.startswith("Lehrveranstaltungen des Moduls:"):
                    state = State.MODUL_LVAS
                    line = next_line(lines, strip=False)
                elif line.endswith("Individuell nach gewählten Modulen/LVAs."):
                    # Bachelor Technische Informatik has two Module that do
                    # not have a list of LVAs.
                    state = State.MODUL_NAME
                    line = next_line(lines)
                else:
                    modul["lernergebnisse"].append(line)
                    line = next_line(lines)
                    # Stay in the same state to potentially add another line
                    # to Lernergebnisse.
                    continue
                # Lernergebnisse is fully parsed: turn the collected lines
                # into one string without the heading.
                modul["lernergebnisse"] = (
                    "\n".join(modul["lernergebnisse"])
                    .replace("Lernergebnisse:", "")
                    .strip()
                )
            elif state == State.MODUL_LVAS:
                # Line is not stripped so we can distinguish between a
                # continued LVA name, a new LVA and a new module.
                if re.match(r"^((?:\*|\s)\s*\d|\d\d)[,.]\d", line):
                    # The Modul "Software Engineering und Projektmanagement"
                    # in Medizinische Informatik has a special rule.
                    lva = re.match(
                        r"(?:\*\s*)?(?P<ects>\d{1,2}[,.]\d)/(?P<sst>\d{1,2}[,.]\d)\s*"
                        + r"(?P<lva_typ>[A-Z]+)\s+(?P<name>.*)",
                        line.strip(),
                    ).groupdict()
                    # Normalize spaces in name. The pattern is a raw string:
                    # "\s" in a normal string literal is an invalid escape
                    # sequence.
                    lva["name"] = re.sub(r"\s+", " ", lva["name"])
                    modul["lvas"].append(lva)
                    line = next_line(lines, strip=False)
                elif line.startswith(" ") and line.strip():
                    # LVA name goes over two lines.
                    modul["lvas"][-1]["name"] += " " + line.strip()
                    line = next_line(lines, strip=False)
                elif "zentralen Wahlfachkatalog der TU Wien" in line:
                    # The Modul "Freie Wahlfächer und Transferable Skills"
                    # doesn't have a list of LVAs. Just skip the description.
                    line = next_line(lines)
                    state = State.MODUL_NAME
                elif len(modul["lvas"]) == 0 or line in ["Verpflichtend:", "Wahl:"]:
                    # There might be some text before/in the list of LVAs
                    # that we just skip.
                    line = next_line(lines, strip=False)
                else:
                    # Anything else is the name of the next module.
                    state = State.MODUL_NAME
            elif state == State.LEHRVERANSTALTUNGSTYPEN:
                # A lot of text in between is skipped.
                if "Semestereinteilung der Lehrveranstaltungen" in line:
                    # Can be appendix D or C.
                    state = State.SEMESTEREINTEILUNG
                    studienplan["semestereinteilung"] = {}
                    semestereinteilung = studienplan["semestereinteilung"]
                line = next_line(lines)
            elif state == State.SEMESTEREINTEILUNG:
                if line.endswith(("Semester (WS)", "Semester (SS)")):
                    state = State.SEMESTEREINTEILUNG_SEMESTER
                else:
                    line = next_line(lines)
            elif state == State.SEMESTEREINTEILUNG_SEMESTER:
                # The semester heading itself is used as the key.
                semestereinteilung[line] = []
                semester = semestereinteilung[line]
                state = State.SEMESTEREINTEILUNG_LVA
                line = next_line(lines)
            elif state == State.SEMESTEREINTEILUNG_LVA:
                if line.endswith(("Semester (WS)", "Semester (SS)")):
                    state = State.SEMESTEREINTEILUNG_SEMESTER
                elif line.startswith("E. Semesterempfehlung"):
                    # Bachelor
                    state = State.SEMESTEREMPFEHLUNG_SCHIEFEINSTEIGEND
                elif line.startswith("D. Prüfungsfächer mit den zugeordneten Modulen"):
                    # Master
                    state = State.PRUEFUNGSFAECHER
                else:
                    match = re.match(
                        r"(?P<not_steop_constrained>\*)?\s*(?P<ects>\d{1,2},\d)\s*"
                        + r"(?P<lva_typ>[A-Z]+)\s+(?P<name>.*)",
                        line,
                    )
                    if match:
                        lva = match.groupdict()
                        # The group captures the leading "*"; the stored flag
                        # is False for starred lines and True otherwise.
                        lva["not_steop_constrained"] = (
                            lva["not_steop_constrained"] != "*"
                        )
                        semester.append(lva)
                    # Lines that are not an LVA are silently skipped.
                    line = next_line(lines)
            elif state == State.SEMESTEREMPFEHLUNG_SCHIEFEINSTEIGEND:
                # A lot of text in between is skipped.
                if "Prüfungsfächer mit den zugeordneten Modulen" in line:
                    # Can be appendix D or G, depending on Bachelor or Master.
                    state = State.PRUEFUNGSFAECHER
                line = next_line(lines)
            elif state == State.PRUEFUNGSFAECHER:
                if line.startswith("Prüfungsfach"):
                    studienplan["pruefungsfaecher"] = []
                    pruefungsfaecher = studienplan["pruefungsfaecher"]
                    state = State.PRUEFUNGSFACH_NAME
                else:
                    line = next_line(lines)
            elif state == State.PRUEFUNGSFACH_NAME:
                if line.startswith("Prüfungsfach"):
                    pruefungsfach = {"name": line, "module": []}
                    pruefungsfaecher.append(pruefungsfach)
                    line = next_line(lines)
                elif line.startswith(("Modul", "*Modul")):
                    # The (possibly multi-line) heading is complete: keep
                    # only the quoted name.
                    pruefungsfach["name"] = re.match(
                        r'Prüfungsfach "([^"]+)"', pruefungsfach["name"]
                    ).group(1)
                    state = State.PRUEFUNGSFACH_MODUL
                elif line.startswith("H. Bachelor-Abschluss mit Honors"):
                    state = State.END
                elif pruefungsfach["name"] == 'Prüfungsfach "Diplomarbeit"':
                    # Special case for Diplomarbeit which doesn't have a Modul.
                    pruefungsfach["name"] = "Diplomarbeit"
                    state = State.END
                else:
                    # Continuing Prüfungsfach name
                    pruefungsfach["name"] += " " + line
                    line = next_line(lines)
            elif state == State.PRUEFUNGSFACH_MODUL:
                # The fixing of quotes is not 100% perfect so we don't rely on
                # the fact that the name of the Modul is within quotes. We
                # parse the name with quotes.
                modul = re.match(
                    r"(?P<wahl>\*)?Modul "
                    + r"(?:(?P<name>.+)\s+\((?P<ects>.*) ECTS\)|(?P<name_no_ects>.+))",
                    line,
                ).groupdict()
                name_no_ects = modul.pop("name_no_ects")
                if name_no_ects:
                    modul["name"] = name_no_ects
                # And remove the quotes here.
                modul["name"] = modul["name"].replace('"', "")
                modul["wahl"] = modul["wahl"] == "*"
                pruefungsfach["module"].append(modul)
                state = State.PRUEFUNGSFACH_MODUL_LVA
                line = next_line(lines)
            elif state == State.PRUEFUNGSFACH_MODUL_LVA:
                if line.startswith(("Modul", "*Modul")):
                    state = State.PRUEFUNGSFACH_MODUL
                elif line.startswith("Prüfungsfach"):
                    state = State.PRUEFUNGSFACH_NAME
                else:
                    # TODO Skip list of LVAs for now.
                    line = next_line(lines)
            elif state == State.END:
                break
    except StopIteration:
        # Input exhausted: return what has been parsed so far.
        pass
    return studienplan
def read_pdf(filename):
    """Extract the text of a PDF with the external ``pdftotext`` tool.

    ``pdftotext`` is run with ``-layout`` (keep the physical layout),
    ``-nopgbrk`` (no page break characters) and a crop area given by
    ``-x 72 -y 72 -W 460 -H 650``, and writes to standard output (``-``).
    NOTE(review): the crop area presumably cuts away page headers and
    footers; confirm against the PDFs in use.

    Args:
        filename: Path of the PDF file.

    Returns:
        The extracted text, decoded as UTF-8.

    Raises:
        FileNotFoundError: If ``pdftotext`` is not installed.
        subprocess.CalledProcessError: If ``pdftotext`` exits with a
            non-zero status; its error output is available as ``stderr``.
    """
    result = subprocess.run(
        [
            "pdftotext",
            "-nopgbrk",
            "-layout",
            "-x",
            "72",
            "-y",
            "72",
            "-W",
            "460",
            "-H",
            "650",
            filename,
            "-",
        ],
        encoding="utf8",
        capture_output=True,
        # Fail loudly instead of returning empty text when the conversion
        # did not work.
        check=True,
    )
    return result.stdout
def dehyphenate(text):
    """Join words that were hyphenated at the end of a line.

    Every hyphen directly followed by a newline is removed together with
    that newline and all whitespace after it.

    Args:
        text: Text possibly containing end-of-line hyphenation.

    Returns:
        The text with those hyphens and line breaks removed.
    """
    joined = text
    while "-\n" in joined:
        joined = re.sub("-\n\\s*", "", joined)
    return joined
def _insert_opening_quote(above, column):
    """Insert ``"`` in *above* before the word found at *column*."""
    if not above:
        # Nothing to attach the quote to; the line becomes just the quote.
        return '"'
    # A mark beyond the end of the line refers to the last word.
    column = min(column, len(above) - 1)
    if above[column] == " ":
        word_start = column + 1
    else:
        # rfind() gives -1 when the word starts at column 0, which places
        # the quote at the very beginning of the line.
        word_start = above.rfind(" ", 0, column) + 1
    return above[:word_start] + '"' + above[word_start:]


def fix_quotes(text):
    """Replace typographic quotes with plain ``"`` characters.

    Every ``“`` (U+201C) is replaced in place. A ``”`` (U+201D) is blanked
    out in its own line, and a ``"`` is inserted into the line above it,
    in front of the word at the same column. A ``”`` in the very first line
    has no line above and is replaced in place.

    Args:
        text: Text with typographic quotes.

    Returns:
        The text with all lines kept (joined with ``"\\n"``) and the quotes
        normalized.
    """
    text = text.replace("“", '"')
    fixed_text = []
    prev_line = None
    for line in text.splitlines():
        while "”" in line:
            if prev_line is None:
                line = line.replace("”", '"', 1)
                continue
            i_quote = line.index("”")
            # Keep the line length so the columns of later marks stay valid.
            line = line.replace("”", " ", 1)
            prev_line = _insert_opening_quote(prev_line, i_quote)
        # A line is final only once the line below it has been inspected.
        if prev_line is not None:
            fixed_text.append(prev_line)
        prev_line = line
    if prev_line is not None:
        # The last line has no successor that would flush it.
        fixed_text.append(prev_line)
    return "\n".join(fixed_text)
def remove_footnotes(text):
    """Drop footnotes from the text.

    A footnote starts at a line that consists of one space followed by a
    single digit. That line and all directly following lines that start
    with a space are removed.

    Args:
        text: Text possibly containing footnotes.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    kept = []
    skipping = False
    for row in text.splitlines():
        if re.match(r"^ \d$", row):
            # Footnote marker: start skipping.
            skipping = True
        elif not (skipping and row.startswith(" ")):
            # Regular line; it also ends a footnote that was being skipped.
            skipping = False
            kept.append(row)
    return "\n".join(kept)
def cleanup_text(text):
    """Apply all clean-up passes to the raw text of the PDF.

    The passes run in a fixed order: ``fix_quotes``, ``dehyphenate``,
    ``remove_footnotes``. Quotes are fixed first because that pass works
    with the columns of neighbouring lines, which the later passes change.

    Args:
        text: Raw text as returned by ``read_pdf``.

    Returns:
        The cleaned-up text.
    """
    for cleanup_pass in (fix_quotes, dehyphenate, remove_footnotes):
        text = cleanup_pass(text)
    return text
def condense_studienplan(studienplan):
    """Merge module descriptions and semester data into the Prüfungsfächer.

    Every Modul listed under a Prüfungsfach receives the ``lernergebnisse``
    and ``lvas`` of its description, and every LVA receives ``semester`` and
    ``not_steop_constrained`` from the semester plan (``None`` and ``False``
    when the LVA is not listed there). Used entries are removed from
    ``modulbeschreibungen`` and ``semestereinteilung``; both keys are
    deleted afterwards. Finally the Prüfungsfächer and their Module are
    sorted by name. The dict is modified in place.

    Args:
        studienplan: Dict as returned by ``parse_studienplan``.

    Raises:
        ValueError: If a Modul has no description and its name does not
            start with "Projekt aus ".
        AssertionError: If the ECTS of a Modul and its description differ,
            or if descriptions or semester entries are left over.
    """

    def _take_description(wanted_name):
        """Remove and return the description of the Modul *wanted_name*."""
        descriptions = studienplan["modulbeschreibungen"]
        for position, description in enumerate(descriptions):
            if description["name"] == wanted_name:
                return descriptions.pop(position)
        raise ValueError(f"Modulbeschreibung for {wanted_name} not found!")

    def _take_semester_entry(course):
        """Remove the semester entry of *course*; return (semester, flag)."""
        for semester_name, entries in studienplan["semestereinteilung"].items():
            for position, entry in enumerate(entries):
                same_course = (
                    course["name"] == entry["name"]
                    and course["lva_typ"] == entry["lva_typ"]
                    and course["ects"] == entry["ects"]
                )
                if same_course:
                    del entries[position]
                    return semester_name, entry["not_steop_constrained"]
        return None, False

    for subject in studienplan["pruefungsfaecher"]:
        for module in subject["module"]:
            try:
                description = _take_description(module["name"])
            except ValueError:
                # The Modul "Projekt aus Software Engineering &
                # Projektmanagement" is part of every Prüfungsfach. Its
                # description is consumed by the first one, so it is
                # expected to be missing for the others.
                if not module["name"].startswith("Projekt aus "):
                    raise
                continue
            assert description["regelarbeitsaufwand"]["ects"] == module["ects"]
            module["lernergebnisse"] = description["lernergebnisse"]
            module["lvas"] = description["lvas"]
            for course in module["lvas"]:
                semester_name, flag = _take_semester_entry(course)
                course["semester"] = semester_name
                course["not_steop_constrained"] = flag

    # Everything must have been consumed before the redundant data goes.
    assert studienplan["modulbeschreibungen"] == []
    del studienplan["modulbeschreibungen"]
    for leftover in studienplan["semestereinteilung"].values():
        assert leftover == []
    del studienplan["semestereinteilung"]

    # Sort Prüfungsfächer and, within each, the Module by name.
    subjects = sorted(studienplan["pruefungsfaecher"], key=lambda p: p["name"])
    studienplan["pruefungsfaecher"] = subjects
    for subject in subjects:
        subject["module"] = sorted(subject["module"], key=lambda m: m["name"])
def main():
    """Convert the curriculum PDF named on the command line to JSON.

    The PDF given as first command line argument is read, cleaned up,
    parsed and condensed. The resulting list of Prüfungsfächer is written
    next to the PDF, to a file with the same name but the suffix ``.json``.

    Raises:
        SystemExit: If no PDF file was given on the command line.
    """
    # Imported here so that this function does not depend on a change of
    # the imports at the top of the file.
    from pathlib import Path

    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} STUDIENPLAN.pdf")
    pdf_path = sys.argv[1]

    text = cleanup_text(read_pdf(pdf_path))
    studienplan = parse_studienplan(text)
    condense_studienplan(studienplan)

    # Swap only the file suffix. A plain str.replace("pdf", "json") would
    # also rewrite "pdf" in directory names and would leave e.g. "x.PDF"
    # unchanged, in which case the JSON would overwrite the input file.
    json_path = Path(pdf_path).with_suffix(".json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(studienplan["pruefungsfaecher"], f, indent=4, sort_keys=True)


if __name__ == "__main__":
    main()