-
Notifications
You must be signed in to change notification settings - Fork 2
/
TableParser.py
97 lines (79 loc) · 3.58 KB
/
TableParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import requests
from bs4 import BeautifulSoup
import pyodbc
import re
from datetime import datetime, timedelta
import pytz
class TableParser:
    """Scrape one day of a room timetable page and optionally store it.

    The page is fetched from ``baseUrl`` with ``year``/``month``/``day``/
    ``area`` query parameters. The element with id ``day_main`` is expected
    to be a table with one row per half-hour slot (labelled ``HH:MM``) and
    one column per room, whose ``<th>`` header reads like ``Aula X (n)``.
    """

    TIMEZONE = 'Europe/Madrid'
    # First CSS class of cells that do not hold a booking (row labels and
    # free slots); any other first class is treated as the booking's degree.
    EMPTY_CELL_CLASSES = ['row_labels', 'even_row', 'odd_row']
    # Rows run from FIRST_HOUR:00 up to, but not including, LAST_HOUR:00.
    FIRST_HOUR = 8
    LAST_HOUR = 22
    # Minutes covered by one table row; one unit of ``rowspan``.
    SLOT_MINUTES = 30
    # Extracts the room identifier from a column header such as "Aula X (n)".
    _ROOM_PATTERN = re.compile(r"Aula (\S+) ?\(\d*\)", re.IGNORECASE)

    def __init__(self, baseUrl):
        """Remember the URL of the day-view page to scrape."""
        self.baseUrl = baseUrl

    @classmethod
    def _slot_labels(cls):
        """Return the zero-padded ``HH:MM`` row labels in table order."""
        return [
            "{:02d}:{:02d}".format(hour, minute)
            for hour in range(cls.FIRST_HOUR, cls.LAST_HOUR)
            for minute in range(0, 60, cls.SLOT_MINUTES)
        ]

    @staticmethod
    def _store(db, assignatura, aula, begins, ends, degree):
        """Insert one booking into ``classes`` unless an identical row exists."""
        lookup = db.cursor()
        lookup.execute("SELECT id FROM classes WHERE calendar_name = ? AND room = ? AND begins = ? AND ends = ? AND degree = ?",
                       assignatura, aula, begins, ends, degree)
        row = lookup.fetchone()
        if row:
            print("[WARNING] Ja estava a la DB (id " + str(row.id) + ")")
            return
        db.cursor().execute("INSERT INTO classes (calendar_name, room, begins, ends, degree) VALUES (?, ?, ?, ?, ?)",
                            assignatura, aula, begins, ends, degree)

    def parse(self, year, month, day, area, db=None):
        """Download the timetable for one day and report every booking found.

        Each booking is printed; when ``db`` is given it is also inserted
        into the ``classes`` table (skipping rows that already exist) and the
        transaction is committed once at the end.

        Args:
            year, month, day: Calendar date to fetch; also used to build the
                booking's start time in ``TIMEZONE``.
            area: Value sent as the ``area`` query parameter.
            db: Optional DB-API connection. Parameters are passed to
                ``cursor.execute`` as separate positional arguments and the
                fetched row is read via ``row.id`` -- presumably a pyodbc
                connection; confirm against the caller.

        Raises:
            AttributeError: If the page has no ``day_main`` element, a slot
                label is missing, or a room header does not match the
                expected ``Aula X (n)`` pattern.
        """
        params = {
            'year': year,
            'month': month,
            'day': day,
            'area': area
        }
        page = requests.get(self.baseUrl, params=params)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.find(id="day_main")

        slots = self._slot_labels()
        # spanned[r] = columns of row r that have no <td> of their own because
        # a booking starting in an earlier row covers them through rowspan.
        spanned = [set() for _ in slots]
        # Loop invariants: the table is never modified while parsing.
        headers = table.find_all("th")
        tz = pytz.timezone(self.TIMEZONE)

        for row_index, hora in enumerate(slots):
            cell = table.find(string=hora).find_next('td')
            column = 1
            # Walk the row's cells until the cell that repeats the hour label
            # (the row is assumed to end with a second label cell).
            while hora not in cell.get_text():
                # Skip columns occupied by a booking spanning down from above.
                while column in spanned[row_index]:
                    column += 1

                # .get() instead of ['class'] so a <td> without a class
                # attribute is treated as "not a booking" rather than raising.
                cell_classes = cell.get('class') or []
                if cell_classes and cell_classes[0] not in self.EMPTY_CELL_CLASSES:
                    assignatura = cell.get_text().strip()
                    degree = cell_classes[0]
                    aulaRaw = headers[column].get_text().strip()
                    aula = self._ROOM_PATTERN.match(aulaRaw).group(1)
                    # A cell without rowspan covers exactly one slot.
                    files = int(cell.get("rowspan", 1))
                    durada = files * self.SLOT_MINUTES

                    hour, minute = (int(part) for part in hora.split(':'))
                    beginsDateTime = tz.localize(datetime(year, month, day, hour, minute))
                    begins = int(beginsDateTime.timestamp())
                    ends = int((beginsDateTime + timedelta(minutes=durada)).timestamp())

                    print(("Afegint " if db is not None else "") + assignatura
                          + ", " + hora
                          + ", " + str(durada) + "mins"
                          + ", " + aula
                          + ", " + degree)

                    if db is not None:
                        self._store(db, assignatura, aula, begins, ends, degree)

                    # rowspan=N removes this column's <td> from the N-1 rows
                    # below; record that so those rows keep their cells
                    # aligned with the right room header.
                    for offset in range(1, files):
                        if row_index + offset < len(spanned):
                            spanned[row_index + offset].add(column)

                cell = cell.find_next('td')
                column += 1

        if db is not None:
            db.commit()