calculate.py

import json

import pandas as pd
import yaml

# Load the scraped Google Maps library data.
with open("data_out/data.json") as f:
    data = json.load(f)

print(len(data))
print(
    len([d for d in data if "opening_hours" in d and "periods" in d["opening_hours"]])
)


def parse_hour(time):
    """Convert an "HHMM" string into a fractional hour."""
    assert len(time) == 4
    hours = int(time[:2])
    minutes = int(time[2:])
    return hours + minutes / 60
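
# Example: parse_hour("2130") evaluates to 21.5.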


def parse_period(period):
    """Return the fraction of the week covered by a single opening period."""
    opens = period["open"]["day"] * 24 + parse_hour(period["open"]["time"])
    closes = period["close"]["day"] * 24 + parse_hour(period["close"]["time"])
    week = 24 * 7
    # The modulo handles periods that wrap around the end of the week.
    return ((closes - opens) % week) / week
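
# Example: a period that opens Monday 09:00 (day 1) and closes Monday 17:00
# covers (41 - 33) % 168 / 168, i.e. 8 of the week's 168 hours, roughly 0.048.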


def calc_times(entry):
    """Fraction of the week the place is open, or None if hours are missing."""
    if "opening_hours" not in entry:
        return None
    oh = entry["opening_hours"]
    if "periods" not in oh:
        return None
    periods = oh["periods"]
    # Google encodes "always open" as a single Sunday-00:00 period with no close.
    always_open_convention = [{"open": {"day": 0, "time": "0000"}}]
    if periods == always_open_convention:
        return 1
    return sum(parse_period(period) for period in periods)
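
# Example: a library open 09:00-17:00 Monday to Friday scores 40 / 168, about 0.24.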


# For each university, keep the library that is open the largest share of the week.
ranking = []
unis = set(d["uni"] for d in data)
for uni in unis:
    libraries = [
        {
            "uni": uni,
            "library": d["name"],
            "gmaps_rating_original": d.get("rating"),
            "gmaps_rating_count": d.get("user_ratings_total"),
            "opening_hours": round(calc_times(d) * 100) / 100,
            "opening_hours_percent": str(round(calc_times(d) * 100)) + "%",
        }
        for d in data
        if d["uni"] == uni and calc_times(d) is not None
    ]
    if len(libraries) > 0:
        lib = max(libraries, key=lambda x: x["opening_hours"])
        ranking.append(lib)

# Library opening-time ranking.
df = pd.DataFrame(ranking)
df["rank"] = df["opening_hours"].rank(ascending=False, method="min")
df["rank"] = df["rank"].apply(round)
# Move the newly added "rank" column to the front.
cols = df.columns.to_list()
df = df[[cols[-1], *cols[:-1]]]


# Add QS World University Rankings.
def get_rank(uni):
    rank = qs[qs["Institution Name"] == uni].iloc[0]["RANK"]
    if isinstance(rank, int):
        return str(rank)
    if isinstance(rank, float):
        if pd.isna(rank):
            return "?"
        rank = int(rank)
    if isinstance(rank, str):
        # Ranks can appear as strings like "=12", "601-650" or "1201+".
        rank = rank.strip(" =+")
        if "-" in rank:
            rank = rank.split("-")[0]
    return str(int(rank))


qs = pd.read_excel("data_in/2023 QS World University Rankings V2.1.xlsx")
df["qs_rank"] = df["uni"].apply(get_rank)
df["uni"] = df["uni"].apply(lambda s: s.strip())


# Bayesian average of Google Maps ratings: shrink ratings backed by few reviews
# towards the mean rating across all libraries.
def bayesian_avg(row):
    c = df["gmaps_rating_count"].quantile(0.25)
    prior = df["gmaps_rating_original"].mean()
    avg = (row["gmaps_rating_original"] * row["gmaps_rating_count"] + prior * c) / (
        row["gmaps_rating_count"] + c
    )
    if pd.isna(avg):
        return None
    return avg
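
# Illustrative example (made-up numbers): with a prior of 4.3 and c = 20, a
# library rated 5.0 from 3 reviews gets (5.0 * 3 + 4.3 * 20) / (3 + 20) ≈ 4.39.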


def stretch(x):
    """Rescale the shrunken ratings back onto a 1-5 range, one decimal place."""
    if pd.isna(x):
        return x
    lowest = df["gmaps_rating"].min()
    highest = df["gmaps_rating"].max()
    return round(((x - lowest) / (highest - lowest) * 4 + 1) * 10) / 10


df["gmaps_rating"] = df.apply(bayesian_avg, axis=1)
df["gmaps_rating"] = df["gmaps_rating"].apply(stretch)

df = df.sort_values(by=["opening_hours", "gmaps_rating"], ascending=[False, False])
df.to_csv("data_out/ranking.csv", index=False)

with open("data_out/ranking.yml", "w") as f:
    yaml.dump(df.to_dict(orient="records"), f)