scraper.py
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 17 20:34:41 2024

@author: djmcd
"""
import requests
from bs4 import BeautifulSoup


def readCBBR(team: str, year: str):
    """Scrape the Sports Reference per-game team page for a given team and year.

    Returns a dict of team and opponent per-game stats with national ranks,
    or 1 if the page or table cannot be parsed.
    """
    link = f'https://www.sports-reference.com/cbb/schools/{team}/men/{year}.html'
    url = requests.get(link)
    soup = BeautifulSoup(url.text, "html.parser")
    try:
        data = soup.find("div", attrs={"id": "switcher_per_game_team"}).find("table").find("tbody").find_all("tr")
        stats = data[0].find_all("td")      # team per-game stats
        ranks = data[1].find_all("td")      # team national ranks
        opp_stats = data[2].find_all("td")  # opponent per-game stats
        opp_ranks = data[3].find_all("td")  # opponent national ranks
    except (AttributeError, IndexError):
        # Table missing or page layout changed; callers check for the sentinel 1
        return 1
    # Field Goals
    fg = stats[2].text
    fgr = ranks[2].text
    ofg = opp_stats[2].text
    ofgr = opp_ranks[2].text
    # Field Goal Pct
    fgp = stats[4].text
    fgpr = ranks[4].text
    ofgp = opp_stats[4].text
    ofgpr = opp_ranks[4].text
    # Three Point Field Goals
    tp = stats[8].text
    tpr = ranks[8].text
    otp = opp_stats[8].text
    otpr = opp_ranks[8].text
    # Three Point Pct
    tpp = stats[10].text
    tppr = ranks[10].text
    otpp = opp_stats[10].text
    otppr = opp_ranks[10].text
    # Free Throws
    ft = stats[11].text
    ftr = ranks[11].text
    oft = opp_stats[11].text
    oftr = opp_ranks[11].text
    # Free Throw Pct
    ftp = stats[13].text
    ftpr = ranks[13].text
    oftp = opp_stats[13].text
    oftpr = opp_ranks[13].text
    # Total Rebounds
    tr = stats[16].text
    trr = ranks[16].text
    otr = opp_stats[16].text
    otrr = opp_ranks[16].text
    # Turnovers
    to = stats[20].text
    tor = ranks[20].text
    oto = opp_stats[20].text
    otor = opp_ranks[20].text
    # Points
    pt = stats[22].text
    ptr = ranks[22].text
    opt = opp_stats[22].text
    optr = opp_ranks[22].text
    res = {"FG": fg,
           "FG Rk": fgr,
           "Opp FG": ofg,
           "Opp FG Rk": ofgr,
           "FG%": fgp,
           "FG% Rk": fgpr,
           "Opp FG%": ofgp,
           "Opp FG% Rk": ofgpr,
           "3P": tp,
           "3P Rk": tpr,
           "Opp 3P": otp,
           "Opp 3P Rk": otpr,
           "3P%": tpp,
           "3P% Rk": tppr,
           "Opp 3P%": otpp,
           "Opp 3P% Rk": otppr,
           "FT": ft,
           "FT Rk": ftr,
           "Opp FT": oft,
           "Opp FT Rk": oftr,
           "FT%": ftp,
           "FT% Rk": ftpr,
           "Opp FT%": oftp,
           "Opp FT% Rk": oftpr,
           "TR": tr,
           "TR Rk": trr,
           "Opp TR": otr,
           "Opp TR Rk": otrr,
           "TO": to,
           "TO Rk": tor,
           "Opp TO": oto,
           "Opp TO Rk": otor,
           "Pts": pt,
           "Pts Rk": ptr,
           "Opp Pts": opt,
           "Opp Pts Rk": optr}
    return res
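

# Example usage of readCBBR. The team slug and season below are illustrative
# only: Sports Reference URLs use lowercase, hyphenated school slugs
# (e.g. "purdue", "north-carolina") and the season's ending year.
#
#   season = readCBBR("purdue", "2024")
#   if season != 1:
#       print(season["Pts"], "points per game, ranked", season["Pts Rk"])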
def readGameLog(team: str, year: str):
    """Scrape the Sports Reference game log for a given team and year.

    Returns a list of per-game dicts, or 1 if the log cannot be parsed.
    """
    link = f'https://www.sports-reference.com/cbb/schools/{team}/men/{year}-gamelogs.html'
    url = requests.get(link)
    soup = BeautifulSoup(url.text, "html.parser")
    try:
        gamelog = soup.find("div", attrs={"id": "div_sgl-basic_NCAAM"}).find("table").find("tbody").find_all("tr")
    except AttributeError:
        print("Error fetching games")
        return 1
    games = []
    for gm in gamelog:
        # Skip the header/spacer rows that Sports Reference repeats inside tbody
        if (gm.attrs == {"class": ["over_header", "thead"]} or gm.attrs == {"class": ["thead"]}):
            continue
        cols = gm.find_all("td")
        loc = cols[1].text
        opp = cols[2].text
        pt = cols[4].text
        opt = cols[5].text
        fgp = cols[8].text
        ofgp = cols[25].text
        tpp = cols[11].text
        otpp = cols[28].text
        tr = cols[16].text
        otr = cols[33].text
        to = cols[20].text
        oto = cols[37].text
        # Skip games without a score (not yet played)
        if pt == "":
            continue
        # A blank location column means a home game
        if loc == "":
            loc = "H"
        game = {"Opp": opp,
                "Loc": loc,
                "Pts": pt,
                "Opp Pts": opt,
                "Net Pts": int(pt) - int(opt),
                "FG%": fgp,
                "Opp FG%": ofgp,
                "3P%": tpp,
                "Opp 3P%": otpp,
                "TR": tr,
                "Opp TR": otr,
                "TO": to,
                "Opp TO": oto}
        games.append(game)
    return games
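

# Example usage of readGameLog (team slug and year are illustrative). Each
# entry is a per-game dict, so an average scoring margin might look like:
#
#   games = readGameLog("purdue", "2024")
#   if games != 1:
#       print(sum(g["Net Pts"] for g in games) / len(games))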
def readKenPom(team: str, year: str):
    """Scrape the KenPom ratings table for a given team and year.

    Returns a dict of efficiency metrics and ranks, or 1 if the team is not found.
    """
    link = f'https://kenpom.com/index.php?y={year}'
    url = requests.get(link, headers={
        'User-Agent': 'Popular browser\'s user-agent',
    })
    soup = BeautifulSoup(url.text, "html.parser")
    # Have to double loop here because every 40 teams is a new tbody
    tables = soup.find("div", attrs={"id": "table-wrapper"}).find("table").find_all("tbody")
    for table in tables:
        rows = table.find_all("tr")
        for row in rows:
            cols = row.find_all("td")
            # Skip spacer rows that have no team cell
            try:
                cols[1]
            except IndexError:
                continue
            # Tournament teams carry a seed suffix (e.g. " 1" or " 16"),
            # so strip the last 2 or 3 characters before comparing names
            if (cols[1].text.lower()[:-2] == team.lower() or cols[1].text.lower()[:-3] == team.lower()):
                rk = cols[0].text
                em = cols[4].text
                o = cols[5].text
                ork = cols[6].text
                d = cols[7].text
                drk = cols[8].text
                t = cols[9].text
                trk = cols[10].text
                l = cols[11].text
                lrk = cols[12].text
                sos = cols[13].text
                sosrk = cols[14].text
                oppo = cols[15].text
                oppork = cols[16].text
                oppd = cols[17].text
                oppdrk = cols[18].text
                data = {"EM": em,
                        "Rk": rk,
                        "Off": o,
                        "Off Rk": ork,
                        "Def": d,
                        "Def Rk": drk,
                        "Temp": t,
                        "Temp Rk": trk,
                        "Luck": l,
                        "Luck Rk": lrk,
                        "Opp EM": sos,
                        "Opp EM Rk": sosrk,
                        "Opp Off": oppo,
                        "Opp Off Rk": oppork,
                        "Opp Def": oppd,
                        "Opp Def Rk": oppdrk}
                return data
    return 1
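

# Minimal sketch tying the three scrapers together. The names and year below
# are assumptions for illustration: Sports Reference wants a URL slug
# ("purdue"), while readKenPom matches against the display name on kenpom.com
# and, per the seed-suffix check above, only finds tournament teams.
if __name__ == "__main__":
    slug, name, year = "purdue", "Purdue", "2024"
    print(readCBBR(slug, year))
    print(readGameLog(slug, year))
    print(readKenPom(name, year))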