
Commit 1a32856

Added ability to scrape the players' and teams' unique IDs assigned by FBref
1 parent ef6e7d3 commit 1a32856

2 files changed, +65 -27 lines


fbrefdata/fbref.py

Lines changed: 64 additions & 26 deletions
@@ -1,4 +1,5 @@
 """Scraper for http://fbref.com."""
+
 import warnings
 from datetime import date, datetime
 from functools import reduce
@@ -336,8 +337,7 @@ def read_team_season_stats( # noqa: C901
             .rename(columns={"Squad": "team", "# Pl": "players_used"})
             .replace({"team": get_team_replacements()})
             # .pipe(standardize_colnames)
-            .set_index(["league", "season", "team"])
-            .sort_index()
+            .set_index(["id"])
         )
         return df

@@ -432,7 +432,8 @@ def read_team_match_stats( # noqa: C901

         # collect match logs for each team
         stats = []
-        for (_, skey, team), team_url in iterator.url.items():
+        iterator.set_index(["season", "team"], append=True, inplace=True)
+        for (id, skey, team), team_url in iterator.url.items():
             # read html page
             filepath = self.data_dir / filemask.format(team, skey, stat_type)
             if len(team_url.split('/')) == 6:  # already have season in the url
@@ -473,13 +474,16 @@ def read_team_match_stats( # noqa: C901
             df_table = _parse_table(html_table)
             df_table["season"] = skey
             df_table["team"] = team
+            df_table["id"] = id
             df_table["Time"] = [
                 x.get('csk', None) for x in html_table.xpath(".//td[@data-stat='start_time']")
             ]
             df_table["Match Report"] = [
-                mlink.xpath("./a/@href")[0]
-                if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
-                else None
+                (
+                    mlink.xpath("./a/@href")[0]
+                    if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
+                    else None
+                )
                 for mlink in html_table.xpath(".//td[@data-stat='match_report']")
             ]
             nb_levels = df_table.columns.nlevels
@@ -527,7 +531,7 @@ def read_team_match_stats( # noqa: C901
             lambda x: x["team"] if x["venue"] == "Away" else x["opponent"], axis=1
         )
         df["game"] = df_tmp.apply(make_game_id, axis=1)
-        return df.set_index(["league", "season", "team", "game"]).sort_index().loc[self.leagues]
+        return df.set_index(["league"]).sort_index().loc[self.leagues]

     def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:  # noqa: C901
         """Retrieve players from the datasource for the selected leagues and seasons.
@@ -614,7 +618,7 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:
             for elem in tree.xpath("//td[@data-stat='comp_level']//span"):
                 elem.getparent().remove(elem)
             if big_five:
-                df_table = _parse_table(tree)
+                df_table = _parse_table(tree, player_table=True)
                 df_table[("Unnamed: league", "league")] = (
                     df_table.xs("Comp", axis=1, level=1).squeeze().map(BIG_FIVE_DICT)
                 )
@@ -647,8 +651,7 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:
             .rename(columns={"Squad": "team"})
             .replace({"team": get_team_replacements()})
             .pipe(standardize_colnames, cols=["Player", "Nation", "Pos", "Age", "Born"])
-            .set_index(["league", "season", "team", "player"])
-            .sort_index()
+            .set_index(["id"])
         )

         return df
@@ -679,19 +682,26 @@ def read_schedule(self, force_cache: bool = False) -> pd.DataFrame:
             if type(data) is not pd.DataFrame:
                 tree = html.parse(data)

-                url_fixtures = FBREF_API + tree.xpath("//a[text()='Scores & Fixtures']")[0].get("href")
+                url_fixtures = FBREF_API + tree.xpath("//a[text()='Scores & Fixtures']")[0].get(
+                    "href"
+                )
                 filepath_fixtures = self.data_dir / f"schedule_{lkey}_{skey}.csv"
                 current_season = not self._is_complete(lkey, skey)
                 data_fixtures = self.get(
-                    url_fixtures, filepath_fixtures, no_cache=current_season and not force_cache, header=[0, 1]
+                    url_fixtures,
+                    filepath_fixtures,
+                    no_cache=current_season and not force_cache,
+                    header=[0, 1],
                 )
                 tree = html.parse(data_fixtures)
                 html_table = tree.xpath("//table[contains(@id, 'sched')]")[0]
                 df_table = _parse_table(html_table)
                 df_table["Match Report"] = [
-                    mlink.xpath("./a/@href")[0]
-                    if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
-                    else None
+                    (
+                        mlink.xpath("./a/@href")[0]
+                        if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
+                        else None
+                    )
                     for mlink in html_table.xpath(".//td[@data-stat='match_report']")
                 ]
                 df_table["league"] = lkey
@@ -841,7 +851,9 @@ def read_player_match_stats(
                         df_table["game_id"] = game["game_id"]
                         dfs.append(df_table)
                     else:
-                        logger.warning("No stats found for home team for game with id=%s", game["game_id"])
+                        logger.warning(
+                            "No stats found for home team for game with id=%s", game["game_id"]
+                        )
                     html_table = tree.find("//table[@id='" + id_format.format(away_team["id"]) + "']")
                     if html_table is not None:
                         df_table = _parse_table(html_table)
@@ -852,7 +864,9 @@ def read_player_match_stats(
                         df_table["game_id"] = game["game_id"]
                         dfs.append(df_table)
                     else:
-                        logger.warning("No stats found for away team for game with id=%s", game["game_id"])
+                        logger.warning(
+                            "No stats found for away team for game with id=%s", game["game_id"]
+                        )
                 df = _concat(dfs, key=['game'])
                 self.save(df, filepath)
             else:
@@ -865,8 +879,7 @@ def read_player_match_stats(
             df.rename(columns={"#": "jersey_number"})
             .replace({"team": get_team_replacements()})
             .pipe(standardize_colnames, cols=["Player", "Nation", "Pos", "Age", "Min"])
-            .set_index(["league", "season", "game", "team", "player"])
-            .sort_index()
+            .set_index(["id"])
         )
         return df

@@ -941,9 +954,18 @@ def read_lineup(
                     "//table[@id='" + "stats_{}_summary".format(teams[i]["id"]) + "']"
                 )
                 df_stats_table = _parse_table(html_stats_table)
-                df_stats_table = df_stats_table.droplevel(0, axis=1)[["Player", "#", "Pos", "Min"]]
-                df_stats_table.columns = ["player", "jersey_number", "position", "minutes_played"]
-                df_stats_table["jersey_number"] = df_stats_table["jersey_number"].astype("Int64")
+                df_stats_table = df_stats_table.droplevel(0, axis=1)[
+                    ["Player", "#", "Pos", "Min"]
+                ]
+                df_stats_table.columns = [
+                    "player",
+                    "jersey_number",
+                    "position",
+                    "minutes_played",
+                ]
+                df_stats_table["jersey_number"] = df_stats_table["jersey_number"].astype(
+                    "Int64"
+                )
                 df_table["jersey_number"] = df_table["jersey_number"].astype("Int64")
                 df_table = pd.merge(
                     df_table, df_stats_table, on=["player", "jersey_number"], how="left"
@@ -1013,7 +1035,9 @@ def read_events(
                 tree = html.parse(data)
                 teams = self._parse_teams(tree)
                 for team, tid in zip(teams, ["a", "b"]):
-                    html_events = tree.xpath(f"////*[@id='events_wrap']/div/div[@class='event {tid}']")
+                    html_events = tree.xpath(
+                        f"////*[@id='events_wrap']/div/div[@class='event {tid}']"
+                    )
                     for e in html_events:
                         minute = e.xpath("./div[1]")[0].text.replace("’", "").strip()
                         score = e.xpath("./div[1]/small/span")[0].text
@@ -1082,7 +1106,7 @@ def read_shot_events(
             pd.DataFrame.
         """
         urlmask = FBREF_API + "/en/matches/{}"
-        filemask = "match_{}.csv"
+        filemask = "shots_{}.csv"

         # Retrieve games for which a match report is available
         df_schedule = self.read_schedule(force_cache).reset_index()
@@ -1132,8 +1156,7 @@ def read_shot_events(
                 standardize_colnames,
                 cols=["Outcome", "Minute", "Distance", "Player", "Body Part", "Notes", "Event"],
             )
-            .set_index(["league", "season", "game"])
-            .sort_index()
+            .set_index(["id"])
             .dropna(how="all")
         )
         return df
@@ -1160,8 +1183,23 @@ def _parse_table(html_table: html.HtmlElement) -> pd.DataFrame:
     # remove thead rows in the table body
     for elem in html_table.xpath("//tbody/tr[contains(@class, 'thead')]"):
         elem.getparent().remove(elem)
+    # remove tfoot rows in the table body
+    for elem in html_table.xpath("//tfoot"):
+        elem.getparent().remove(elem)
+    # override ranking with a unique id for each player to use as index
+    ids = []
+    for elem in html_table.xpath(
+        "tbody/tr/td[@data-stat='player']/a"
+        " | tbody/tr/th[@data-stat='player']/a"
+        " | tbody/tr/td[@data-stat='team']/a"
+        " | tbody/tr/th[@data-stat='team']/a"
+    ):
+        id = elem.attrib["href"].split("/")[3]
+        ids.append(id)
     # parse HTML to dataframe
     (df_table,) = pd.read_html(html.tostring(html_table), flavor="lxml")
+    if len(ids) > 0:
+        df_table.insert(0, "id", ids)
     return df_table.convert_dtypes()

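For context on the new _parse_table logic: FBref embeds an eight-character hash in every player and team link, following the scheme /en/players/<hash>/<Player-Name> and /en/squads/<hash>/<Team-Name>, so element 3 of the split href is the unique ID in both cases. A minimal, self-contained sketch of the extraction; the markup and hashes below are illustrative examples of the URL scheme, not data from this commit:

from lxml import html

# Illustrative FBref-style table fragment; the hashes follow FBref's
# /en/players/<id>/... and /en/squads/<id>/... scheme and are examples only.
FRAGMENT = """
<table><tbody>
<tr><td data-stat="player"><a href="/en/players/e342ad68/Mohamed-Salah">Mohamed Salah</a></td></tr>
<tr><td data-stat="team"><a href="/en/squads/822bd0ba/Liverpool-Stats">Liverpool</a></td></tr>
</tbody></table>
"""

table = html.fromstring(FRAGMENT)
ids = [
    a.attrib["href"].split("/")[3]  # ['', 'en', 'players', '<id>', ...] -> index 3
    for a in table.xpath(
        "tbody/tr/td[@data-stat='player']/a | tbody/tr/td[@data-stat='team']/a"
    )
]
print(ids)  # ['e342ad68', '822bd0ba']

Because the hash sits at the same path position for players and squads, one loop covers both table types, which is why _parse_table unions the player and team xpath selectors above.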
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "fbrefdata"
-version = "0.1.0"
+version = "0.1.2"
 description = "A scraper of soccer data from FBref."
 authors = ["Lorenzo De Bernardini <lorenzo@envs.net>"]
 license = "Apache-2.0"

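With the new indexing, the frames returned by read_player_season_stats, read_team_season_stats, read_player_match_stats, and read_shot_events are keyed by FBref's stable hash instead of (league, season, team, ...) tuples. A hypothetical usage sketch, assuming fbrefdata keeps the soccerdata-style FBref interface that this module's code closely resembles; the class name, constructor arguments, and sample id are assumptions:

import fbrefdata as fd  # assumption: the package exposes a soccerdata-style FBref class

fbref = fd.FBref(leagues="ENG-Premier League", seasons="2223")
players = fbref.read_player_season_stats(stat_type="standard")

# Rows are keyed by the stable FBref hash, so the same player can be joined
# across seasons, teams, and stat tables without fuzzy name matching.
print(players.loc["e342ad68"])  # hypothetical id for a single player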