@@ -1,4 +1,5 @@
 """Scraper for http://fbref.com."""
+
 import warnings
 from datetime import date, datetime
 from functools import reduce
@@ -336,8 +337,7 @@ def read_team_season_stats(  # noqa: C901
             .rename(columns={"Squad": "team", "# Pl": "players_used"})
             .replace({"team": get_team_replacements()})
             # .pipe(standardize_colnames)
-            .set_index(["league", "season", "team"])
-            .sort_index()
+            .set_index(["id"])
         )
         return df
 
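The change above replaces the (league, season, team) MultiIndex with FBref's own entity id as the index. A minimal sketch with made-up rows, assuming ids shaped like FBref's 8-character team hashes:

import pandas as pd

# Made-up rows; the ids mimic FBref's 8-character team hashes.
df = pd.DataFrame(
    {
        "id": ["b8fd03ef", "822bd0ba"],
        "league": ["ENG-Premier League"] * 2,
        "season": ["2223"] * 2,
        "team": ["Manchester City", "Liverpool"],
        "players_used": [23, 27],
    }
)

# Old scheme: hierarchical lookup by (league, season, team).
old = df.set_index(["league", "season", "team"]).sort_index()
print(old.loc[("ENG-Premier League", "2223", "Liverpool")])

# New scheme: one stable key per row; league/season/team stay as columns.
new = df.set_index(["id"])
print(new.loc["822bd0ba", "team"])  # 'Liverpool'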
@@ -432,7 +432,8 @@ def read_team_match_stats(  # noqa: C901
 
         # collect match logs for each team
         stats = []
-        for (_, skey, team), team_url in iterator.url.items():
+        iterator.set_index(["season", "team"], append=True, inplace=True)
+        for (id, skey, team), team_url in iterator.url.items():
             # read html page
             filepath = self.data_dir / filemask.format(team, skey, stat_type)
             if len(team_url.split('/')) == 6:  # already have season in the url
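Appending season and team to the iterator's index is what lets the loop unpack three values per row: `Series.items()` on a MultiIndexed frame yields the full key tuple. A standalone sketch of that pattern with hypothetical URLs:

import pandas as pd

# Hypothetical iterator: one row per team/season with its stats URL,
# indexed by the FBref team id as in the patched code.
iterator = pd.DataFrame(
    {
        "id": ["b8fd03ef", "822bd0ba"],
        "season": ["2223", "2223"],
        "team": ["Manchester City", "Liverpool"],
        "url": ["/en/squads/b8fd03ef/2022-2023", "/en/squads/822bd0ba/2022-2023"],
    }
).set_index("id")

# append=True keeps "id" and stacks season/team on top of it.
iterator.set_index(["season", "team"], append=True, inplace=True)

# Each key is now an (id, season, team) tuple (id_ avoids shadowing the builtin).
for (id_, skey, team), team_url in iterator.url.items():
    print(id_, skey, team, team_url)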
@@ -473,13 +474,16 @@ def read_team_match_stats(  # noqa: C901
             df_table = _parse_table(html_table)
             df_table["season"] = skey
             df_table["team"] = team
+            df_table["id"] = id
             df_table["Time"] = [
                 x.get('csk', None) for x in html_table.xpath(".//td[@data-stat='start_time']")
             ]
             df_table["Match Report"] = [
-                mlink.xpath("./a/@href")[0]
-                if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
-                else None
+                (
+                    mlink.xpath("./a/@href")[0]
+                    if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
+                    else None
+                )
                 for mlink in html_table.xpath(".//td[@data-stat='match_report']")
             ]
             nb_levels = df_table.columns.nlevels
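The parenthesized conditional is just black's layout for the same link scrape; a self-contained sketch of the lxml pattern on a made-up fragment:

from lxml import html

# Made-up cells: one links a match report, one is still empty.
fragment = html.fromstring(
    "<table><tr>"
    "<td data-stat='match_report'><a href='/en/matches/abc123'>Match Report</a></td>"
    "<td data-stat='match_report'></td>"
    "</tr></table>"
)

reports = [
    (
        mlink.xpath("./a/@href")[0]
        if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
        else None
    )
    for mlink in fragment.xpath(".//td[@data-stat='match_report']")
]
print(reports)  # ['/en/matches/abc123', None]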
@@ -527,7 +531,7 @@ def read_team_match_stats(  # noqa: C901
             lambda x: x["team"] if x["venue"] == "Away" else x["opponent"], axis=1
         )
         df["game"] = df_tmp.apply(make_game_id, axis=1)
-        return df.set_index(["league", "season", "team", "game"]).sort_index().loc[self.leagues]
+        return df.set_index(["league"]).sort_index().loc[self.leagues]
 
     def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:  # noqa: C901
         """Retrieve players from the datasource for the selected leagues and seasons.
@@ -614,7 +618,7 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:
             for elem in tree.xpath("//td[@data-stat='comp_level']//span"):
                 elem.getparent().remove(elem)
             if big_five:
-                df_table = _parse_table(tree)
+                df_table = _parse_table(tree, player_table=True)
                 df_table[("Unnamed: league", "league")] = (
                     df_table.xs("Comp", axis=1, level=1).squeeze().map(BIG_FIVE_DICT)
                 )
@@ -647,8 +651,7 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:
             .rename(columns={"Squad": "team"})
             .replace({"team": get_team_replacements()})
             .pipe(standardize_colnames, cols=["Player", "Nation", "Pos", "Age", "Born"])
-            .set_index(["league", "season", "team", "player"])
-            .sort_index()
+            .set_index(["id"])
         )
 
         return df
@@ -679,19 +682,26 @@ def read_schedule(self, force_cache: bool = False) -> pd.DataFrame:
             if type(data) is not pd.DataFrame:
                 tree = html.parse(data)
 
-                url_fixtures = FBREF_API + tree.xpath("//a[text()='Scores & Fixtures']")[0].get("href")
+                url_fixtures = FBREF_API + tree.xpath("//a[text()='Scores & Fixtures']")[0].get(
+                    "href"
+                )
                 filepath_fixtures = self.data_dir / f"schedule_{lkey}_{skey}.csv"
                 current_season = not self._is_complete(lkey, skey)
                 data_fixtures = self.get(
-                    url_fixtures, filepath_fixtures, no_cache=current_season and not force_cache, header=[0, 1]
+                    url_fixtures,
+                    filepath_fixtures,
+                    no_cache=current_season and not force_cache,
+                    header=[0, 1],
                 )
                 tree = html.parse(data_fixtures)
                 html_table = tree.xpath("//table[contains(@id, 'sched')]")[0]
                 df_table = _parse_table(html_table)
                 df_table["Match Report"] = [
-                    mlink.xpath("./a/@href")[0]
-                    if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
-                    else None
+                    (
+                        mlink.xpath("./a/@href")[0]
+                        if mlink.xpath("./a") and mlink.xpath("./a")[0].text == "Match Report"
+                        else None
+                    )
                     for mlink in html_table.xpath(".//td[@data-stat='match_report']")
                 ]
                 df_table["league"] = lkey
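FBref schedule pages use a two-row header, which is why the request passes `header=[0, 1]` through to the table parser; a minimal sketch of how pandas folds that into a column MultiIndex:

from io import StringIO

import pandas as pd

# Toy table with a two-row header, like FBref's schedule pages.
doc = StringIO(
    "<table>"
    "<thead>"
    "<tr><th colspan='2'>Score</th></tr>"
    "<tr><th>Home</th><th>Away</th></tr>"
    "</thead>"
    "<tbody><tr><td>2</td><td>1</td></tr></tbody>"
    "</table>"
)

# header=[0, 1] folds both rows into a column MultiIndex.
(df,) = pd.read_html(doc, header=[0, 1], flavor="lxml")
print(df.columns.tolist())  # [('Score', 'Home'), ('Score', 'Away')]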
@@ -841,7 +851,9 @@ def read_player_match_stats(
                     df_table["game_id"] = game["game_id"]
                     dfs.append(df_table)
                 else:
-                    logger.warning("No stats found for home team for game with id=%s", game["game_id"])
+                    logger.warning(
+                        "No stats found for home team for game with id=%s", game["game_id"]
+                    )
                 html_table = tree.find("//table[@id='" + id_format.format(away_team["id"]) + "']")
                 if html_table is not None:
                     df_table = _parse_table(html_table)
@@ -852,7 +864,9 @@ def read_player_match_stats(
                     df_table["game_id"] = game["game_id"]
                     dfs.append(df_table)
                 else:
-                    logger.warning("No stats found for away team for game with id=%s", game["game_id"])
+                    logger.warning(
+                        "No stats found for away team for game with id=%s", game["game_id"]
+                    )
                 df = _concat(dfs, key=['game'])
                 self.save(df, filepath)
             else:
@@ -865,8 +879,7 @@ def read_player_match_stats(
             df.rename(columns={"#": "jersey_number"})
             .replace({"team": get_team_replacements()})
             .pipe(standardize_colnames, cols=["Player", "Nation", "Pos", "Age", "Min"])
-            .set_index(["league", "season", "game", "team", "player"])
-            .sort_index()
+            .set_index(["id"])
         )
         return df
 
@@ -941,9 +954,18 @@ def read_lineup(
                 "//table[@id='" + "stats_{}_summary".format(teams[i]["id"]) + "']"
             )
             df_stats_table = _parse_table(html_stats_table)
-            df_stats_table = df_stats_table.droplevel(0, axis=1)[["Player", "#", "Pos", "Min"]]
-            df_stats_table.columns = ["player", "jersey_number", "position", "minutes_played"]
-            df_stats_table["jersey_number"] = df_stats_table["jersey_number"].astype("Int64")
+            df_stats_table = df_stats_table.droplevel(0, axis=1)[
+                ["Player", "#", "Pos", "Min"]
+            ]
+            df_stats_table.columns = [
+                "player",
+                "jersey_number",
+                "position",
+                "minutes_played",
+            ]
+            df_stats_table["jersey_number"] = df_stats_table["jersey_number"].astype(
+                "Int64"
+            )
             df_table["jersey_number"] = df_table["jersey_number"].astype("Int64")
             df_table = pd.merge(
                 df_table, df_stats_table, on=["player", "jersey_number"], how="left"
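Casting both frames' jersey numbers to the nullable "Int64" dtype keeps missing values as <NA> instead of silently promoting the column to float, so the merge keys on both sides stay comparable. A small sketch with made-up players:

import pandas as pd

lineup = pd.DataFrame({"player": ["Haaland", "Ortega"], "jersey_number": [9, None]})
stats = pd.DataFrame(
    {"player": ["Haaland", "Ortega"], "jersey_number": [9, None], "minutes_played": [90, 0]}
)

# [9, None] parses as float64 (9.0 / NaN); "Int64" restores integers plus a real <NA>.
for df in (lineup, stats):
    df["jersey_number"] = df["jersey_number"].astype("Int64")

# With matching dtypes the left merge lines up rows even where the number is missing.
print(pd.merge(lineup, stats, on=["player", "jersey_number"], how="left"))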
@@ -1013,7 +1035,9 @@ def read_events(
             tree = html.parse(data)
             teams = self._parse_teams(tree)
             for team, tid in zip(teams, ["a", "b"]):
-                html_events = tree.xpath(f"////*[@id='events_wrap']/div/div[@class='event {tid}']")
+                html_events = tree.xpath(
+                    f"////*[@id='events_wrap']/div/div[@class='event {tid}']"
+                )
                 for e in html_events:
                     minute = e.xpath("./div[1]")[0].text.replace("’", "").strip()
                     score = e.xpath("./div[1]/small/span")[0].text
@@ -1082,7 +1106,7 @@ def read_shot_events(
             pd.DataFrame.
         """
         urlmask = FBREF_API + "/en/matches/{}"
-        filemask = "match_{}.csv"
+        filemask = "shots_{}.csv"
 
         # Retrieve games for which a match report is available
         df_schedule = self.read_schedule(force_cache).reset_index()
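The filemask rename fixes a cache collision: shot events were previously written to the same match_{}.csv files the match-report readers use, so one could clobber the other. Schematically, with a hypothetical game id:

game_id = "0841b2b9"  # hypothetical FBref match id

"match_{}.csv".format(game_id)  # before: shared with the match-report cache
"shots_{}.csv".format(game_id)  # after: dedicated cache file for shot events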
@@ -1132,8 +1156,7 @@ def read_shot_events(
                 standardize_colnames,
                 cols=["Outcome", "Minute", "Distance", "Player", "Body Part", "Notes", "Event"],
             )
-            .set_index(["league", "season", "game"])
-            .sort_index()
+            .set_index(["id"])
             .dropna(how="all")
         )
         return df
@@ -1160,8 +1183,23 @@ def _parse_table(html_table: html.HtmlElement) -> pd.DataFrame:
     # remove thead rows in the table body
     for elem in html_table.xpath("//tbody/tr[contains(@class, 'thead')]"):
         elem.getparent().remove(elem)
+    # remove tfoot rows in the table body
+    for elem in html_table.xpath("//tfoot"):
+        elem.getparent().remove(elem)
+    # override ranking with a unique id for each player to use as index
+    ids = []
+    for elem in html_table.xpath(
+        "tbody/tr/td[@data-stat='player']/a"
+        " | tbody/tr/th[@data-stat='player']/a"
+        " | tbody/tr/td[@data-stat='team']/a"
+        " | tbody/tr/th[@data-stat='team']/a"
+    ):
+        id = elem.attrib["href"].split("/")[3]
+        ids.append(id)
     # parse HTML to dataframe
     (df_table,) = pd.read_html(html.tostring(html_table), flavor="lxml")
+    if len(ids) > 0:
+        df_table.insert(0, "id", ids)
     return df_table.convert_dtypes()
 
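The id extraction relies on FBref's URL scheme: the fourth path segment of a player or team href is a stable entity hash (e.g. /en/players/e46012d4/Kevin-De-Bruyne). A self-contained sketch of the same technique on a made-up row:

from lxml import html
import pandas as pd

# Made-up row mimicking FBref markup: the player cell links the player's page.
table = html.fromstring(
    "<table><tbody><tr>"
    "<th data-stat='player'><a href='/en/players/e46012d4/Kevin-De-Bruyne'>"
    "Kevin De Bruyne</a></th>"
    "<td data-stat='goals'>7</td>"
    "</tr></tbody></table>"
)

# '/en/players/e46012d4/Kevin-De-Bruyne' -> segment 3 is the stable hash.
ids = [
    a.attrib["href"].split("/")[3]
    for a in table.xpath(
        "tbody/tr/td[@data-stat='player']/a | tbody/tr/th[@data-stat='player']/a"
    )
]

(df,) = pd.read_html(html.tostring(table), flavor="lxml")
if len(ids) > 0:
    df.insert(0, "id", ids)  # one id per parsed row, used as the new index
print(df)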