1
1
name = 'nfl_data_py'
2
2
3
- import datetime
4
3
import os
5
4
import logging
5
+ import datetime
6
+ from warnings import warn
7
+ from typing import Iterable
6
8
from concurrent .futures import ThreadPoolExecutor , as_completed
7
9
8
- import appdirs
9
10
import numpy
10
11
import pandas
11
- from typing import Iterable
12
+ import appdirs
13
+ from urllib .error import HTTPError
12
14
13
15
# module level doc string
14
16
__doc__ = """
@@ -142,20 +144,32 @@ def import_pbp_data(
142
144
raw = pandas .DataFrame (data )
143
145
raw ['season' ] = year
144
146
145
- if all ([include_participation , year >= 2016 , not cache ]):
147
+
148
+ if include_participation and not cache :
146
149
path = r'https://github.com/nflverse/nflverse-data/releases/download/pbp_participation/pbp_participation_{}.parquet' .format (year )
147
- partic = pandas .read_parquet (path )
148
- raw = raw .merge (partic , how = 'left' , on = ['play_id' ,'old_game_id' ])
150
+
151
+ try :
152
+ partic = pandas .read_parquet (path )
153
+ raw = raw .merge (
154
+ partic ,
155
+ how = 'left' ,
156
+ left_on = ['play_id' ,'game_id' ],
157
+ right_on = ['play_id' ,'nflverse_game_id' ]
158
+ )
159
+ except HTTPError :
160
+ pass
149
161
150
162
pbp_data .append (raw )
151
163
print (str (year ) + ' done.' )
152
164
153
- except Error as e :
165
+ except Exception as e :
154
166
print (e )
155
167
print ('Data not available for ' + str (year ))
156
168
157
- if pbp_data :
158
- plays = pandas .concat (pbp_data ).reset_index (drop = True )
169
+ if not pbp_data :
170
+ return pandas .DataFrame ()
171
+
172
+ plays = pandas .concat (pbp_data , ignore_index = True )
159
173
160
174
# converts float64 to float32, saves ~30% memory
161
175
if downcast :
@@ -183,12 +197,10 @@ def cache_pbp(years, downcast=True, alt_path=None):
183
197
if min (years ) < 1999 :
184
198
raise ValueError ('Data not available before 1999.' )
185
199
186
- plays = pandas .DataFrame ()
187
-
188
200
url1 = r'https://github.com/nflverse/nflverse-data/releases/download/pbp/play_by_play_'
189
201
url2 = r'.parquet'
190
202
appname = 'nfl_data_py'
191
- appauthor = 'cooper_dff '
203
+ appauthor = 'nflverse '
192
204
193
205
# define path for caching
194
206
if alt_path is not None :
@@ -230,7 +242,15 @@ def cache_pbp(years, downcast=True, alt_path=None):
230
242
231
243
print (str (year ) + ' done.' )
232
244
233
- except :
245
+ except Exception as e :
246
+ warn (
247
+ f"Caching failed for { year } , skipping.\n "
248
+ "In nfl_data_py 1.0, this will raise an exception.\n "
249
+ f"Failure: { e } " ,
250
+ DeprecationWarning ,
251
+ stacklevel = 2
252
+ )
253
+
234
254
next
235
255
236
256
@@ -432,7 +452,7 @@ def __import_rosters(release, years, columns=None):
432
452
rosters = pandas .concat ([
433
453
pandas .read_parquet (uri .format (y ))
434
454
for y in years
435
- ])
455
+ ], ignore_index = True )
436
456
437
457
# Post-import processing
438
458
rosters ['birth_date' ] = pandas .to_datetime (rosters .birth_date )
@@ -728,52 +748,32 @@ def import_ids(columns=None, ids=None):
728
748
"""Import mapping table of ids for most major data providers
729
749
730
750
Args:
731
- columns (List [str]): list of columns to return
732
- ids (List [str]): list of specific ids to return
751
+ columns (Iterable [str]): list of columns to return
752
+ ids (Iterable [str]): list of specific ids to return
733
753
734
754
Returns:
735
755
DataFrame
736
756
"""
737
-
738
- # create list of id options
739
- avail_ids = ['mfl_id' , 'sportradar_id' , 'fantasypros_id' , 'gsis_id' , 'pff_id' ,
740
- 'sleeper_id' , 'nfl_id' , 'espn_id' , 'yahoo_id' , 'fleaflicker_id' ,
741
- 'cbs_id' , 'rotowire_id' , 'rotoworld_id' , 'ktc_id' , 'pfr_id' ,
742
- 'cfbref_id' , 'stats_id' , 'stats_global_id' , 'fantasy_data_id' ]
743
- avail_sites = [x [:- 3 ] for x in avail_ids ]
744
-
745
- # check variable types
746
- if columns is None :
747
- columns = []
748
-
749
- if ids is None :
750
- ids = []
751
757
752
- if not isinstance (columns , list ):
753
- raise ValueError ('columns variable must be list.' )
754
-
755
- if not isinstance (ids , list ):
756
- raise ValueError ('ids variable must be list.' )
757
-
758
- # confirm id is in table
759
- if False in [x in avail_sites for x in ids ]:
760
- raise ValueError ('ids variable can only contain ' + ', ' .join (avail_sites ))
758
+ columns = columns or []
759
+ if not isinstance (columns , Iterable ):
760
+ raise ValueError ('columns argument must be a list.' )
761
+
762
+ ids = ids or []
763
+ if not isinstance (ids , Iterable ):
764
+ raise ValueError ('ids argument must be a list.' )
761
765
762
- # import data
763
- df = pandas .read_csv (r'https://raw.githubusercontent.com/dynastyprocess/data/master/files/db_playerids.csv' )
766
+ df = pandas .read_csv ("https://raw.githubusercontent.com/dynastyprocess/data/master/files/db_playerids.csv" )
764
767
765
- rem_cols = [x for x in df .columns if x not in avail_ids ]
766
- tgt_ids = [x + '_id' for x in ids ]
767
-
768
- # filter df to just specified columns
769
- if len (columns ) > 0 and len (ids ) > 0 :
770
- df = df [set (tgt_ids + columns )]
771
- elif len (columns ) > 0 and len (ids ) == 0 :
772
- df = df [set (avail_ids + columns )]
773
- elif len (columns ) == 0 and len (ids ) > 0 :
774
- df = df [set (tgt_ids + rem_cols )]
768
+ id_cols = [c for c in df .columns if c .endswith ('_id' )]
769
+ non_id_cols = [c for c in df .columns if not c .endswith ('_id' )]
775
770
776
- return df
771
+ # filter df to just specified ids + columns
772
+ ret_ids = [x + '_id' for x in ids ] or id_cols
773
+ ret_cols = columns or non_id_cols
774
+ ret_columns = list (set ([* ret_ids , * ret_cols ]))
775
+
776
+ return df [ret_columns ]
777
777
778
778
779
779
def import_contracts ():
@@ -916,8 +916,8 @@ def import_qbr(years=None, level='nfl', frequency='season'):
916
916
917
917
918
918
def __validate_pfr_inputs (s_type , years = None ):
919
- if s_type not in ('pass' , 'rec' , 'rush' ):
920
- raise ValueError ('s_type variable must be one of "pass", "rec", or "rush ".' )
919
+ if s_type not in ('pass' , 'rec' , 'rush' , 'def' ):
920
+ raise ValueError ('s_type variable must be one of "pass", "rec","rush", or "def ".' )
921
921
922
922
if years is None :
923
923
return []
@@ -939,7 +939,7 @@ def import_seasonal_pfr(s_type, years=None):
939
939
"""Import PFR advanced season-level statistics
940
940
941
941
Args:
942
- s_type (str): must be one of pass, rec, rush
942
+ s_type (str): must be one of pass, rec, rush, def
943
943
years (List[int]): years to return data for, optional
944
944
Returns:
945
945
DataFrame
@@ -957,7 +957,7 @@ def import_weekly_pfr(s_type, years=None):
957
957
"""Import PFR advanced week-level statistics
958
958
959
959
Args:
960
- s_type (str): must be one of pass, rec, rush
960
+ s_type (str): must be one of pass, rec, rush, def
961
961
years (List[int]): years to return data for, optional
962
962
Returns:
963
963
DataFrame
@@ -1139,33 +1139,18 @@ def clean_nfl_data(df):
1139
1139
'Louisiana State' : 'LSU'
1140
1140
}
1141
1141
1142
- pro_tm_repl = {
1143
- 'GNB' : 'GB' ,
1144
- 'KAN' : 'KC' ,
1145
- 'LA' : 'LAR' ,
1146
- 'LVR' : 'LV' ,
1147
- 'NWE' : 'NE' ,
1148
- 'NOR' : 'NO' ,
1149
- 'SDG' : 'SD' ,
1150
- 'SFO' : 'SF' ,
1151
- 'TAM' : 'TB'
1152
- }
1153
-
1154
1142
na_replace = {
1155
1143
'NA' :numpy .nan
1156
1144
}
1157
1145
1158
1146
for col in df .columns :
1159
- df .replace ({col :na_replace }, inplace = True )
1147
+ if df [col ].dtype == 'object' :
1148
+ df .replace ({col :na_replace }, inplace = True )
1160
1149
1161
1150
if 'name' in df .columns :
1162
1151
df .replace ({'name' : name_repl }, inplace = True )
1163
1152
1164
1153
if 'col_team' in df .columns :
1165
1154
df .replace ({'col_team' : col_tm_repl }, inplace = True )
1166
1155
1167
- if 'name' in df .columns :
1168
- for z in player_col_tm_repl :
1169
- df [df ['name' ] == z [0 ]] = df [df ['name' ] == z [0 ]].replace ({z [1 ]: z [2 ]})
1170
-
1171
1156
return df
0 commit comments