@@ -1,14 +1,16 @@
 name = 'nfl_data_py'
 
-import datetime
 import os
 import logging
+import datetime
+from warnings import warn
+from typing import Iterable
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-import appdirs
 import numpy
 import pandas
-from typing import Iterable
+import appdirs
+from urllib.error import HTTPError
 
 # module level doc string
 __doc__ = """
@@ -142,20 +144,32 @@ def import_pbp_data(
             raw = pandas.DataFrame(data)
             raw['season'] = year
 
-            if all([include_participation, year >= 2016, not cache]):
+
+            if include_participation and not cache:
                 path = r'https://github.com/nflverse/nflverse-data/releases/download/pbp_participation/pbp_participation_{}.parquet'.format(year)
-                partic = pandas.read_parquet(path)
-                raw = raw.merge(partic, how='left', on=['play_id','old_game_id'])
+
+                try:
+                    partic = pandas.read_parquet(path)
+                    raw = raw.merge(
+                        partic,
+                        how='left',
+                        left_on=['play_id','game_id'],
+                        right_on=['play_id','nflverse_game_id']
+                    )
+                except HTTPError:
+                    pass
 
             pbp_data.append(raw)
             print(str(year) + ' done.')
 
-        except Error as e:
+        except Exception as e:
             print(e)
             print('Data not available for ' + str(year))
 
-    if pbp_data:
-        plays = pandas.concat(pbp_data).reset_index(drop=True)
+    if not pbp_data:
+        return pandas.DataFrame()
+
+    plays = pandas.concat(pbp_data, ignore_index=True)
 
     # converts float64 to float32, saves ~30% memory
     if downcast:
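With the hunk above, a missing participation file no longer aborts a season load, and an all-failure run returns an empty frame instead of leaving `plays` unbound. A minimal usage sketch (the season list is illustrative):

```python
import nfl_data_py as nfl

# Participation columns are merged in when the per-season parquet file
# exists; if it is missing upstream, the HTTPError is swallowed and the
# plain play-by-play columns come back on their own.
pbp = nfl.import_pbp_data([2022], include_participation=True)

# With the new guard, a run where every season fails now yields an
# empty DataFrame rather than an unbound `plays` variable.
if pbp.empty:
    print('No play-by-play data was loaded.')
```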
@@ -183,12 +197,10 @@ def cache_pbp(years, downcast=True, alt_path=None):
     if min(years) < 1999:
         raise ValueError('Data not available before 1999.')
 
-    plays = pandas.DataFrame()
-
     url1 = r'https://github.com/nflverse/nflverse-data/releases/download/pbp/play_by_play_'
     url2 = r'.parquet'
     appname = 'nfl_data_py'
-    appauthor = 'cooper_dff'
+    appauthor = 'nflverse'
 
     # define path for caching
     if alt_path is not None:
@@ -230,7 +242,15 @@ def cache_pbp(years, downcast=True, alt_path=None):
 
             print(str(year) + ' done.')
 
-        except:
+        except Exception as e:
+            warn(
+                f"Caching failed for {year}, skipping.\n"
+                "In nfl_data_py 1.0, this will raise an exception.\n"
+                f"Failure: {e}",
+                DeprecationWarning,
+                stacklevel=2
+            )
+
             next
 
 
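Since the bare `except:` now emits a `DeprecationWarning` before skipping a failed year, callers can opt into the stricter 1.0 behavior early by promoting that warning to an error with the standard library's warnings filters:

```python
import warnings

import nfl_data_py as nfl

# Promote the deprecation warning emitted on a failed cache year to an
# exception, matching the behavior planned for nfl_data_py 1.0.
with warnings.catch_warnings():
    warnings.simplefilter('error', DeprecationWarning)
    nfl.cache_pbp([2020, 2021])
```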
@@ -432,7 +452,7 @@ def __import_rosters(release, years, columns=None):
     rosters = pandas.concat([
         pandas.read_parquet(uri.format(y))
         for y in years
-    ])
+    ], ignore_index=True)
 
     # Post-import processing
     rosters['birth_date'] = pandas.to_datetime(rosters.birth_date)
@@ -728,52 +748,32 @@ def import_ids(columns=None, ids=None):
     """Import mapping table of ids for most major data providers
 
     Args:
-        columns (List[str]): list of columns to return
-        ids (List[str]): list of specific ids to return
+        columns (Iterable[str]): list of columns to return
+        ids (Iterable[str]): list of specific ids to return
 
     Returns:
         DataFrame
     """
-
-    # create list of id options
-    avail_ids = ['mfl_id', 'sportradar_id', 'fantasypros_id', 'gsis_id', 'pff_id',
-                 'sleeper_id', 'nfl_id', 'espn_id', 'yahoo_id', 'fleaflicker_id',
-                 'cbs_id', 'rotowire_id', 'rotoworld_id', 'ktc_id', 'pfr_id',
-                 'cfbref_id', 'stats_id', 'stats_global_id', 'fantasy_data_id']
-    avail_sites = [x[:-3] for x in avail_ids]
-
-    # check variable types
-    if columns is None:
-        columns = []
-
-    if ids is None:
-        ids = []
 
-    if not isinstance(columns, list):
-        raise ValueError('columns variable must be list.')
-
-    if not isinstance(ids, list):
-        raise ValueError('ids variable must be list.')
-
-    # confirm id is in table
-    if False in [x in avail_sites for x in ids]:
-        raise ValueError('ids variable can only contain ' + ', '.join(avail_sites))
+    columns = columns or []
+    if not isinstance(columns, Iterable):
+        raise ValueError('columns argument must be a list.')
+
+    ids = ids or []
+    if not isinstance(ids, Iterable):
+        raise ValueError('ids argument must be a list.')
 
-    # import data
-    df = pandas.read_csv(r'https://raw.githubusercontent.com/dynastyprocess/data/master/files/db_playerids.csv')
+    df = pandas.read_csv("https://raw.githubusercontent.com/dynastyprocess/data/master/files/db_playerids.csv")
 
-    rem_cols = [x for x in df.columns if x not in avail_ids]
-    tgt_ids = [x + '_id' for x in ids]
-
-    # filter df to just specified columns
-    if len(columns) > 0 and len(ids) > 0:
-        df = df[set(tgt_ids + columns)]
-    elif len(columns) > 0 and len(ids) == 0:
-        df = df[set(avail_ids + columns)]
-    elif len(columns) == 0 and len(ids) > 0:
-        df = df[set(tgt_ids + rem_cols)]
+    id_cols = [c for c in df.columns if c.endswith('_id')]
+    non_id_cols = [c for c in df.columns if not c.endswith('_id')]
 
-    return df
+    # filter df to just specified ids + columns
+    ret_ids = [x + '_id' for x in ids] or id_cols
+    ret_cols = columns or non_id_cols
+    ret_columns = list(set([*ret_ids, *ret_cols]))
+
+    return df[ret_columns]
 
 
 def import_contracts():
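Under the rewritten filter, `ids` entries are provider prefixes (`'gsis'`, `'espn'`, ...; the `_id` suffix is appended internally) and omitting `columns` keeps every non-id column. A usage sketch; `name` and `position` are assumed to be column names in the upstream dynastyprocess table:

```python
import nfl_data_py as nfl

# Full mapping table: every *_id column plus all descriptive columns.
all_ids = nfl.import_ids()

# Only the gsis/espn crosswalk plus the requested descriptive columns.
# Note the ids are passed without the '_id' suffix.
xwalk = nfl.import_ids(columns=['name', 'position'], ids=['gsis', 'espn'])
```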
@@ -1139,33 +1139,18 @@ def clean_nfl_data(df):
         'Louisiana State': 'LSU'
     }
 
-    pro_tm_repl = {
-        'GNB': 'GB',
-        'KAN': 'KC',
-        'LA': 'LAR',
-        'LVR': 'LV',
-        'NWE': 'NE',
-        'NOR': 'NO',
-        'SDG': 'SD',
-        'SFO': 'SF',
-        'TAM': 'TB'
-    }
-
     na_replace = {
         'NA':numpy.nan
     }
 
     for col in df.columns:
-        df.replace({col:na_replace}, inplace=True)
+        if df[col].dtype == 'object':
+            df.replace({col:na_replace}, inplace=True)
 
     if 'name' in df.columns:
         df.replace({'name': name_repl}, inplace=True)
 
     if 'col_team' in df.columns:
         df.replace({'col_team': col_tm_repl}, inplace=True)
 
-    if 'name' in df.columns:
-        for z in player_col_tm_repl:
-            df[df['name'] == z[0]] = df[df['name'] == z[0]].replace({z[1]: z[2]})
-
     return df
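Guarding the NA replacement on dtype skips numeric columns, where the literal string 'NA' cannot survive parsing anyway. A self-contained sketch of the guarded replace on toy data:

```python
import numpy
import pandas

df = pandas.DataFrame({
    'name': ['Josh Allen', 'NA'],  # object dtype: may hold 'NA' strings
    'yards': [4283, 0],            # int64: skipped by the dtype guard
})

# Same pattern as clean_nfl_data: only object columns are scanned.
for col in df.columns:
    if df[col].dtype == 'object':
        df.replace({col: {'NA': numpy.nan}}, inplace=True)

print(df)  # the 'NA' string in the name column is now NaN
```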