#%%
### IMPORTING LIBRARIES TO USE, READING IN DATA ###
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from collections import Counter
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
whiffData = pd.read_csv('PitcherXData.csv')
#%%
### INITIAL DATA EXPLORATION AND CLEANING ###
## Identify categorical variables and numeric variables
numericCols = whiffData.select_dtypes(include=np.number).columns
categoricalCols = list(set(whiffData.columns) - set(numericCols))
print(numericCols)
print(categoricalCols)
## Detect Null values
whiffNulls = whiffData[whiffData.isnull().any(axis=1)]
## Confirm that Null values do not have significant skew before dropping null rows
## We want to avoid a situation where the values in the null rows significantly change the overall data
for col in numericCols:
    originalMean = whiffData[col].mean()
    nullMean = whiffNulls[col].mean()
    print(col, ': ', originalMean, ' ', nullMean)
for col in categoricalCols:
    originalSet = set(whiffData[col])
    nullSet = set(whiffNulls[col])
    print(col)
    print(originalSet)
    print(nullSet)
## Worth looking into InducedVertBreak and HorzBreak at first glance
## InducedVertBreak has a standard deviation of around 6.5 - the null rows' mean InducedVertBreak is within 1 std of the overall mean
print(whiffData['InducedVertBreak'].std())
## The null rows' mean HorzBreak is just over one standard deviation from the overall mean - close enough that dropping is acceptable, no meaningful skew
print(whiffData['HorzBreak'].std())
## None of the numeric columns show skew when comparing the null rows to the non-null rows (main set)
whiffData = whiffData.dropna()
#%%
### OUTLIER DETECTION AND HANDLING ###
## For numeric variables, do outlier detection
whiffNumeric = whiffData[numericCols]
for col in numericCols:
    outliers = whiffNumeric[~(np.abs(whiffNumeric[col] - whiffNumeric[col].mean()) < (3 * whiffNumeric[col].std()))]
    if len(outliers) > 0:
        print(col, len(outliers))
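## Equivalent 3-sigma check via scipy.stats.zscore (a minimal sketch using the
## scipy import from the top of the file; note zscore defaults to ddof=0 while
## pandas' .std() defaults to ddof=1, so counts can differ slightly at the edges)
zScores = pd.DataFrame(np.abs(stats.zscore(whiffNumeric)), columns=whiffNumeric.columns)
zOutlierCounts = (zScores >= 3).sum()
print(zOutlierCounts[zOutlierCounts > 0])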
## After a cursory analysis of the columns with outliers (looking at the range and standard deviation), only SpinRate is of note
## Problem: several zeros in the data (a zero spin rate is physically very unlikely), around 50 rows
## Options: impute data OR get rid of 50 more rows. To decide, we conduct another skew analysis
whiffZeros = whiffData[whiffData.SpinRate == 0]
for col in numericCols:
    originalMean = whiffData[col].mean()
    zeroMean = whiffZeros[col].mean()
    print(col, ': ', originalMean, ' ', zeroMean)
for col in categoricalCols:
    originalSet = set(whiffData[col])
    zeroSet = set(whiffZeros[col])
    print(col)
    print(originalSet)
    print(zeroSet)
## The dataset where the Spin Rate is zero is almost identical on average to the main data set, so no skew detected
## As a result, we simply remove the 50 rows where the Spin Rate is 0 (likely a data quality issue)
whiffData = whiffData[whiffData.SpinRate != 0]
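## For reference, a minimal sketch of the imputation route we decided against:
## the zero spin rates would have been replaced with the median of the non-zero
## values (done on a throwaway copy; not used downstream)
imputedSpin = whiffData['SpinRate'].median()  # zeros already filtered out above
whiffImputed = whiffZeros.copy()
whiffImputed['SpinRate'] = imputedSpin
print('median SpinRate for the hypothetical imputation:', imputedSpin)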
# %%
### CORRELATION ANALYSIS ###
## Here we build a correlation matrix
corrMatrix = whiffData.corr(numeric_only=True).abs()
high_corrs = []
for idx, row in corrMatrix.iterrows():
    for col in corrMatrix.columns:
        if 0.75 < row[col] < 1:
            high_corrs.append(col)
print(Counter(high_corrs))
## Row of column-wise averages, appended after the loop so it is not scanned as a variable
corrMatrix.loc['average'] = corrMatrix.mean()
## Pitch of Plate Appearance is correlated with Balls and Strikes; this is something we need to address
## Release Speed is correlated (as expected) with Induced Vertical Break (these are considered distinct, so we keep both, despite the correlation)
## Pitch of Plate Appearance, Balls, and Strikes actually combine to say the same thing: Count
## So, instead of having 3 variables here, we construct a simple 'Count' categorical value
whiffData['Count'] = whiffData.Balls.astype(str) + whiffData.Strikes.astype(str)
categoricalCols.append('Count')
## Making a full copy before slimming down whiffData (there was only one value in Pitcher, so that column gets dropped)
## We also drop PitcherThrows given that only 3 records total have the pitcher throwing right handed
whiffDataFull = whiffData.copy()
whiffData = whiffData.drop(['Balls', 'Strikes', 'PitchofPA', 'Pitcher', 'PitcherThrows'], axis=1)
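## Quick sanity check that the new Count variable looks like ball-strike counts
print(whiffData['Count'].value_counts().sort_index())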
# %%
### FEATURE ENGINEERING ###
## For this analysis, we don't use Year or Date: since we expect some change in the pitcher's behavior over time, it does not make sense to extrapolate trends across dates (changes in behavior can change the nature of those trends)
whiffRegressionData = whiffData.drop(['Date', 'Year'], axis = 1)
## Here we create dummy variables for our categorical variables
## We choose dummies over other techniques, such as target encoding, because we don't have many categorical variables
whiffRegressionData = pd.get_dummies(whiffRegressionData)
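## For reference, a minimal sketch of the target-encoding alternative mentioned
## above: each categorical level is replaced by the mean whiff_prob observed for
## that level (built on a throwaway frame; not used downstream)
targetEncoded = whiffData[['PitchType', 'BatterSide', 'Count']].copy()
for col in targetEncoded.columns:
    levelMeans = whiffData.groupby(col)['whiff_prob'].mean()
    targetEncoded[col] = whiffData[col].map(levelMeans)
print(targetEncoded.head())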
#%%
### TARGET DISTRIBUTION ANALYSIS ###
whiffRegressionData['whiff_prob'].hist(bins = 10)
val = 0
for i in range(0, 10):
    print(val)
    print(len(whiffRegressionData[(whiffRegressionData.whiff_prob >= val) & (whiffRegressionData.whiff_prob < val + .05)]))
    val += .05
    print(val)
    print('---------------')
## Bucket whiff_prob into eight 0.05-wide categories; anything above 0.35 lands in the top bucket
whiffRegressionData['whiff_prob_category'] = pd.cut(
    whiffRegressionData['whiff_prob'],
    bins=[0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 1],
    labels=range(1, 9), include_lowest=True).astype(int)
groupedMeans = whiffRegressionData.groupby('whiff_prob_category').mean()
groupedMeans['category'] = groupedMeans.index
# %%
groupedMeans[['Inning', 'PAofInning', 'ReleaseSpeed', 'InducedVertBreak', 'HorzBreak',
'ReleaseHeight', 'ReleaseSide', 'Extension', 'PlateHeight', 'PlateSide',
'SpinRate', 'SpinAxis', 'swing_prob',
'BatterSide_Left',
'BatterSide_Right', 'PitchType_CHANGEUP', 'PitchType_FASTBALL',
'PitchType_SLIDER']]
sigVars = ['ReleaseSpeed', 'InducedVertBreak', 'HorzBreak', 'PlateHeight', 'SpinRate', 'SpinAxis', 'swing_prob']
## PLAN
## use work from the target distribution analysis to figure out which variables to improve
## train an actually good model, predict whiff prob using status quo data, and then show that with improved variables the prediction yields a higher whiff prob
## create a what-if scenario: if spin rate improves by X --> whiff prob improves by Y%
## OR if spin rate improves by X and horz break improves by Y --> whiff prob improves by Z%
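## A minimal sketch of the planned what-if scenario (assumptions: a scikit-learn
## GradientBoostingRegressor stands in for the "actually good model", and a
## +100 rpm SpinRate bump is the illustrative improvement)
from sklearn.ensemble import GradientBoostingRegressor
whatIfFeatures = whiffRegressionData.drop(['whiff_prob', 'whiff_prob_category'], axis=1)
whatIfModel = GradientBoostingRegressor(random_state=0)
whatIfModel.fit(whatIfFeatures, whiffRegressionData['whiff_prob'])
statusQuo = whatIfModel.predict(whatIfFeatures).mean()
improved = whatIfFeatures.copy()
improved['SpinRate'] = improved['SpinRate'] + 100  # hypothetical +100 rpm improvement
print('status quo mean predicted whiff prob:', statusQuo)
print('with +100 rpm of spin:', whatIfModel.predict(improved).mean())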
# %%
print(whiffRegressionData['PlateHeight'].std())
for threshold in [2.5, 2.4, 2.3, 2.2, 2.1, 2, 1.9, 1.8, 1.7, 1.6, 1.5, 1.4, 1.3, 1.2, 1.1, 1, .9, .8, .7, .6, .5]:
    avg = whiffRegressionData[abs(whiffRegressionData.PlateHeight - threshold) < .2]['whiff_prob'].mean()
    print(threshold, avg)
# %%
print(whiffRegressionData['PlateSide'].std())
whiffRegressionData['PlateHeight'].hist(bins = 20)
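## Visual companion to the PlateHeight sweep above (a sketch; uses the seaborn
## import from the top of the file, with a lowess smoother via statsmodels)
sns.regplot(x='PlateHeight', y='whiff_prob', data=whiffRegressionData,
            scatter_kws={'alpha': 0.1}, lowess=True)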
# %%