Table of Contents
- Baseball
This package fetches and parses event data for Major League Baseball games. Game objects generated via the `_from_url` methods pull data from MLB endpoints, where events are published within about 30 seconds of occurring. The XML/JSON source data zip file contains event data from MLB games from 1974 through 2020.
pip3 install baseball
git clone git@github.com:benjamincrom/baseball.git
cd baseball/
python3 setup.py install
- get_game_from_url(date_str, away_code, home_code, game_number)
Fetch an object which contains metadata and events for a single MLB game.
import baseball
# Arguments are (date_str, away_code, home_code, game_number); the call
# returns a (game_id, game) pair for that single MLB game.
game_id, game = baseball.get_game_from_url('2017-11-1', 'HOU', 'LAD', 1)
# Export the same game through its _asdict() and json() methods
# (presumably a plain dict and a JSON string, per the variable names).
game_dict = game._asdict()
game_json_str = game.json()
Write scorecard as SVG image:
# Save the scorecard markup returned by get_svg_str() to "<game_id>.svg"
# in the current working directory.
svg_path = game_id + '.svg'
with open(svg_path, 'w') as svg_file:
    svg_file.write(game.get_svg_str())
- away_batter_box_score_dict
- away_pitcher_box_score_dict
- away_team (Team)
- away_team_stats
- start_datetime
- expected_start_datetime
- game_date_str
- home_batter_box_score_dict
- home_pitcher_box_score_dict
- home_team (Team)
- home_team_stats
- inning_list (Inning list)
- end_datetime
- location
- attendance
- weather
- temp
- timezone_str
- is_postponed
- is_suspended
- is_doubleheader
- is_today
- get_svg_str()
- json()
- _asdict()
- abbreviation
- batting_order_list_list (list of nine PlayerAppearance lists)
- name
- pitcher_list (PlayerAppearance list)
- player_id_dict
- player_last_name_dict
- player_name_dict
- _asdict()
- bottom_half_appearance_list (PlateAppearance list)
- bottom_half_inning_stats
- top_half_appearance_list (PlateAppearance list)
- top_half_inning_stats
- _asdict()
- start_datetime
- end_datetime
- batter (Player)
- batting_team (Team)
- error_str
- event_list (list of Pitch, Pickoff, RunnerAdvance, Substitution, Switch objects)
- got_on_base
- hit_location
- inning_outs
- out_runners_list (Player list)
- pitcher (Player)
- plate_appearance_description
- plate_appearance_summary
- runners_batted_in_list (Player list)
- scorecard_summary
- scoring_runners_list (Player list)
- _asdict()
- era
- first_name
- last_name
- mlb_id
- number
- obp
- slg
- _asdict()
- start_inning_batter_num
- start_inning_half
- start_inning_num
- end_inning_batter_num
- end_inning_half
- end_inning_num
- pitcher_credit_code
- player_obj (Player)
- position
- _asdict()
- pitch_datetime
- pitch_description
- pitch_position
- pitch_speed
- pitch_type
- _asdict()
- pickoff_description
- pickoff_base
- pickoff_was_successful
- _asdict()
- runner_advance_datetime
- run_description
- runner (Player)
- start_base
- end_base
- runner_scored
- run_earned
- is_rbi
- _asdict()
- substitution_datetime
- incoming_player (Player)
- outgoing_player (Player)
- batting_order
- position
- _asdict()
- switch_datetime
- player (Player)
- old_position_num
- new_position_num
- new_batting_order
- _asdict()
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import baseball
%matplotlib inline
# Fetch the HOU (away) at LAD (home) game dated 11-1-2017, game number 1.
game_id, game = baseball.get_game_from_url('11-1-2017', 'HOU', 'LAD', 1)

# Collect one tuple per Pitch event from the top half of every inning.
# Other event types in event_list are skipped.
pitch_tuple_list = []
for inning in game.inning_list:
    for appearance in inning.top_half_appearance_list:
        for event in appearance.event_list:
            if not isinstance(event, baseball.Pitch):
                continue
            pitch_tuple_list.append((
                str(appearance.pitcher),
                event.pitch_description,
                event.pitch_position,
                event.pitch_speed,
                event.pitch_type,
            ))

# Column labels follow the field order of the tuples built above.
pitch_columns = [
    'Pitcher',
    'Pitch Description',
    'Pitch Coordinate',
    'Pitch Speed',
    'Pitch Type',
]
data = pd.DataFrame(data=pitch_tuple_list, columns=pitch_columns)
data.head()
 | Pitcher | Pitch Description | Pitch Coordinate | Pitch Speed | Pitch Type |
---|---|---|---|---|---|
0 | 21 Yu Darvish | Ball | (155.47, 160.83) | 96.0 | FF |
1 | 21 Yu Darvish | Called Strike | (107.0, 171.09) | 83.9 | FC |
2 | 21 Yu Darvish | In play, no out | (115.36, 183.1) | 83.9 | SL |
3 | 21 Yu Darvish | In play, run(s) | (80.06, 168.03) | 96.6 | FF |
4 | 21 Yu Darvish | Ball | (54.1, 216.52) | 84.6 | SL |
# Bar chart of the number of recorded pitches per pitcher.
pitches_per_pitcher = data['Pitcher'].value_counts()
pitches_per_pitcher.plot.bar()
# Draw one pitch-location scatter plot per pitcher, colored by pitch outcome.
for pitcher in data['Pitcher'].unique():
    plt.ylim(0, 125)
    plt.xlim(0, 250)

    # (description, position) pairs for every pitch recorded for this pitcher.
    thrown = [(rec[1], rec[2]) for rec in pitch_tuple_list if rec[0] == pitcher]

    # Outcomes are matched by substring, so any description containing
    # 'Ball' lands in the first group.
    ball_spots = [spot for desc, spot in thrown if 'Ball' in desc]
    called_spots = [spot for desc, spot in thrown if 'Called Strike' in desc]
    other_spots = [spot for desc, spot in thrown
                   if not ('Ball' in desc or 'Called Strike' in desc)]

    # Both coordinates are mirrored about 250 before plotting.
    ball_x = [250 - spot[0] for spot in ball_spots]
    ball_y = [250 - spot[1] for spot in ball_spots]
    called_x = [250 - spot[0] for spot in called_spots]
    called_y = [250 - spot[1] for spot in called_spots]
    other_x = [250 - spot[0] for spot in other_spots]
    other_y = [250 - spot[1] for spot in other_spots]

    ball_points = plt.scatter(ball_x, ball_y, c='b')
    called_points = plt.scatter(called_x, called_y, c='r')
    other_points = plt.scatter(other_x, other_y, c='g')

    plt.legend(
        (ball_points, called_points, other_points),
        ('Ball', 'Called Strike', 'Other'),
        scatterpoints=1,
        loc='upper right',
        ncol=1,
        fontsize=8,
    )
    plt.title(pitcher)
    plt.show()
# Pie chart of pitch outcomes; equal axis scaling keeps the pie circular.
plt.axis('equal')
description_counts = data['Pitch Description'].value_counts()
description_counts.plot(
    kind='pie',
    radius=1.5,
    autopct='%1.0f%%',
    pctdistance=1.1,
    labeldistance=1.2,
)

# Kernel density estimate over the frame's plottable columns.
data.plot.kde()
# Overlay one pitch-speed density curve per pitcher on a shared Axes.
fig, ax = plt.subplots()
ax.set_xlim(50, 120)
for pitcher in data['Pitcher'].unique():
    pitcher_speeds = data.loc[data['Pitcher'] == pitcher, 'Pitch Speed']
    pitcher_speeds.plot.kde(ax=ax, label=pitcher)
ax.legend()
# Overlay one pitch-speed density curve per pitch type on a shared Axes.
fig, ax = plt.subplots()
ax.set_xlim(50, 120)
for pitch_type in data['Pitch Type'].unique():
    type_speeds = data.loc[data['Pitch Type'] == pitch_type, 'Pitch Speed']
    type_speeds.plot.kde(ax=ax, label=pitch_type)
ax.legend()
# Grouped bar chart: one bar group per pitcher, one bar per pitch description.
fig, ax = plt.subplots(figsize=(15, 7))
outcome_table = data.groupby(['Pitcher', 'Pitch Description']).size().unstack()
outcome_table.plot.bar(ax=ax)
# Load (game_id, game) pairs for the 2017 date range from files on disk.
# NOTE(review): the path below is machine-specific; point it at your own copy.
game_list_2017 = baseball.get_game_list_from_file_range(
    '1-1-2017',
    '12-31-2017',
    '/Users/benjamincrom/repos/livebaseballscorecards-artifacts/baseball_files',
)

# Collect every Pitch event from plate appearances whose pitcher string
# contains 'Dickey', restricted to games involving the Atlanta Braves.
pitch_tuple_list_2 = []
for game_id, game in game_list_2017:
    if (game.home_team.name != 'Atlanta Braves' and
            game.away_team.name != 'Atlanta Braves'):
        continue
    for inning in game.inning_list:
        # The bottom-half list may be falsy, so substitute an empty list
        # before concatenating.
        appearance_list = (inning.top_half_appearance_list +
                           (inning.bottom_half_appearance_list or []))
        for appearance in appearance_list:
            if 'Dickey' not in str(appearance.pitcher):
                continue
            for event in appearance.event_list:
                if not isinstance(event, baseball.Pitch):
                    continue
                pitch_tuple_list_2.append((
                    str(appearance.pitcher),
                    event.pitch_description,
                    event.pitch_position,
                    event.pitch_speed,
                    event.pitch_type,
                ))

dickey_columns = [
    'Pitcher',
    'Pitch Description',
    'Pitch Coordinate',
    'Pitch Speed',
    'Pitch Type',
]
df = pd.DataFrame(data=pitch_tuple_list_2, columns=dickey_columns)
# Bar chart of how often each pitch type appears in df.
pitch_type_counts = df['Pitch Type'].value_counts()
pitch_type_counts.plot.bar()
# Pie chart of pitch outcomes in df; equal axis scaling keeps the pie circular.
plt.axis('equal')
dickey_description_counts = df['Pitch Description'].value_counts()
dickey_description_counts.plot(
    kind='pie',
    radius=2,
    autopct='%1.0f%%',
    pctdistance=1.1,
    labeldistance=1.2,
)
# Blank out the y-axis label.
plt.ylabel('')
plt.show()
# Remove rows with any missing field before estimating the density.
df.dropna(inplace=True)

# DataFrame.plot.kde() draws on a fresh Axes and returns it. Capture that
# Axes so the x-limits and legend apply to this plot, not to whatever `ax`
# was left over from an earlier figure.
ax = df.plot.kde()
ax.set_xlim(50, 100)
ax.legend()
# Overlay one pitch-speed density curve per pitch type, leaving out 'PO'.
fig, ax = plt.subplots()
ax.set_xlim(50, 100)
for pitch_type in df['Pitch Type'].unique():
    if pitch_type == 'PO':
        continue
    type_speeds = df.loc[df['Pitch Type'] == pitch_type, 'Pitch Speed']
    type_speeds.plot.kde(ax=ax, label=pitch_type)
ax.legend()
import datetime
import dateutil.parser
import pytz
def _appearance_row(appearance):
    """Flatten one plate appearance into the 11-field tuple stored in df3."""
    return (
        str(appearance.pitcher),
        str(appearance.batter),
        len(appearance.out_runners_list),
        len(appearance.scoring_runners_list),
        len(appearance.runners_batted_in_list),
        appearance.scorecard_summary,
        appearance.got_on_base,
        appearance.plate_appearance_summary,
        appearance.plate_appearance_description,
        appearance.error_str,
        appearance.inning_outs,
    )


# Only games dated after this moment are included.
cutoff_datetime = datetime.datetime(2017, 3, 31)

# Gather plate appearances from Atlanta Braves games: the top halves when
# Atlanta is the home team and the bottom halves when Atlanta is the away
# team (presumably the halves in which Atlanta is pitching).
pitch_tuple_list_3 = []
for game_id, game in game_list_2017:
    # The date is parsed only after the team name matches, one check per side.
    if (game.home_team.name == 'Atlanta Braves' and
            dateutil.parser.parse(game.game_date_str) > cutoff_datetime):
        for inning in game.inning_list:
            pitch_tuple_list_3.extend(
                _appearance_row(appearance)
                for appearance in inning.top_half_appearance_list
            )

    if (game.away_team.name == 'Atlanta Braves' and
            dateutil.parser.parse(game.game_date_str) > cutoff_datetime):
        for inning in game.inning_list:
            # The bottom-half list may be falsy; treat that as no appearances.
            pitch_tuple_list_3.extend(
                _appearance_row(appearance)
                for appearance in (inning.bottom_half_appearance_list or [])
            )

appearance_columns = [
    'Pitcher',
    'Batter',
    'Out Runners',
    'Scoring Runners',
    'RBIs',
    'Scorecard',
    'On-base?',
    'Plate Summary',
    'Plate Description',
    'Error',
    'Inning Outs',
]
df3 = pd.DataFrame(data=pitch_tuple_list_3, columns=appearance_columns)
# Plot plate-appearance summary counts for each pitcher with more than
# 400 recorded plate appearances.
for pitcher in df3['Pitcher'].unique():
    summary = df3.loc[df3['Pitcher'] == pitcher, 'Plate Summary']
    if len(summary) <= 400:
        continue
    fig, ax = plt.subplots()
    ax.set_ylim(0, 250)
    # sort=False skips value_counts' default ordering by frequency.
    summary.value_counts(sort=False).plot.bar()
    plt.title(pitcher)
    plt.show()
# Build one (pitcher, did-not-reach, reached) row per pitcher.
#
# The counts are taken directly from the truth value of 'On-base?' rather
# than by indexing value_counts() with 0/1. value_counts() orders by
# frequency, so positional access would swap the two columns for any pitcher
# who allowed more baserunners than outs, and integer-position fallback on a
# non-integer index is deprecated in pandas.
# NOTE(review): assumes 'On-base?' holds booleans (or missing values) - confirm.
x = []
for pitcher in df3['Pitcher'].unique():
    reached = df3.loc[df3['Pitcher'] == pitcher, 'On-base?'].dropna().astype(bool)
    got_on_base_count = int(reached.sum())
    not_on_base_count = int(len(reached)) - got_on_base_count
    # Keep only pitchers for whom both outcomes occur, as the original
    # two-distinct-values check did.
    if got_on_base_count and not_on_base_count:
        x.append((str(pitcher), not_on_base_count, got_on_base_count))

df4 = pd.DataFrame(data=x, columns=['Pitcher',
                                    'Did not get on base',
                                    'Got on base'])
df4.index = df4['Pitcher']
# Show the ten pitchers with the most plate appearances that did not reach base.
df4.sort_values(by=['Got on base']).nlargest(10, 'Did not get on base').plot.bar()