Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add team splits retrieval from baseball reference #359

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions docs/split_stats.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,57 @@ df, player_info_dict = get_splits('troutmi01', player_info=True)
#find the pitching split stats for Jon Lester
df = get_splits('lestejo01', pitching_splits=True)
```

# Get Team Splits Stats

`get_team_splits(team, year, pitching_splits = False)`
Look up a team's split stats from baseball-reference. Either batting or pitching splits can be returned and splits can be provided for any season.
Split stats are returned as a multi-index dataframe by split category and split.
## Arguments

`team:` String. The team's baseball-reference code.
|Team Name|Team Code|
|:--:|:--:|
|Angels|ANA|
|Astros|HOU|
|Athletics|OAK|
|Blue Jays|TOR|
|Braves|ATL|
|Brewers|MIL|
|Cardinals|STL|
|Cubs|CHC|
|Rays|TBD|
|Diamondbacks|ARI|
|Dodgers|LAD|
|Giants|SFG|
|Indians (Guardians)|CLE|
|Mariners|SEA|
|Marlins|FLA|
|Mets|NYM|
|Nationals|WSN|
|Orioles|BAL|
|Padres|SDP|
|Phillies|PHI|
|Pirates|PIT|
|Rangers|TEX|
|Red Sox|BOS|
|Reds|CIN|
|Rockies|COL|
|Royals|KCR|
|Tigers|DET|
|Twins|MIN|
|White Sox|CHW|
|Yankees|NYY|

`year:` Integer. The year to get split stats for.

`pitching_splits:` Boolean. Optional. If set to True, the get_team_splits function will return pitching splits. Otherwise, get_team_splits will return batting splits.

## Examples of valid queries

```python
from pybaseball import get_team_splits

# find the split stats for the Braves for 2023
df = get_team_splits('ATL', 2023)
```
1 change: 1 addition & 0 deletions pybaseball/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,5 @@
from .datasources.fangraphs import (fg_batting_data, fg_pitching_data, fg_team_batting_data, fg_team_fielding_data,
fg_team_pitching_data)
from .split_stats import get_splits
from .split_stats import get_team_splits
from .version import __version__
54 changes: 41 additions & 13 deletions pybaseball/split_stats.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union

import bs4 as bs
import pandas as pd
Expand All @@ -8,6 +8,16 @@

session = BRefSession()

def get_team_split_soup(team: str, year: int, pitching_splits: bool = False) -> bs.BeautifulSoup:
    """
    Fetch a team's splits page from baseball-reference and parse it.

    The `t` query parameter is 'p' when pitching_splits is True and 'b'
    otherwise; team and year are placed in the query string as given.
    The response body is parsed with the lxml parser and the resulting
    BeautifulSoup object is returned.
    """
    if pitching_splits:
        pitch_or_bat = 'p'
    else:
        pitch_or_bat = 'b'
    url = f"https://www.baseball-reference.com/teams/split.cgi?t={pitch_or_bat}&team={team}&year={year}"
    response = session.get(url)
    return bs.BeautifulSoup(response.content, 'lxml')


def get_split_soup(playerid: str, year: Optional[int] = None, pitching_splits: bool = False) -> bs.BeautifulSoup:
"""
Expand Down Expand Up @@ -57,13 +67,10 @@ def get_player_info(playerid: str, soup: bs.BeautifulSoup = None) -> Dict:
return player_info_data


def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = False, pitching_splits: bool = False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict]]:
def soup_to_data(soup: bs.BeautifulSoup, year: int, identifier: str, identifier_label: str):
"""
Returns a dataframe of all split stats for a given player.
If player_info is True, this will also return a dictionary that includes player position, handedness, height, weight, position, and team
splits tables on the bbref site are all within an embedded comment. This finds all the comments
"""
soup = get_split_soup(playerid, year, pitching_splits)
# the splits tables on the bbref site are all within an embedded comment. This finds all the comments
comment = soup.find_all(text=lambda text: isinstance(text, bs.Comment))
raw_data = []
raw_level_data = []
Expand All @@ -85,7 +92,7 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
level_headings = [th.get_text()
for th in splits[j].find("tr").find_all("th")][:]
level_headings.append('Split Type')
level_headings.append('Player ID')
level_headings.append(identifier_label)
# singles data isn't included in the tables so this appends the column header
level_headings.append('1B')
raw_level_data.append(level_headings)
Expand All @@ -98,7 +105,7 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
level_cols = [ele.text.strip() for ele in level_cols]
if split_type != "By Inning": # bbref added three empty columns to the by inning tables that don't match the rest of the tables. Not including this split table in results
level_cols.append(split_type)
level_cols.append(playerid)
level_cols.append(identifier)
raw_level_data.append([ele for ele in level_cols])
else:
if year == None: # The bbref tables for career splits have one extra preceding th column labeled 'I' that is not used and is not in the single season records
Expand All @@ -108,7 +115,7 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
headings = [th.get_text()
for th in splits[j].find("tr").find_all("th")][:]
headings.append('Split Type')
headings.append('Player ID')
headings.append(identifier_label)
# singles data isn't included in the tables so this appends the column header
headings.append('1B')
raw_data.append(headings)
Expand All @@ -121,18 +128,39 @@ def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = Fa
cols = [ele.text.strip() for ele in cols]
if split_type != "By Inning": # bbref added three empty columns to the by inning tables that don't match the rest of the tables. Not including this split table in results
cols.append(split_type)
cols.append(playerid)
cols.append(identifier)
raw_data.append([ele for ele in cols])

data = pd.DataFrame(raw_data)
data = data.rename(columns=data.iloc[0])
data = data.reindex(data.index.drop(0))
data = data.set_index(['Player ID', 'Split Type', 'Split'])
data = data.set_index([identifier_label, 'Split Type', 'Split'])
data = data.drop(index=['Split'], level=2)
data = data.apply(pd.to_numeric, errors='coerce').convert_dtypes()
data = data.dropna(axis=1, how='all')
data['1B'] = data['H']-data['2B']-data['3B']-data['HR']
data = data.loc[playerid]
data = data.loc[identifier]
return data, raw_level_data


def get_team_splits(team: str, year: int, pitching_splits: bool = False) -> pd.DataFrame:
    """
    Return the split stats for one team season as a dataframe.

    The team's splits page is downloaded with get_team_split_soup (pitching
    splits when pitching_splits is True, batting splits otherwise) and handed
    to soup_to_data, using the team code as the identifier under the "Team"
    label. Only the first of the two values soup_to_data returns is passed
    back to the caller.
    """
    team_soup = get_team_split_soup(team, year, pitching_splits)
    # soup_to_data returns a pair; the second element (raw level data) is not used here
    split_frame, _unused_level_rows = soup_to_data(team_soup, year, team, "Team")
    return split_frame

def get_splits(playerid: str, year: Optional[int] = None, player_info: bool = False, pitching_splits: bool = False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, Dict]]:
"""
Returns a dataframe of all split stats for a given player.
If player_info is True, this will also return a dictionary that includes player position, handedness, height, weight, position, and team
"""
soup = get_split_soup(playerid, year, pitching_splits)
# the splits tables on the bbref site are all within an embedded comment. This finds all the comments
data, raw_level_data = soup_to_data(soup, year, playerid, 'Player ID')

if pitching_splits is True: # Returns Game Level tables as a second dataframe for pitching splits
level_data = pd.DataFrame(raw_level_data)
level_data = level_data.rename(columns=level_data.iloc[0])
Expand Down