Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor pt2 [untested] #4

Merged
merged 8 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.10", "3.11", "3.12"]
python-version: [ "3.12" ]

steps:
- uses: actions/checkout@v4
Expand All @@ -27,8 +27,9 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest build
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python -m pip install flake8 pytest build poetry
poetry install
poetry run post-install
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
Expand All @@ -37,7 +38,7 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
if [ -d tests ]; then pytest; fi
if [ -d tests ]; then poetry run pytest; fi
- name: Build package
run: python -m build
- name: Upload build artifacts
Expand Down
17 changes: 8 additions & 9 deletions pyball/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
name = "pyball"
from .playerid_lookup import *
from .utils import *
from .batting_stats import *
from .pitching_stats import *
from .team_batting_stats import *
from .team_pitching_stats import *
from .savant import *
"""
This is the pyball module.
"""

import subprocess


def post_install():
    """
    Run the playwright install command.

    Invokes ``playwright install`` as a subprocess.

    Raises:
    -------
    subprocess.CalledProcessError:
        If the command exits with a non-zero status (``check=True``).
    FileNotFoundError:
        If the ``playwright`` executable cannot be found on PATH.
    """
    # Argument list (no shell) so nothing is subject to shell interpretation.
    subprocess.run(["playwright", "install"], check=True)
137 changes: 137 additions & 0 deletions pyball/baseball_reference_player.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# File: baseball_reference_player.py
# Author: Gabriel DiFiore <difioregabe@gmail.com>
# (c) 2022-2024
#
# Description: Scraper class for obtaining player stats from Baseball-Reference

from typing import Optional
import logging
import pandas as pd
from bs4 import BeautifulSoup

from pyball.utils import read_url, is_bbref_player_url

# NOTE(review): basicConfig() configures the *root* logger as a side effect of
# importing this module; libraries normally leave logging configuration to the
# application. Kept as-is because callers may rely on it -- confirm before removing.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class BaseballReferencePlayerStatsScraper:
    """
    A class for scraping player statistics from Baseball-Reference.

    This class provides methods to extract batting and pitching statistics
    for baseball players from their Baseball-Reference profile pages.

    The page is fetched once, when the instance is created. If the fetch
    fails, the instance is still created and every stats method returns None.

    Methods:
    --------
    batting_stats() -> Optional[pd.DataFrame]:
        Retrieves the batting statistics for the player.

    pitching_stats() -> Optional[pd.DataFrame]:
        Retrieves the pitching statistics for the player.
    """

    # Maps the stat kind used by this class to the HTML id of its table.
    TABLE_IDS = {
        'batting': 'batting_standard',
        'pitching': 'pitching_standard'
    }

    def __init__(self, url: str):
        """
        Initializes a new instance of the BaseballReferencePlayerStatsScraper class.

        Parameters:
        -----------
        url : str
            The URL of the Baseball-Reference profile page for the player.

        Raises:
        -------
        ValueError:
            If the provided URL is invalid.
        """
        if not is_bbref_player_url(url):
            raise ValueError(f"Invalid player URL: {url}")
        self.url = url
        # May be None when the page could not be retrieved.
        self.soup = self._get_soup()

    def _get_soup(self) -> Optional[BeautifulSoup]:
        """
        Retrieves the BeautifulSoup object for the player's profile page.

        Returns:
        --------
        Optional[BeautifulSoup]:
            The BeautifulSoup object representing the player's profile page, or None if retrieval failed.
        """
        soup = read_url(self.url)
        if soup is None:
            logger.warning("Failed to retrieve content from URL: %s", self.url)
        return soup

    def _find_table(self, table_id: str) -> Optional[BeautifulSoup]:
        """
        Finds the HTML table element for the given stat kind.

        Parameters:
        -----------
        table_id : str
            A key of TABLE_IDS ('batting' or 'pitching'); it is translated to
            the HTML id of the table to find.

        Returns:
        --------
        Optional[BeautifulSoup]:
            The found table element, or None if the page was not retrieved
            or the table is not present.

        Raises:
        -------
        KeyError:
            If table_id is not a key of TABLE_IDS.
        """
        html_id = self.TABLE_IDS[table_id]
        # The fetch in __init__ may have failed; without this guard the
        # lookup below would raise AttributeError on None.
        if self.soup is None:
            return None
        return self.soup.find("table", id=html_id)

    def _get_dataframe(self, table_id: str) -> Optional[pd.DataFrame]:
        """
        Parses the HTML table and returns it as a pandas DataFrame.

        Parameters:
        -----------
        table_id : str
            A key of TABLE_IDS ('batting' or 'pitching') selecting the table to parse.

        Returns:
        --------
        Optional[pd.DataFrame]:
            The parsed table with all-empty rows dropped, or None if the
            table was not found or parsing failed.
        """
        # Local import keeps this method self-contained.
        from io import StringIO

        table = self._find_table(table_id)
        if table is None:
            logger.warning("%s stats table not found for URL: %s", table_id.capitalize(), self.url)
            return None

        try:
            # Wrap the markup in StringIO: passing literal HTML strings to
            # read_html is deprecated in recent pandas releases.
            df = pd.read_html(StringIO(str(table)))[0]
            return df.dropna(how="all")
        except ValueError as e:
            logger.error("Error parsing %s stats table (no tables found): %s", table_id, str(e))
            return None
        except Exception as e:
            # Best-effort scraper: any other parser failure is logged and reported as "no data".
            logger.error("Error parsing %s stats table: %s", table_id, str(e))
            return None

    def batting_stats(self) -> Optional[pd.DataFrame]:
        """
        Retrieves the batting statistics for the player.

        Returns:
        --------
        Optional[pd.DataFrame]:
            The batting statistics for the player as a pandas DataFrame, or None if not available.
        """
        return self._get_dataframe('batting')

    def pitching_stats(self) -> Optional[pd.DataFrame]:
        """
        Retrieves the pitching statistics for the player.

        Returns:
        --------
        Optional[pd.DataFrame]:
            The pitching statistics for the player as a pandas DataFrame, or None if not available.
        """
        return self._get_dataframe('pitching')
90 changes: 90 additions & 0 deletions pyball/baseball_reference_team.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# File: baseball_reference_team.py
# Author: Gabriel DiFiore <difioregabe@gmail.com>
# (c) 2022-2024
#
# Description: Scraper class for obtaining team stats from Baseball-Reference

from typing import Optional
import logging
import pandas as pd
from bs4 import BeautifulSoup

from pyball.utils import read_url, is_bbref_team_url

# NOTE(review): basicConfig() configures the *root* logger as a side effect of
# importing this module; libraries normally leave logging configuration to the
# application. Kept as-is because callers may rely on it -- confirm before removing.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class BaseballReferenceTeamStatsScraper:
    """
    A class for scraping team statistics from Baseball-Reference.

    This class provides methods to extract batting and pitching statistics
    for baseball teams from their Baseball-Reference pages.

    The page is fetched once, when the instance is created. If the fetch
    fails, the instance is still created and every stats method returns None.

    Methods:
    --------
    batting_stats(self) -> Optional[pd.DataFrame]
        Returns the batting stats for the team as a pandas DataFrame.

    pitching_stats(self) -> Optional[pd.DataFrame]
        Returns the pitching stats for the team as a pandas DataFrame.
    """

    # Maps the stat kind used by this class to the HTML id of its table.
    TABLE_IDS = {
        'batting': 'team_batting',
        'pitching': 'team_pitching'
    }

    def __init__(self, url: str):
        """
        Create a scraper for one Baseball-Reference team page.

        Parameters:
        -----------
        url : str
            The URL of the Baseball-Reference page for the team.

        Raises:
        -------
        ValueError
            If the URL is rejected by is_bbref_team_url.
        """
        if not is_bbref_team_url(url):
            raise ValueError(f"Invalid team URL: {url}")
        self.url = url
        # May be None when the page could not be retrieved.
        self.soup = self._get_soup()

    def _get_soup(self) -> Optional[BeautifulSoup]:
        """
        Fetch the team page and return it as a BeautifulSoup object.

        Returns:
        --------
        Optional[BeautifulSoup]
            The parsed page, or None if retrieval failed (a warning is logged).
        """
        soup = read_url(self.url)
        if soup is None:
            logger.warning("Failed to retrieve content from URL: %s", self.url)
        return soup

    def _find_table(self, table_id: str) -> Optional[BeautifulSoup]:
        """
        Find the HTML table element for the given stat kind.

        Parameters:
        -----------
        table_id : str
            A key of TABLE_IDS ('batting' or 'pitching'); it is translated to
            the HTML id of the table to find.

        Returns:
        --------
        Optional[BeautifulSoup]
            The found table element, or None if the page was not retrieved
            or the table is not present.

        Raises:
        -------
        KeyError
            If table_id is not a key of TABLE_IDS.
        """
        html_id = self.TABLE_IDS[table_id]
        # The fetch in __init__ may have failed; without this guard the
        # lookup below would raise AttributeError on None.
        if self.soup is None:
            return None
        return self.soup.find("table", id=html_id)

    def _get_dataframe(self, table_id: str) -> Optional[pd.DataFrame]:
        """
        Parse the selected HTML table into a pandas DataFrame.

        Parameters:
        -----------
        table_id : str
            A key of TABLE_IDS ('batting' or 'pitching') selecting the table to parse.

        Returns:
        --------
        Optional[pd.DataFrame]
            The parsed table with all-empty rows dropped, or None if the
            table was not found or parsing failed.
        """
        # Local import keeps this method self-contained.
        from io import StringIO

        table = self._find_table(table_id)
        if table is None:
            logger.warning("%s stats table not found for URL: %s", table_id.capitalize(), self.url)
            return None

        try:
            # Wrap the markup in StringIO: passing literal HTML strings to
            # read_html is deprecated in recent pandas releases.
            df = pd.read_html(StringIO(str(table)))[0]
            return df.dropna(how="all")
        except ValueError as e:
            logger.error("Error parsing %s stats table (no tables found): %s", table_id, str(e))
            return None
        except Exception as e:
            # Best-effort scraper: any other parser failure is logged and reported as "no data".
            logger.error("Error parsing %s stats table: %s", table_id, str(e))
            return None

    def batting_stats(self) -> Optional[pd.DataFrame]:
        """
        Return the (Baseball-Reference) batting stats for a team as a pandas dataframe

        Returns:
        --------
        Optional[pd.DataFrame]
            Contains the batting stats for the team, or None if not available
        """
        return self._get_dataframe('batting')

    def pitching_stats(self) -> Optional[pd.DataFrame]:
        """
        Return the (Baseball-Reference) pitching stats for a team as a pandas dataframe

        Returns:
        --------
        Optional[pd.DataFrame]
            Contains the pitching stats for the team, or None if not available
        """
        return self._get_dataframe('pitching')
55 changes: 0 additions & 55 deletions pyball/batting_stats.py

This file was deleted.

55 changes: 0 additions & 55 deletions pyball/pitching_stats.py

This file was deleted.

Loading