"""UFC event scraper (integrations/ufc.py).

Fetches events directly from UFC.com instead of TheSportsDB, whose UFC
dates were off by roughly one day.  Start times come from the event
page's ``data-timestamp`` attribute (Unix seconds, UTC), so they are
exact rather than inferred from a printed date.

NOTE(review): the patch this module originates from also registers the
integration in ``main.py``::

    from integrations.ufc import UfcIntegration, UfcCalendar
    ...
    UfcIntegration(
        id="ufc",
        name="UFC",
        description="UFC events scraped directly from UFC.com",
        base_url="https://www.ufc.com",
        calendar_class=UfcCalendar,
        multi_calendar=False,
    ),
"""
from typing import List
from datetime import datetime, timedelta, timezone

import requests
from bs4 import BeautifulSoup
from fastapi import HTTPException

from base import CalendarBase, Event, IntegrationBase


UFC_EVENTS_URL = "https://www.ufc.com/events"
UFC_BASE_URL = "https://www.ufc.com"

# Shared headers for every request to UFC.com, hoisted so the listing
# and detail fetches always identify themselves identically.
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; Sync2Cal/1.0)"}


def get_event_urls() -> List[str]:
    """Return deduplicated, absolute event-page URLs from UFC.com.

    Scrapes the first two listing pages (``?page=0`` and ``?page=1``),
    keeps only links containing ``/event/`` that point at UFC.com,
    strips fragments/query strings, and deduplicates by event slug.

    Returns:
        Absolute event URLs in the order first encountered.
    """
    seen_slugs = set()
    event_urls: List[str] = []

    for page in range(2):  # pages 0 and 1: upcoming plus most recent events
        url = f"{UFC_EVENTS_URL}?page={page}"
        response = requests.get(url, timeout=30, headers=REQUEST_HEADERS)
        if response.status_code != 200:
            # Best effort: a failed listing page just yields fewer events.
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Event links must be on UFC.com and contain /event/.
        for link in soup.find_all("a", href=True):
            href = link["href"]

            # Skip links that are neither site-relative nor on ufc.com
            # (e.g. ticketmaster ticket links).
            if not href.startswith("/") and "ufc.com" not in href:
                continue

            if "/event/" not in href:
                continue

            # Make relative links absolute.
            if href.startswith("/"):
                href = f"{UFC_BASE_URL}{href}"

            # Drop fragment (e.g. #1295) and query string before dedup.
            clean_url = href.split("#")[0].split("?")[0]

            # The slug after /event/ identifies the event.
            slug = clean_url.split("/event/")[-1]

            if slug not in seen_slugs:
                seen_slugs.add(slug)
                event_urls.append(clean_url)

    return event_urls


def get_event_details(event_url: str) -> dict | None:
    """Scrape a single UFC.com event page into a plain dict.

    Args:
        event_url: Absolute URL of the event page.

    Returns:
        A dict with keys ``uid``, ``title``, ``start`` (naive UTC
        ``datetime``), ``location``, and ``description`` — or ``None``
        when the page cannot be fetched or lacks a title/timestamp, so
        callers can simply skip it (deliberate best-effort scraping).
    """
    try:
        response = requests.get(event_url, timeout=30, headers=REQUEST_HEADERS)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, "html.parser")

        # Event title — the headline nests the two fighters in a divider
        # for "X vs Y" matchups.
        title_el = soup.select_one(".c-hero__headline")
        if not title_el:
            return None

        divider = title_el.select_one(".e-divider")
        if divider:
            top = divider.select_one(".e-divider__top")
            bottom = divider.select_one(".e-divider__bottom")
            if top and bottom:
                title = f"{top.get_text(strip=True)} vs {bottom.get_text(strip=True)}"
            else:
                title = title_el.get_text(strip=True)
        else:
            title = title_el.get_text(strip=True)

        # Prepend the headline prefix (e.g. "UFC Fight Night") when present.
        prefix_el = soup.select_one(".c-hero__headline-prefix")
        if prefix_el:
            prefix = prefix_el.get_text(strip=True)
            title = f"{prefix}: {title}"

        # The precise start time lives in a data-timestamp attribute;
        # fall back to any element carrying one.
        timestamp_el = soup.select_one(".c-hero__headline-suffix[data-timestamp]")
        if not timestamp_el:
            timestamp_el = soup.select_one("[data-timestamp]")

        if not timestamp_el:
            return None

        timestamp_str = timestamp_el.get("data-timestamp")
        if not timestamp_str:
            return None

        # Parse the Unix timestamp (in seconds).
        try:
            timestamp = int(timestamp_str)
            # datetime.utcfromtimestamp() is deprecated since Python 3.12;
            # build the same naive-UTC datetime via an aware conversion.
            start_time = datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)
        except (ValueError, TypeError):
            return None

        # Venue/location — the markup has varied, so try several selectors.
        location = ""
        for venue_selector in (".field--name-venue", ".c-hero__headline-location", ".c-event-venue"):
            venue_el = soup.select_one(venue_selector)
            if venue_el:
                # Collapse all runs of whitespace in the venue text.
                location = " ".join(venue_el.get_text().split())
                break

        # Description: the main-card matchups, capped at six fights.
        description_parts = []
        main_card = soup.select_one("#main-card")
        if main_card:
            fights = main_card.select(".c-listing-fight")
            if fights:
                description_parts.append("Main Card:")
                for fight in fights[:6]:
                    red_corner = fight.select_one(".c-listing-fight__corner--red .c-listing-fight__corner-name")
                    blue_corner = fight.select_one(".c-listing-fight__corner--blue .c-listing-fight__corner-name")
                    if red_corner and blue_corner:
                        description_parts.append(f" {red_corner.get_text(strip=True)} vs {blue_corner.get_text(strip=True)}")

        # Stable UID derived from the URL slug.
        event_slug = event_url.split("/event/")[-1].split("?")[0]
        uid = f"ufc-{event_slug}"

        return {
            "uid": uid,
            "title": title,
            "start": start_time,
            "location": location,
            "description": "\n".join(description_parts) if description_parts else "",
        }

    except Exception:
        # Deliberate best-effort: any scraping hiccup skips this event.
        return None


class UfcCalendar(CalendarBase):
    """Calendar exposing UFC events scraped from UFC.com."""

    def fetch_events(self) -> List[Event]:
        """Fetch, deduplicate, and time-sort UFC events.

        Returns:
            Events sorted by start time; also stored on ``self.events``.

        Raises:
            HTTPException: 500 wrapping any unexpected scraping failure.
        """
        try:
            event_urls = get_event_urls()
            events: List[Event] = []
            seen_uids = set()

            for event_url in event_urls:
                details = get_event_details(event_url)
                if details is None:
                    continue

                # Skip duplicates that slipped past URL-level dedup.
                if details["uid"] in seen_uids:
                    continue
                seen_uids.add(details["uid"])

                # UFC events typically last about 3-4 hours.
                end_time = details["start"] + timedelta(hours=4)

                events.append(
                    Event(
                        uid=details["uid"],
                        title=details["title"],
                        start=details["start"],
                        end=end_time,
                        all_day=False,
                        description=details["description"],
                        location=details["location"],
                    )
                )

            # Sort by start time.
            events.sort(key=lambda e: e.start)
            self.events = events
            return events

        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e


class UfcIntegration(IntegrationBase):
    """Single-calendar integration; there are no per-user calendars."""

    def fetch_calendars(self, *args, **kwargs):
        # Single fixed calendar — nothing to enumerate.
        return None