"""UFC event scraper (integrations/ufc.py).

Fetches events directly from UFC.com instead of TheSportsDB, whose UFC
dates were off by roughly one day.  Start times come from the event
page's ``data-timestamp`` attribute (Unix seconds, UTC), so they are
exact rather than inferred from a printed date.

NOTE(review): the patch this module originates from also registers the
integration in ``main.py``::

    from integrations.ufc import UfcIntegration, UfcCalendar
    ...
    UfcIntegration(
        id="ufc",
        name="UFC",
        description="UFC events scraped directly from UFC.com",
        base_url="https://www.ufc.com",
        calendar_class=UfcCalendar,
        multi_calendar=False,
    ),
"""
from typing import List
from datetime import datetime, timedelta, timezone

import requests
from bs4 import BeautifulSoup
from fastapi import HTTPException

from base import CalendarBase, Event, IntegrationBase


UFC_EVENTS_URL = "https://www.ufc.com/events"
UFC_BASE_URL = "https://www.ufc.com"

# Shared headers for every request to UFC.com, hoisted so the listing
# and detail fetches always identify themselves identically.
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; Sync2Cal/1.0)"}


def get_event_urls() -> List[str]:
    """Return deduplicated, absolute event-page URLs from UFC.com.

    Scrapes the first two listing pages (``?page=0`` and ``?page=1``),
    keeps only links containing ``/event/`` that point at UFC.com,
    strips fragments/query strings, and deduplicates by event slug.

    Returns:
        Absolute event URLs in the order first encountered.
    """
    seen_slugs = set()
    event_urls: List[str] = []

    for page in range(2):  # pages 0 and 1: upcoming plus most recent events
        url = f"{UFC_EVENTS_URL}?page={page}"
        response = requests.get(url, timeout=30, headers=REQUEST_HEADERS)
        if response.status_code != 200:
            # Best effort: a failed listing page just yields fewer events.
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Event links must be on UFC.com and contain /event/.
        for link in soup.find_all("a", href=True):
            href = link["href"]

            # Skip links that are neither site-relative nor on ufc.com
            # (e.g. ticketmaster ticket links).
            if not href.startswith("/") and "ufc.com" not in href:
                continue

            if "/event/" not in href:
                continue

            # Make relative links absolute.
            if href.startswith("/"):
                href = f"{UFC_BASE_URL}{href}"

            # Drop fragment (e.g. #1295) and query string before dedup.
            clean_url = href.split("#")[0].split("?")[0]

            # The slug after /event/ identifies the event.
            slug = clean_url.split("/event/")[-1]

            if slug not in seen_slugs:
                seen_slugs.add(slug)
                event_urls.append(clean_url)

    return event_urls


def get_event_details(event_url: str) -> dict | None:
    """Scrape a single UFC.com event page into a plain dict.

    Args:
        event_url: Absolute URL of the event page.

    Returns:
        A dict with keys ``uid``, ``title``, ``start`` (naive UTC
        ``datetime``), ``location``, and ``description`` — or ``None``
        when the page cannot be fetched or lacks a title/timestamp, so
        callers can simply skip it (deliberate best-effort scraping).
    """
    try:
        response = requests.get(event_url, timeout=30, headers=REQUEST_HEADERS)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, "html.parser")

        # Event title — the headline nests the two fighters in a divider
        # for "X vs Y" matchups.
        title_el = soup.select_one(".c-hero__headline")
        if not title_el:
            return None

        divider = title_el.select_one(".e-divider")
        if divider:
            top = divider.select_one(".e-divider__top")
            bottom = divider.select_one(".e-divider__bottom")
            if top and bottom:
                title = f"{top.get_text(strip=True)} vs {bottom.get_text(strip=True)}"
            else:
                title = title_el.get_text(strip=True)
        else:
            title = title_el.get_text(strip=True)

        # Prepend the headline prefix (e.g. "UFC Fight Night") when present.
        prefix_el = soup.select_one(".c-hero__headline-prefix")
        if prefix_el:
            prefix = prefix_el.get_text(strip=True)
            title = f"{prefix}: {title}"

        # The precise start time lives in a data-timestamp attribute;
        # fall back to any element carrying one.
        timestamp_el = soup.select_one(".c-hero__headline-suffix[data-timestamp]")
        if not timestamp_el:
            timestamp_el = soup.select_one("[data-timestamp]")

        if not timestamp_el:
            return None

        timestamp_str = timestamp_el.get("data-timestamp")
        if not timestamp_str:
            return None

        # Parse the Unix timestamp (in seconds).
        try:
            timestamp = int(timestamp_str)
            # datetime.utcfromtimestamp() is deprecated since Python 3.12;
            # build the same naive-UTC datetime via an aware conversion.
            start_time = datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)
        except (ValueError, TypeError):
            return None

        # Venue/location — the markup has varied, so try several selectors.
        location = ""
        for venue_selector in (".field--name-venue", ".c-hero__headline-location", ".c-event-venue"):
            venue_el = soup.select_one(venue_selector)
            if venue_el:
                # Collapse all runs of whitespace in the venue text.
                location = " ".join(venue_el.get_text().split())
                break

        # Description: the main-card matchups, capped at six fights.
        description_parts = []
        main_card = soup.select_one("#main-card")
        if main_card:
            fights = main_card.select(".c-listing-fight")
            if fights:
                description_parts.append("Main Card:")
                for fight in fights[:6]:
                    red_corner = fight.select_one(".c-listing-fight__corner--red .c-listing-fight__corner-name")
                    blue_corner = fight.select_one(".c-listing-fight__corner--blue .c-listing-fight__corner-name")
                    if red_corner and blue_corner:
                        description_parts.append(f" {red_corner.get_text(strip=True)} vs {blue_corner.get_text(strip=True)}")

        # Stable UID derived from the URL slug.
        event_slug = event_url.split("/event/")[-1].split("?")[0]
        uid = f"ufc-{event_slug}"

        return {
            "uid": uid,
            "title": title,
            "start": start_time,
            "location": location,
            "description": "\n".join(description_parts) if description_parts else "",
        }

    except Exception:
        # Deliberate best-effort: any scraping hiccup skips this event.
        return None


class UfcCalendar(CalendarBase):
    """Calendar exposing UFC events scraped from UFC.com."""

    def fetch_events(self) -> List[Event]:
        """Fetch, deduplicate, and time-sort UFC events.

        Returns:
            Events sorted by start time; also stored on ``self.events``.

        Raises:
            HTTPException: 500 wrapping any unexpected scraping failure.
        """
        try:
            event_urls = get_event_urls()
            events: List[Event] = []
            seen_uids = set()

            for event_url in event_urls:
                details = get_event_details(event_url)
                if details is None:
                    continue

                # Skip duplicates that slipped past URL-level dedup.
                if details["uid"] in seen_uids:
                    continue
                seen_uids.add(details["uid"])

                # UFC events typically last about 3-4 hours.
                end_time = details["start"] + timedelta(hours=4)

                events.append(
                    Event(
                        uid=details["uid"],
                        title=details["title"],
                        start=details["start"],
                        end=end_time,
                        all_day=False,
                        description=details["description"],
                        location=details["location"],
                    )
                )

            # Sort by start time.
            events.sort(key=lambda e: e.start)
            self.events = events
            return events

        except HTTPException:
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e


class UfcIntegration(IntegrationBase):
    """Single-calendar integration; there are no per-user calendars."""

    def fetch_calendars(self, *args, **kwargs):
        # Single fixed calendar — nothing to enumerate.
        return None