FIRST web scraper

bubner · Jul 9, 2023 · cab0de4 · cab0de4
1 parent 275dd13
commit cab0de4
Show file tree

Hide file tree

Showing 4 changed files with 118 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -11,4 +11,5 @@ build/
 .vercel
 src/.env
 .idea
-.vscode
+.vscode
+src/geckodriver.log
diff --git a/src/api.py b/src/api.py
@@ -9,6 +9,7 @@
 from flask_login import current_user, login_required
 
 import db
+import scrape
 from firebase_instance import auth
 
 api_bp = Blueprint("api", __name__, template_folder="templates")
@@ -77,3 +78,17 @@ def api_dashboard():
             }
         ]
     return should_display
+
+@api_bp.route("/api/get_team_data/<int:team_number>")
+def get(team_number: int):
+    """
+        Look up a FIRST team number and return the scraped result.
+
+        The returned dict always carries "team_number", "valid", "season"
+        and "data"; "data" falls back to an empty dict when the scraper
+        returned nothing (or something falsy) for it.
+    """
+    scraped = scrape.get(team_number)
+    response = {"team_number": team_number}
+    # Copy the optional fields through; missing keys become None
+    for key in ("valid", "season"):
+        response[key] = scraped.get(key)
+    response["data"] = scraped.get("data") or {}
+    return response
+
diff --git a/src/requirements.txt b/src/requirements.txt
@@ -1,3 +1,4 @@
+attrs==23.1.0
 blinker==1.6.2
 cachetools==5.3.0
 certifi==2023.5.7
@@ -6,6 +7,7 @@ charset-normalizer==3.1.0
 click==8.1.3
 cryptography==41.0.0
 Deprecated==1.2.13
+exceptiongroup==1.1.2
 firebase-rest-api==1.10.1
 Flask==2.3.2
 Flask-Login==0.6.2
@@ -20,12 +22,14 @@ google-resumable-media==2.5.0
 googleapis-common-protos==1.59.0
 grpcio==1.54.2
 grpcio-status==1.54.2
+h11==0.14.0
 idna==3.4
 importlib-metadata==6.6.0
 itsdangerous==2.1.2
 Jinja2==3.1.2
 jwcrypto==1.4.2
 MarkupSafe==2.1.2
+outcome==1.2.0
 Pillow==9.5.0
 pkce==1.0.3
 proto-plus==1.22.2
@@ -34,16 +38,23 @@ pyasn1==0.5.0
 pyasn1-modules==0.3.0
 pycparser==2.21
 pypng==0.20220715.0
+PySocks==1.7.1
 python-dotenv==1.0.0
 python-jwt==4.0.0
 pytz==2023.3
 qrcode==7.4.2
 requests==2.31.0
 rsa==4.9
+selenium==4.10.0
 six==1.16.0
+sniffio==1.3.0
+sortedcontainers==2.4.0
+trio==0.22.1
+trio-websocket==0.10.3
 typing_extensions==4.6.2
 urllib3==1.26.15
 Werkzeug==2.3.4
 wrapt==1.15.0
+wsproto==1.2.0
 WTForms==3.0.1
 zipp==3.15.0
diff --git a/src/scrape.py b/src/scrape.py
@@ -0,0 +1,90 @@
+"""
+    Web scraper for FIRST (FLL/FTC/FRC) team number data.
+    @author: Lucas Bubner, 2023
+"""
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
+from datetime import datetime
+
+def get(team_number: int) -> dict:
+    """
+        Get information about a team number.
+
+        Searches the FIRST team/event search page in a headless Firefox
+        instance, trying last year's season first and falling back to this
+        year's season if nothing is found.
+
+        Returns {"valid": False} when neither search finds anything.
+        Otherwise returns a dict with:
+            "valid":  True
+            "season": "<year>-<year + 1>" for the season that matched
+            "data":   a list with one dict per team found, holding
+                      "nickname", "orgs", "program", "location" (strings)
+                      and "rookie_year" (int)
+
+        Raises IndexError if a result is missing one of the expected field
+        labels and ValueError if the rookie year is not an integer. Errors
+        raised by Selenium itself are not caught here.
+    """
+    not_found = "No Teams or Events Found"
+
+    # Create a headless Firefox instance. The `headless` property setter is
+    # deprecated in Selenium 4.8+, so pass the flag as an argument instead.
+    options = Options()
+    options.add_argument("-headless")
+    driver = webdriver.Firefox(options=options)
+
+    try:
+        # Try with last year's season first
+        using_this_year = False
+        driver.get(_link(str(team_number), -1))
+
+        if not_found in driver.page_source:
+            # Fall back to this year's season
+            using_this_year = True
+            driver.get(_link(str(team_number)))
+            if not_found in driver.page_source:
+                # Neither season has results, so the team number is invalid
+                return {
+                    "valid": False
+                }
+
+        # Team must exist; read the results text while the browser is open
+        results = driver.find_element(By.ID, "dTeamEventResults").text
+    finally:
+        # Always shut the browser down, otherwise every call leaks a
+        # Firefox/geckodriver process
+        driver.quit()
+
+    this_year = datetime.now().year
+    season = this_year if using_this_year else this_year - 1
+
+    # Remove formatting
+    results = results.replace("\n", " ").replace("\t", " ").replace("\r", " ")
+
+    # Extract into multiple teams, if applicable. The first element is the
+    # text before the first "Team Number" header, so drop it.
+    teams = results.split("Team Number")[1:]
+
+    # (output key, label that starts the field, label that ends the field)
+    fields = (
+        ("nickname", "Nickname: ", "Organization(s):"),
+        ("orgs", "Organization(s): ", "Program:"),
+        ("program", "Program: ", "Location:"),
+        ("location", "Location: ", "Rookie Year:"),
+    )
+
+    extracted_data = []
+    for team in teams:
+        teamdata = {
+            key: team.split(start)[1].split(end)[0].strip()
+            for key, start, end in fields
+        }
+        # Rookie year is the last field, so it runs to the end of the text
+        teamdata["rookie_year"] = int(team.split("Rookie Year: ")[1].strip())
+        extracted_data.append(teamdata)
+
+    return {
+        "valid": True,
+        "season": f"{season}-{season + 1}",
+        "data": extracted_data,
+    }
+
+
+
+def _link(team_number: str, offset: int = 0) -> str:
+    """
+        Build the FIRST team/event search URL for a team number.
+
+        The `year` value in the URL is the current calendar year plus
+        `offset` (e.g. -1 for last year).
+    """
+    target_year = datetime.now().year + offset
+    # Search parameters live in the URL fragment, joined with "&"
+    fragment = "&".join((
+        "type=teams",
+        "sort=name",
+        f"keyword={team_number}",
+        "programs=FLLJR,FLL,FTC,FRC",
+        f"year={target_year}",
+    ))
+    return f"https://www.firstinspires.org/team-event-search#{fragment}"
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,4 +11,5 @@ build/ @@
     .vercel
     src/.env
     .idea
-    .vscode
+    .vscode
+    src/geckodriver.log