Skip to content

Commit

Permalink
FIRST web scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
bubner committed Jul 9, 2023
1 parent 275dd13 commit cab0de4
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ build/
.vercel
src/.env
.idea
.vscode
.vscode
src/geckodriver.log
15 changes: 15 additions & 0 deletions src/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from flask_login import current_user, login_required

import db
import scrape
from firebase_instance import auth

api_bp = Blueprint("api", __name__, template_folder="templates")
Expand Down Expand Up @@ -77,3 +78,17 @@ def api_dashboard():
}
]
return should_display

@api_bp.route("/api/get_team_data/<int:team_number>")
def get(team_number: int):
    """
    Look up a FIRST team (FRC/FTC/FLL) by number and return its scraped data.

    The response always echoes ``team_number`` and carries the ``valid`` and
    ``season`` values reported by ``scrape.get`` (``None`` when a key is
    absent), plus ``data``, which falls back to an empty dict when the
    scraper returned nothing for it.
    """
    scraped = scrape.get(team_number)

    response = {"team_number": team_number}
    # Copy the pass-through keys as-is; a missing key becomes None.
    for key in ("valid", "season"):
        response[key] = scraped.get(key)
    # Any falsy payload (missing, None, empty) is normalised to {}.
    response["data"] = scraped.get("data") or {}
    return response

11 changes: 11 additions & 0 deletions src/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
attrs==23.1.0
blinker==1.6.2
cachetools==5.3.0
certifi==2023.5.7
Expand All @@ -6,6 +7,7 @@ charset-normalizer==3.1.0
click==8.1.3
cryptography==41.0.0
Deprecated==1.2.13
exceptiongroup==1.1.2
firebase-rest-api==1.10.1
Flask==2.3.2
Flask-Login==0.6.2
Expand All @@ -20,12 +22,14 @@ google-resumable-media==2.5.0
googleapis-common-protos==1.59.0
grpcio==1.54.2
grpcio-status==1.54.2
h11==0.14.0
idna==3.4
importlib-metadata==6.6.0
itsdangerous==2.1.2
Jinja2==3.1.2
jwcrypto==1.4.2
MarkupSafe==2.1.2
outcome==1.2.0
Pillow==9.5.0
pkce==1.0.3
proto-plus==1.22.2
Expand All @@ -34,16 +38,23 @@ pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pypng==0.20220715.0
PySocks==1.7.1
python-dotenv==1.0.0
python-jwt==4.0.0
pytz==2023.3
qrcode==7.4.2
requests==2.31.0
rsa==4.9
selenium==4.10.0
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
trio==0.22.1
trio-websocket==0.10.3
typing_extensions==4.6.2
urllib3==1.26.15
Werkzeug==2.3.4
wrapt==1.15.0
wsproto==1.2.0
WTForms==3.0.1
zipp==3.15.0
90 changes: 90 additions & 0 deletions src/scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""
Web scraper for FTC team number data.
@author: Lucas Bubner, 2023
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from datetime import datetime

def get(team_number: int) -> dict:
    """
    Get information about a team number.

    Opens the FIRST team/event search page in a headless Firefox session,
    searching last year's season first and the current year's second, and
    parses the first result page that does not report
    "No Teams or Events Found".

    Returns:
        ``{"valid": False}`` when neither season yields a result. Otherwise
        a dict with ``"valid": True``, ``"season"`` formatted as
        ``"<year>-<year + 1>"`` and ``"data"``, a list holding one dict per
        matching team with the keys ``nickname``, ``orgs``, ``program``,
        ``location`` and ``rookie_year`` (int).

    Raises:
        IndexError / ValueError: if the result text is missing an expected
            field label or the rookie year is not an integer.
        Selenium exceptions propagate unchanged (e.g. when the results
        element is missing); the browser is still shut down in that case.
    """
    options = Options()
    # `options.headless = True` is deprecated since Selenium 4.8 and removed
    # in 4.13; passing the Firefox flag directly works on every 4.x release.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)

    try:
        this_year = datetime.now().year

        # Try last year's season first, then fall back to this year's.
        for offset in (-1, 0):
            driver.get(_link(str(team_number), offset))
            if "No Teams or Events Found" not in driver.page_source:
                season = this_year + offset
                break
        else:
            # Neither season returned a result: the team number is invalid.
            return {"valid": False}

        results_text = driver.find_element(By.ID, "dTeamEventResults").text
    finally:
        # Always shut the browser down; otherwise every call leaks a
        # Firefox/geckodriver process, including on the early return above.
        driver.quit()

    # Flatten line breaks and tabs so each field sits on one logical line.
    flattened = results_text.translate(str.maketrans("\n\t\r", "   "))

    # Each team's section starts with "Team Number"; the chunk before the
    # first occurrence is the header and carries no team data.
    team_chunks = flattened.split("Team Number")[1:]

    return {
        "valid": True,
        "season": f"{season}-{season + 1}",
        "data": [_parse_team(chunk) for chunk in team_chunks],
    }


def _parse_team(team: str) -> dict:
    """Extract the labelled fields from one team's flattened result text."""

    def between(start: str, end: str) -> str:
        # Text that follows `start` up to (not including) the next `end`.
        return team.split(start)[1].split(end)[0].strip()

    return {
        "nickname": between("Nickname: ", "Organization(s):"),
        "orgs": between("Organization(s): ", "Program:"),
        "program": between("Program: ", "Location:"),
        "location": between("Location: ", "Rookie Year:"),
        # Rookie year is the last field, so everything after its label counts.
        "rookie_year": int(team.split("Rookie Year: ")[1].strip()),
    }



def _link(team_number: str, offset: int = 0) -> str:
"""
FIRST team event search link.
"""
year = datetime.now().year + offset
return f"https://www.firstinspires.org/team-event-search#type=teams&sort=name&keyword={team_number}&programs=FLLJR,FLL,FTC,FRC&year={year}"

0 comments on commit cab0de4

Please sign in to comment.