From 21f23d52bd292ee8f184e90e3a675ace1667107b Mon Sep 17 00:00:00 2001 From: mikepal2 <40579649+mikepal2@users.noreply.github.com> Date: Thu, 30 May 2024 11:58:43 -0700 Subject: [PATCH 1/4] Add seasonal schedules support --- src/ferry_planner/schedule.py | 300 ++++++++++++++++++++++++++++------ 1 file changed, 249 insertions(+), 51 deletions(-) diff --git a/src/ferry_planner/schedule.py b/src/ferry_planner/schedule.py index ea58d10..695983a 100644 --- a/src/ferry_planner/schedule.py +++ b/src/ferry_planner/schedule.py @@ -1,8 +1,9 @@ # ruff: noqa: DTZ001, DTZ005, DTZ007 import asyncio +import itertools import os import time -from collections.abc import Iterator, Sequence +from collections.abc import Sequence from datetime import datetime, timedelta from pathlib import Path from threading import Thread @@ -24,9 +25,11 @@ class FerrySailing(BaseModel): """Duration in seconds.""" # TODO: price: float # noqa: FIX002 """Price in Canadian dollars (CAD).""" + notes: Sequence[str] | None + """Any notes/comments posted about this sailing""" def __hash__(self) -> int: - return hash((self.departure, self.arrival, self.duration)) + return hash((self.departure, self.arrival, self.duration, self.notes)) class FerrySchedule(BaseModel): @@ -35,6 +38,8 @@ class FerrySchedule(BaseModel): destination: LocationId sailings: tuple[FerrySailing, ...] url: str + notes: Sequence[str] | None + """Any notes/comments posted about this schedule""" class ScheduleGetter(Protocol): @@ -48,6 +53,17 @@ def __call__( ) -> FerrySchedule | None: ... +class HtmlParseResult: + redirect_url: str | None = None + sailings: Sequence[FerrySailing] | None = None + notes: list[str] | None = None + + def add_note(self, note: str) -> None: + if self.notes is None: + self.notes = [] + self.notes.append(note) + + class ScheduleDB: def __init__( # noqa: PLR0913 self, @@ -118,7 +134,7 @@ def put(self, schedule: FerrySchedule, /) -> None: dirpath = filepath.parent if not dirpath.exists(): dirpath.mkdir(mode=0o755, parents=True, exist_ok=True) - filepath.write_text(schedule.model_dump_json(indent=4), encoding="utf-8") + filepath.write_text(schedule.model_dump_json(indent=4, exclude_none=True), encoding="utf-8") def download_schedule( self, @@ -131,26 +147,45 @@ def download_schedule( url = self._get_download_url(origin_id, destination_id, date=date) route = f"{origin_id}-{destination_id}" print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}") - try: - response = httpx.get(url) - except httpx.ConnectTimeout as exc: - print( - f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n{exc!r}", + max_requests_count = 3 + requests_count = 0 + while requests_count < max_requests_count: + requests_count += 1 + try: + response = httpx.get(url, follow_redirects=True, timeout=30.0) + except httpx.HTTPError as exc: + print( + f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n" + f"{exc!r}\n" + f"{url}", + ) + return None + if not httpx.codes.is_success(response.status_code): + print( + f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}" + f" status {response.status_code}", + ) + return None + print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}") + result = parse_schedule_html(response, date) + if result.redirect_url: + url = result.redirect_url + continue + if result.sailings is None: + break + return FerrySchedule( + date=date, + origin=origin_id, + destination=destination_id, + sailings=tuple(result.sailings), + url=url, + notes=result.notes, ) - return None - if response.status_code in (404, 500): - print(f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}") - return None - doc = response.text.replace("\u2060", "") - print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}") - sailings = tuple(parse_schedule_html(html=doc, date=date)) - return FerrySchedule( - date=date, - origin=origin_id, - destination=destination_id, - sailings=sailings, - url=url, + print( + f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}" + f" - too many redirects", ) + return None async def download_schedule_async( self, @@ -164,26 +199,45 @@ async def download_schedule_async( url = self._get_download_url(origin_id, destination_id, date=date) route = f"{origin_id}-{destination_id}" print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}") - try: - response = await client.get(url) - except httpx.HTTPError as exc: - print( - f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n{exc!r}", + max_requests_count = 3 + requests_count = 0 + while requests_count < max_requests_count: + requests_count += 1 + try: + response = await client.get(url, follow_redirects=True, timeout=30.0) + except httpx.HTTPError as exc: + print( + f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n" + f"{exc!r}\n" + f"{url}", + ) + return None + if not httpx.codes.is_success(response.status_code): + print( + f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}" + f" status {response.status_code}", + ) + return None + print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}") + result = parse_schedule_html(response, date) + if result.redirect_url: + url = result.redirect_url + continue + if result.sailings is None: + break + return FerrySchedule( + date=date, + origin=origin_id, + destination=destination_id, + sailings=tuple(result.sailings), + url=url, + notes=result.notes, ) - return None - if response.status_code in (404, 500): - print(f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}") - return None - doc = response.text.replace("\u2060", "") - print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}") - sailings = tuple(parse_schedule_html(html=doc, date=date)) - return FerrySchedule( - date=date, - origin=origin_id, - destination=destination_id, - sailings=sailings, - url=url, + print( + f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}" + f" - too many redirects", ) + return None async def _download_and_save_schedule( self, @@ -218,8 +272,8 @@ async def refresh_cache(self) -> None: self._mem_cache = {} # download new schedules tasks = [] - timeout = httpx.Timeout(5.0, pool=None) - limits = httpx.Limits(max_connections=10) + timeout = httpx.Timeout(30.0, pool=None) + limits = httpx.Limits(max_connections=5) async with httpx.AsyncClient(timeout=timeout, limits=limits) as client: for connection in self.ferry_connections: for date in dates: @@ -254,30 +308,77 @@ def _refresh_task(self) -> None: time.sleep(self.refresh_interval) -def parse_schedule_html(*, html: str, date: datetime) -> Iterator[FerrySailing]: +def parse_schedule_html(response: httpx.Response, date: datetime) -> HtmlParseResult: + result = HtmlParseResult() + html = response.text.replace("\u2060", "") soup = BeautifulSoup(markup=html, features="html.parser") - table = soup.find("table", id="dailyScheduleTableOnward") - if not table or not isinstance(table, Tag) or not table.tbody: - return + table_tag = soup.find("table", id="dailyScheduleTableOnward") + daterange_tag = soup.find("div", id="dateRangeModal") # for seasonal + rows: Sequence[Tag] | None = None + if table_tag and isinstance(table_tag, Tag) and table_tag.tbody: + rows = table_tag.tbody.find_all("tr") + elif daterange_tag and isinstance(daterange_tag, Tag): + hrefs = [a["href"] for a in daterange_tag.find_all("a")] + index = get_seasonal_schedule_daterange_index(hrefs, date) + if index < 0: + pass # date is out of range + else: + url = response.url.scheme + "://" + response.url.host + hrefs[index] + if index == 0 or url == str(response.url): + rows = get_seasonal_schedule_rows(soup, date) + else: + result.redirect_url = url + return result + result.sailings = parse_sailings_from_html_rows(rows, date) + if result.sailings is None: + for note in [ + "Seasonal schedules have not been posted for these dates", + "Schedules for your selected date and route are currently unavailable", + ]: + if note in html: + result.add_note(note) + result.sailings = [] + return result + print(f"No sailings found at {response.url}") + return result + + +def parse_sailings_from_html_rows(rows: Sequence[Tag] | None, date: datetime) -> Sequence[FerrySailing] | None: + if rows is None: + return None sailing_row_min_td_count = 3 - for row in table.tbody.find_all("tr"): + sailings: Sequence[FerrySailing] = [] + notes = None + for row in rows: tds = row.find_all("td") - if len(tds) < sailing_row_min_td_count: + if ( + len(tds) < sailing_row_min_td_count + or "No sailings available" in tds[1].text + or "No passengers permitted" in tds[1].text + ): continue + td1 = tds[1].text.strip().split("\n", maxsplit=1) + if len(td1) > 1: + notes = parse_sailig_comment(td1[1]) + # assumiing dates are always in the first note + if is_schedule_excluded_on_date(notes[0], date): + continue + notes = [n for n in notes if n] departure = datetime.strptime( - row.find_all("td")[1].text, + td1[0].strip(), "%I:%M %p", ).replace(year=date.year, month=date.month, day=date.day) arrival = datetime.strptime( - row.find_all("td")[2].text, + row.find_all("td")[2].text.strip(), "%I:%M %p", ).replace(year=date.year, month=date.month, day=date.day) td3 = tds[3].text.strip() + td3format = "%Hh %Mm" if "h " in td3 and "m" in td3 else "%Mm" if "m" in td3 else "%Hh" duration = int( datetime_to_timedelta( datetime.strptime( td3, - "%Hh %Mm", + td3format, ), ).total_seconds(), ) @@ -285,5 +386,102 @@ def parse_schedule_html(*, html: str, date: datetime) -> Iterator[FerrySailing]: departure=departure, arrival=arrival, duration=duration, + notes=notes, ) - yield sailing + sailings.append(sailing) + return sailings + + +def parse_sailig_comment(comment: str) -> list[str]: + notes: list[str] = [] + comment = comment.strip() + notes.append(comment) + pos = comment.find("Note:") + if pos > 0: + notes.append(comment[pos:]) + comment = comment[:pos].strip() + if comment.startswith("Last "): + notes.append(comment) + comment = "" + notes[0] = comment # replace original with truncated + return notes + + +def get_seasonal_schedule_rows(soup: BeautifulSoup, date: datetime) -> Sequence[Tag] | None: + rows: Sequence[Tag] = [] + form = soup.find("form", id="seasonalSchedulesForm") + if not isinstance(form, Tag): + return None + weekday_names = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday") + weekday = weekday_names[date.weekday()] + for thead in form.find_all("thead"): + if thead.text.lower().strip().startswith(weekday): + rows = [x for x in itertools.takewhile(lambda t: t.name != "thead", thead.next_siblings) if x.name == "tr"] + break + return rows + + +def get_seasonal_schedule_daterange_index(hrefs: Sequence[str], date: datetime) -> int: + for i in range(len(hrefs)): + dates = get_seasonal_schedule_daterange_from_url(hrefs[i]) + if dates and date.date() >= dates[0].date() and date.date() <= dates[1].date(): + return i + return -1 + + +def get_seasonal_schedule_daterange_from_url(href: str) -> tuple[datetime, datetime] | None: + dates = href.replace("=", "-").replace("_", "-").split("-")[-2:] + expected_dates_count = 2 + if (len(dates)) != expected_dates_count: + return None + date_from = datetime.strptime(dates[0], "%Y%m%d") + date_to = datetime.strptime(dates[1], "%Y%m%d") + return (date_from, date_to) + + +def is_schedule_excluded_on_date(schedule_comment: str, date: datetime) -> bool: + if not schedule_comment: + return False + schedule_comment = schedule_comment.strip() + if schedule_comment.upper().startswith("ONLY"): + return not match_specific_schedule_date(schedule_comment, date) + if schedule_comment.upper().startswith(("EXCEPT", "NOT AVAILABLE")): + return match_specific_schedule_date(schedule_comment, date) + print("Unknown comment: " + schedule_comment) + return False + + +def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool: + months: Sequence[str] = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"] + month: int | None = None + schedule_dates = schedule_dates.upper() + for c in [".", "&", " ON ", " ON:"]: + schedule_dates = schedule_dates.replace(c, ",") + tokens = [x.strip() for x in schedule_dates.split(",")] + tokens = [x for x in tokens if x and x not in ["ONLY", "EXCEPT", "NOT AVAILABLE"]] + for token in tokens: + if token in months: + month = months.index(token) + 1 + continue + _date: datetime + if token.isnumeric(): + if not month: + print(f"Failed to parse schedule dates: No month for {token} in '{schedule_dates}") + return False + _date = datetime(year=date.year, month=month, day=int(token)) + else: + dt = token.split(" ") + expected_tokens_count = 2 + if len(dt) == expected_tokens_count and dt[0].isnumeric() and dt[1] in months: + # 01 JAN, 02 JAN, 05 FEB, 06 FEB + _date = datetime(year=date.year, month=months.index(dt[1]) + 1, day=int(dt[0])) + elif len(dt) == expected_tokens_count and dt[1].isnumeric() and dt[0] in months: + # Jan 1, 2, Feb 5 & 6 + month = months.index(dt[0]) + 1 + _date = datetime(year=date.year, month=month, day=int(dt[1])) + else: + print(f"Failed to parse schedule dates: Unknown word '{token}' in '{schedule_dates}") + break + if date.month == _date.month and date.day == _date.day: + return True + return False From 36d82036efa006d4892b52ba5bff7f098eca6cce Mon Sep 17 00:00:00 2001 From: mikepal2 <40579649+mikepal2@users.noreply.github.com> Date: Thu, 30 May 2024 12:29:26 -0700 Subject: [PATCH 2/4] replace range(len(x)) with enumerate() --- src/ferry_planner/schedule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ferry_planner/schedule.py b/src/ferry_planner/schedule.py index 695983a..07bcb7c 100644 --- a/src/ferry_planner/schedule.py +++ b/src/ferry_planner/schedule.py @@ -422,8 +422,8 @@ def get_seasonal_schedule_rows(soup: BeautifulSoup, date: datetime) -> Sequence[ def get_seasonal_schedule_daterange_index(hrefs: Sequence[str], date: datetime) -> int: - for i in range(len(hrefs)): - dates = get_seasonal_schedule_daterange_from_url(hrefs[i]) + for i, href in enumerate(hrefs): + dates = get_seasonal_schedule_daterange_from_url(href) if dates and date.date() >= dates[0].date() and date.date() <= dates[1].date(): return i return -1 From 9c22a30a02674fa54f2f23e8621e62910c22ef5e Mon Sep 17 00:00:00 2001 From: mikepal2 <40579649+mikepal2@users.noreply.github.com> Date: Sun, 2 Jun 2024 18:25:45 -0700 Subject: [PATCH 3/4] use exceptions instead of None use single async download method --- src/ferry_planner/schedule.py | 264 ++++++++++++++++------------------ 1 file changed, 121 insertions(+), 143 deletions(-) diff --git a/src/ferry_planner/schedule.py b/src/ferry_planner/schedule.py index 07bcb7c..e20329b 100644 --- a/src/ferry_planner/schedule.py +++ b/src/ferry_planner/schedule.py @@ -3,7 +3,7 @@ import itertools import os import time -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from datetime import datetime, timedelta from pathlib import Path from threading import Thread @@ -17,6 +17,13 @@ from ferry_planner.location import LocationId from ferry_planner.utils import datetime_to_timedelta +MONTHS = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"] +WEEKDAY_NAMES = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday") +NO_SAILINGS_MESSAGES = [ + "Seasonal schedules have not been posted for these dates", + "Schedules for your selected date and route are currently unavailable", +] + class FerrySailing(BaseModel): departure: datetime @@ -25,8 +32,8 @@ class FerrySailing(BaseModel): """Duration in seconds.""" # TODO: price: float # noqa: FIX002 """Price in Canadian dollars (CAD).""" - notes: Sequence[str] | None - """Any notes/comments posted about this sailing""" + notes: tuple[str, ...] = () + """Notes or comments posted about this sailing.""" def __hash__(self) -> int: return hash((self.departure, self.arrival, self.duration, self.notes)) @@ -38,7 +45,7 @@ class FerrySchedule(BaseModel): destination: LocationId sailings: tuple[FerrySailing, ...] url: str - notes: Sequence[str] | None + notes: tuple[str, ...] """Any notes/comments posted about this schedule""" @@ -54,14 +61,29 @@ def __call__( class HtmlParseResult: - redirect_url: str | None = None - sailings: Sequence[FerrySailing] | None = None - notes: list[str] | None = None + redirect_url: str = "" + sailings: tuple[FerrySailing, ...] = () + notes: tuple[str, ...] = () + """Any notes/comments/errors posted about this schedule""" + + @classmethod + def redirect(cls, redirect_url: str) -> "HtmlParseResult": + result = HtmlParseResult() + result.redirect_url = redirect_url + return result + + @classmethod + def from_sailings(cls, sailings: Sequence[FerrySailing], notes: Sequence[str]) -> "HtmlParseResult": + result = HtmlParseResult() + result.sailings = tuple(sailings) + result.notes = tuple(notes) + return result + - def add_note(self, note: str) -> None: - if self.notes is None: - self.notes = [] - self.notes.append(note) +class DownloadScheduleError(Exception): + def __init__(self, url: str, msg: str, *args: Iterable) -> None: + self.url = url + super().__init__(f"Error downloading {url}: {msg}", *args) class ScheduleDB: @@ -82,6 +104,9 @@ def __init__( # noqa: PLR0913 self._refresh_thread = Thread(target=self._refresh_task, daemon=True) self._mem_cache = {} self.cache_dir.mkdir(mode=0o755, parents=True, exist_ok=True) + timeout = httpx.Timeout(30.0, pool=None) + limits = httpx.Limits(max_connections=5) + self._client = httpx.AsyncClient(timeout=timeout, limits=limits, follow_redirects=True) def _get_download_url( self, @@ -144,48 +169,12 @@ def download_schedule( *, date: datetime, ) -> FerrySchedule | None: - url = self._get_download_url(origin_id, destination_id, date=date) - route = f"{origin_id}-{destination_id}" - print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}") - max_requests_count = 3 - requests_count = 0 - while requests_count < max_requests_count: - requests_count += 1 - try: - response = httpx.get(url, follow_redirects=True, timeout=30.0) - except httpx.HTTPError as exc: - print( - f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n" - f"{exc!r}\n" - f"{url}", - ) - return None - if not httpx.codes.is_success(response.status_code): - print( - f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}" - f" status {response.status_code}", - ) - return None - print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}") - result = parse_schedule_html(response, date) - if result.redirect_url: - url = result.redirect_url - continue - if result.sailings is None: - break - return FerrySchedule( - date=date, - origin=origin_id, - destination=destination_id, - sailings=tuple(result.sailings), - url=url, - notes=result.notes, - ) - print( - f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}" - f" - too many redirects", - ) - return None + coro = self.download_schedule_async(origin_id, destination_id, date=date) + try: + loop = asyncio.get_running_loop() + return loop.run_until_complete(coro) + except RuntimeError: + return asyncio.run(coro) async def download_schedule_async( self, @@ -194,37 +183,46 @@ async def download_schedule_async( /, *, date: datetime, - client: httpx.AsyncClient, ) -> FerrySchedule | None: + try: + return await self._download_schedule_async(origin_id, destination_id, date=date) + except (DownloadScheduleError, httpx.HTTPError) as exc: + url = exc.request.url if isinstance(exc, httpx.HTTPError) else exc.url + print( + f"[{self.__class__.__name__}:ERROR] failed to download schedule: " + f"{origin_id}-{destination_id}:{date.date()}\n" + f"\t{exc!r}\n" + f"\tUrl: {url}", + ) + return None + + async def _download_schedule_async( + self, + origin_id: LocationId, + destination_id: LocationId, + /, + *, + date: datetime, + ) -> FerrySchedule: url = self._get_download_url(origin_id, destination_id, date=date) route = f"{origin_id}-{destination_id}" print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}") - max_requests_count = 3 - requests_count = 0 - while requests_count < max_requests_count: - requests_count += 1 - try: - response = await client.get(url, follow_redirects=True, timeout=30.0) - except httpx.HTTPError as exc: - print( - f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n" - f"{exc!r}\n" - f"{url}", - ) - return None + max_redirects_count = 3 + redirects = [] + while True: + response = await self._client.get(url) if not httpx.codes.is_success(response.status_code): - print( - f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}" - f" status {response.status_code}", - ) - return None + raise DownloadScheduleError(url, f"Status {response.status_code}") print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}") result = parse_schedule_html(response, date) if result.redirect_url: + if len(redirects) > max_redirects_count: + raise DownloadScheduleError(url, "Too many redirects") + if url in redirects: + raise DownloadScheduleError(url, "Redirects loop") url = result.redirect_url + redirects.append(url) continue - if result.sailings is None: - break return FerrySchedule( date=date, origin=origin_id, @@ -233,11 +231,6 @@ async def download_schedule_async( url=url, notes=result.notes, ) - print( - f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}" - f" - too many redirects", - ) - return None async def _download_and_save_schedule( self, @@ -246,13 +239,11 @@ async def _download_and_save_schedule( /, *, date: datetime, - client: httpx.AsyncClient, ) -> bool: schedule = await self.download_schedule_async( origin_id, destination_id, date=date, - client=client, ) if schedule is not None: self.put(schedule) @@ -272,28 +263,24 @@ async def refresh_cache(self) -> None: self._mem_cache = {} # download new schedules tasks = [] - timeout = httpx.Timeout(30.0, pool=None) - limits = httpx.Limits(max_connections=5) - async with httpx.AsyncClient(timeout=timeout, limits=limits) as client: - for connection in self.ferry_connections: - for date in dates: - filepath = self._get_filepath( - connection.origin.id, - connection.destination.id, - date=date, - ) - if not filepath.exists(): - tasks.append( - asyncio.create_task( - self._download_and_save_schedule( - connection.origin.id, - connection.destination.id, - date=date, - client=client, - ), + for connection in self.ferry_connections: + for date in dates: + filepath = self._get_filepath( + connection.origin.id, + connection.destination.id, + date=date, + ) + if not filepath.exists(): + tasks.append( + asyncio.create_task( + self._download_and_save_schedule( + connection.origin.id, + connection.destination.id, + date=date, ), - ) - downloaded_schedules = sum(await asyncio.gather(*tasks)) + ), + ) + downloaded_schedules = sum(await asyncio.gather(*tasks)) print( f"[{self.__class__.__name__}:INFO] finished refreshing cache, " f"downloaded {downloaded_schedules} schedules", @@ -309,47 +296,40 @@ def _refresh_task(self) -> None: def parse_schedule_html(response: httpx.Response, date: datetime) -> HtmlParseResult: - result = HtmlParseResult() html = response.text.replace("\u2060", "") soup = BeautifulSoup(markup=html, features="html.parser") table_tag = soup.find("table", id="dailyScheduleTableOnward") daterange_tag = soup.find("div", id="dateRangeModal") # for seasonal - rows: Sequence[Tag] | None = None + rows: Sequence[Tag] = [] if table_tag and isinstance(table_tag, Tag) and table_tag.tbody: rows = table_tag.tbody.find_all("tr") elif daterange_tag and isinstance(daterange_tag, Tag): hrefs = [a["href"] for a in daterange_tag.find_all("a")] index = get_seasonal_schedule_daterange_index(hrefs, date) if index < 0: - pass # date is out of range - else: - url = response.url.scheme + "://" + response.url.host + hrefs[index] - if index == 0 or url == str(response.url): - rows = get_seasonal_schedule_rows(soup, date) - else: - result.redirect_url = url - return result - result.sailings = parse_sailings_from_html_rows(rows, date) - if result.sailings is None: - for note in [ - "Seasonal schedules have not been posted for these dates", - "Schedules for your selected date and route are currently unavailable", - ]: - if note in html: - result.add_note(note) - result.sailings = [] - return result - print(f"No sailings found at {response.url}") - return result - - -def parse_sailings_from_html_rows(rows: Sequence[Tag] | None, date: datetime) -> Sequence[FerrySailing] | None: - if rows is None: - return None + raise DownloadScheduleError(str(response.url), f"Date {date} is out of seasonal schedules range") + url = response.url.scheme + "://" + response.url.host + hrefs[index] + if index > 0 and url != str(response.url): + return HtmlParseResult.redirect(url) + rows = get_seasonal_schedule_rows(str(response.url), soup, date) + sailings = parse_sailings_from_html_rows(rows, date) + notes = [] + if not sailings: + err = "No sailings found" + for msg in NO_SAILINGS_MESSAGES: + if msg in html: + err = msg + break + notes.append(err) + print(f"{err} at {response.url}") + return HtmlParseResult.from_sailings(sailings, notes) + + +def parse_sailings_from_html_rows(rows: Sequence[Tag], date: datetime) -> Sequence[FerrySailing]: sailing_row_min_td_count = 3 - sailings: Sequence[FerrySailing] = [] - notes = None + sailings = [] for row in rows: + notes = [] tds = row.find_all("td") if ( len(tds) < sailing_row_min_td_count @@ -386,7 +366,7 @@ def parse_sailings_from_html_rows(rows: Sequence[Tag] | None, date: datetime) -> departure=departure, arrival=arrival, duration=duration, - notes=notes, + notes=tuple(notes), ) sailings.append(sailing) return sailings @@ -407,13 +387,12 @@ def parse_sailig_comment(comment: str) -> list[str]: return notes -def get_seasonal_schedule_rows(soup: BeautifulSoup, date: datetime) -> Sequence[Tag] | None: +def get_seasonal_schedule_rows(url: str, soup: BeautifulSoup, date: datetime) -> Sequence[Tag]: rows: Sequence[Tag] = [] form = soup.find("form", id="seasonalSchedulesForm") if not isinstance(form, Tag): - return None - weekday_names = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday") - weekday = weekday_names[date.weekday()] + raise DownloadScheduleError(url, "seasonalSchedulesForm not found") + weekday = WEEKDAY_NAMES[date.weekday()] for thead in form.find_all("thead"): if thead.text.lower().strip().startswith(weekday): rows = [x for x in itertools.takewhile(lambda t: t.name != "thead", thead.next_siblings) if x.name == "tr"] @@ -452,7 +431,6 @@ def is_schedule_excluded_on_date(schedule_comment: str, date: datetime) -> bool: def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool: - months: Sequence[str] = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"] month: int | None = None schedule_dates = schedule_dates.upper() for c in [".", "&", " ON ", " ON:"]: @@ -460,8 +438,8 @@ def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool: tokens = [x.strip() for x in schedule_dates.split(",")] tokens = [x for x in tokens if x and x not in ["ONLY", "EXCEPT", "NOT AVAILABLE"]] for token in tokens: - if token in months: - month = months.index(token) + 1 + if token in MONTHS: + month = MONTHS.index(token) + 1 continue _date: datetime if token.isnumeric(): @@ -472,12 +450,12 @@ def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool: else: dt = token.split(" ") expected_tokens_count = 2 - if len(dt) == expected_tokens_count and dt[0].isnumeric() and dt[1] in months: + if len(dt) == expected_tokens_count and dt[0].isnumeric() and dt[1] in MONTHS: # 01 JAN, 02 JAN, 05 FEB, 06 FEB - _date = datetime(year=date.year, month=months.index(dt[1]) + 1, day=int(dt[0])) - elif len(dt) == expected_tokens_count and dt[1].isnumeric() and dt[0] in months: + _date = datetime(year=date.year, month=MONTHS.index(dt[1]) + 1, day=int(dt[0])) + elif len(dt) == expected_tokens_count and dt[1].isnumeric() and dt[0] in MONTHS: # Jan 1, 2, Feb 5 & 6 - month = months.index(dt[0]) + 1 + month = MONTHS.index(dt[0]) + 1 _date = datetime(year=date.year, month=month, day=int(dt[1])) else: print(f"Failed to parse schedule dates: Unknown word '{token}' in '{schedule_dates}") From 870582ac9ae841908278cfe39f03186431311409 Mon Sep 17 00:00:00 2001 From: mikepal2 <40579649+mikepal2@users.noreply.github.com> Date: Sun, 2 Jun 2024 20:57:52 -0700 Subject: [PATCH 4/4] Apply review suggestions --- src/ferry_planner/schedule.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ferry_planner/schedule.py b/src/ferry_planner/schedule.py index e20329b..e288dba 100644 --- a/src/ferry_planner/schedule.py +++ b/src/ferry_planner/schedule.py @@ -45,8 +45,8 @@ class FerrySchedule(BaseModel): destination: LocationId sailings: tuple[FerrySailing, ...] url: str - notes: tuple[str, ...] - """Any notes/comments posted about this schedule""" + notes: tuple[str, ...] = () + """Notes or comments posted about this schedule.""" class ScheduleGetter(Protocol): @@ -64,7 +64,7 @@ class HtmlParseResult: redirect_url: str = "" sailings: tuple[FerrySailing, ...] = () notes: tuple[str, ...] = () - """Any notes/comments/errors posted about this schedule""" + """Notes or comments posted about this schedule.""" @classmethod def redirect(cls, redirect_url: str) -> "HtmlParseResult": @@ -339,7 +339,7 @@ def parse_sailings_from_html_rows(rows: Sequence[Tag], date: datetime) -> Sequen continue td1 = tds[1].text.strip().split("\n", maxsplit=1) if len(td1) > 1: - notes = parse_sailig_comment(td1[1]) + notes = parse_sailing_comment(td1[1]) # assumiing dates are always in the first note if is_schedule_excluded_on_date(notes[0], date): continue @@ -372,7 +372,7 @@ def parse_sailings_from_html_rows(rows: Sequence[Tag], date: datetime) -> Sequen return sailings -def parse_sailig_comment(comment: str) -> list[str]: +def parse_sailing_comment(comment: str) -> list[str]: notes: list[str] = [] comment = comment.strip() notes.append(comment)