From 21f23d52bd292ee8f184e90e3a675ace1667107b Mon Sep 17 00:00:00 2001
From: mikepal2 <40579649+mikepal2@users.noreply.github.com>
Date: Thu, 30 May 2024 11:58:43 -0700
Subject: [PATCH 1/4] Add seasonal schedules support

---
 src/ferry_planner/schedule.py | 300 ++++++++++++++++++++++++++++------
 1 file changed, 249 insertions(+), 51 deletions(-)

diff --git a/src/ferry_planner/schedule.py b/src/ferry_planner/schedule.py
index ea58d10..695983a 100644
--- a/src/ferry_planner/schedule.py
+++ b/src/ferry_planner/schedule.py
@@ -1,8 +1,9 @@
 # ruff: noqa: DTZ001, DTZ005, DTZ007
 import asyncio
+import itertools
 import os
 import time
-from collections.abc import Iterator, Sequence
+from collections.abc import Sequence
 from datetime import datetime, timedelta
 from pathlib import Path
 from threading import Thread
@@ -24,9 +25,11 @@ class FerrySailing(BaseModel):
     """Duration in seconds."""
     # TODO: price: float  # noqa: FIX002
     """Price in Canadian dollars (CAD)."""
+    notes: Sequence[str] | None
+    """Any notes/comments posted about this sailing"""
 
     def __hash__(self) -> int:
-        return hash((self.departure, self.arrival, self.duration))
+        return hash((self.departure, self.arrival, self.duration, self.notes))
 
 
 class FerrySchedule(BaseModel):
@@ -35,6 +38,8 @@ class FerrySchedule(BaseModel):
     destination: LocationId
     sailings: tuple[FerrySailing, ...]
     url: str
+    notes: Sequence[str] | None
+    """Any notes/comments posted about this schedule"""
 
 
 class ScheduleGetter(Protocol):
@@ -48,6 +53,17 @@ def __call__(
     ) -> FerrySchedule | None: ...
 
 
+class HtmlParseResult:
+    redirect_url: str | None = None
+    sailings: Sequence[FerrySailing] | None = None
+    notes: list[str] | None = None
+
+    def add_note(self, note: str) -> None:
+        if self.notes is None:
+            self.notes = []
+        self.notes.append(note)
+
+
 class ScheduleDB:
     def __init__(  # noqa: PLR0913
         self,
@@ -118,7 +134,7 @@ def put(self, schedule: FerrySchedule, /) -> None:
         dirpath = filepath.parent
         if not dirpath.exists():
             dirpath.mkdir(mode=0o755, parents=True, exist_ok=True)
-        filepath.write_text(schedule.model_dump_json(indent=4), encoding="utf-8")
+        filepath.write_text(schedule.model_dump_json(indent=4, exclude_none=True), encoding="utf-8")
 
     def download_schedule(
         self,
@@ -131,26 +147,45 @@ def download_schedule(
         url = self._get_download_url(origin_id, destination_id, date=date)
         route = f"{origin_id}-{destination_id}"
         print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}")
-        try:
-            response = httpx.get(url)
-        except httpx.ConnectTimeout as exc:
-            print(
-                f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n{exc!r}",
+        max_requests_count = 3
+        requests_count = 0
+        while requests_count < max_requests_count:
+            requests_count += 1
+            try:
+                response = httpx.get(url, follow_redirects=True, timeout=30.0)
+            except httpx.HTTPError as exc:
+                print(
+                    f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n"
+                    f"{exc!r}\n"
+                    f"{url}",
+                )
+                return None
+            if not httpx.codes.is_success(response.status_code):
+                print(
+                    f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}"
+                    f" status {response.status_code}",
+                )
+                return None
+            print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}")
+            result = parse_schedule_html(response, date)
+            if result.redirect_url:
+                url = result.redirect_url
+                continue
+            if result.sailings is None:
+                break
+            return FerrySchedule(
+                date=date,
+                origin=origin_id,
+                destination=destination_id,
+                sailings=tuple(result.sailings),
+                url=url,
+                notes=result.notes,
             )
-            return None
-        if response.status_code in (404, 500):
-            print(f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}")
-            return None
-        doc = response.text.replace("\u2060", "")
-        print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}")
-        sailings = tuple(parse_schedule_html(html=doc, date=date))
-        return FerrySchedule(
-            date=date,
-            origin=origin_id,
-            destination=destination_id,
-            sailings=sailings,
-            url=url,
+        print(
+            f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}"
+            f" - too many redirects",
         )
+        return None
 
     async def download_schedule_async(
         self,
@@ -164,26 +199,45 @@ async def download_schedule_async(
         url = self._get_download_url(origin_id, destination_id, date=date)
         route = f"{origin_id}-{destination_id}"
         print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}")
-        try:
-            response = await client.get(url)
-        except httpx.HTTPError as exc:
-            print(
-                f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n{exc!r}",
+        max_requests_count = 3
+        requests_count = 0
+        while requests_count < max_requests_count:
+            requests_count += 1
+            try:
+                response = await client.get(url, follow_redirects=True, timeout=30.0)
+            except httpx.HTTPError as exc:
+                print(
+                    f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n"
+                    f"{exc!r}\n"
+                    f"{url}",
+                )
+                return None
+            if not httpx.codes.is_success(response.status_code):
+                print(
+                    f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}"
+                    f" status {response.status_code}",
+                )
+                return None
+            print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}")
+            result = parse_schedule_html(response, date)
+            if result.redirect_url:
+                url = result.redirect_url
+                continue
+            if result.sailings is None:
+                break
+            return FerrySchedule(
+                date=date,
+                origin=origin_id,
+                destination=destination_id,
+                sailings=tuple(result.sailings),
+                url=url,
+                notes=result.notes,
             )
-            return None
-        if response.status_code in (404, 500):
-            print(f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}")
-            return None
-        doc = response.text.replace("\u2060", "")
-        print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}")
-        sailings = tuple(parse_schedule_html(html=doc, date=date))
-        return FerrySchedule(
-            date=date,
-            origin=origin_id,
-            destination=destination_id,
-            sailings=sailings,
-            url=url,
+        print(
+            f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}"
+            f" - too many redirects",
         )
+        return None
 
     async def _download_and_save_schedule(
         self,
@@ -218,8 +272,8 @@ async def refresh_cache(self) -> None:
         self._mem_cache = {}
         # download new schedules
         tasks = []
-        timeout = httpx.Timeout(5.0, pool=None)
-        limits = httpx.Limits(max_connections=10)
+        timeout = httpx.Timeout(30.0, pool=None)
+        limits = httpx.Limits(max_connections=5)
         async with httpx.AsyncClient(timeout=timeout, limits=limits) as client:
             for connection in self.ferry_connections:
                 for date in dates:
@@ -254,30 +308,77 @@ def _refresh_task(self) -> None:
             time.sleep(self.refresh_interval)
 
 
-def parse_schedule_html(*, html: str, date: datetime) -> Iterator[FerrySailing]:
+def parse_schedule_html(response: httpx.Response, date: datetime) -> HtmlParseResult:
+    result = HtmlParseResult()
+    html = response.text.replace("\u2060", "")
     soup = BeautifulSoup(markup=html, features="html.parser")
-    table = soup.find("table", id="dailyScheduleTableOnward")
-    if not table or not isinstance(table, Tag) or not table.tbody:
-        return
+    table_tag = soup.find("table", id="dailyScheduleTableOnward")
+    daterange_tag = soup.find("div", id="dateRangeModal")  # for seasonal
+    rows: Sequence[Tag] | None = None
+    if table_tag and isinstance(table_tag, Tag) and table_tag.tbody:
+        rows = table_tag.tbody.find_all("tr")
+    elif daterange_tag and isinstance(daterange_tag, Tag):
+        hrefs = [a["href"] for a in daterange_tag.find_all("a")]
+        index = get_seasonal_schedule_daterange_index(hrefs, date)
+        if index < 0:
+            pass  # date is out of range
+        else:
+            url = response.url.scheme + "://" + response.url.host + hrefs[index]
+            if index == 0 or url == str(response.url):
+                rows = get_seasonal_schedule_rows(soup, date)
+            else:
+                result.redirect_url = url
+                return result
+    result.sailings = parse_sailings_from_html_rows(rows, date)
+    if result.sailings is None:
+        for note in [
+            "Seasonal schedules have not been posted for these dates",
+            "Schedules for your selected date and route are currently unavailable",
+        ]:
+            if note in html:
+                result.add_note(note)
+                result.sailings = []
+                return result
+        print(f"No sailings found at {response.url}")
+    return result
+
+
+def parse_sailings_from_html_rows(rows: Sequence[Tag] | None, date: datetime) -> Sequence[FerrySailing] | None:
+    if rows is None:
+        return None
     sailing_row_min_td_count = 3
-    for row in table.tbody.find_all("tr"):
+    sailings: Sequence[FerrySailing] = []
+    notes = None
+    for row in rows:
         tds = row.find_all("td")
-        if len(tds) < sailing_row_min_td_count:
+        if (
+            len(tds) < sailing_row_min_td_count
+            or "No sailings available" in tds[1].text
+            or "No passengers permitted" in tds[1].text
+        ):
             continue
+        td1 = tds[1].text.strip().split("\n", maxsplit=1)
+        if len(td1) > 1:
+            notes = parse_sailig_comment(td1[1])
+            # assumiing dates are always in the first note
+            if is_schedule_excluded_on_date(notes[0], date):
+                continue
+            notes = [n for n in notes if n]
         departure = datetime.strptime(
-            row.find_all("td")[1].text,
+            td1[0].strip(),
             "%I:%M %p",
         ).replace(year=date.year, month=date.month, day=date.day)
         arrival = datetime.strptime(
-            row.find_all("td")[2].text,
+            row.find_all("td")[2].text.strip(),
             "%I:%M %p",
         ).replace(year=date.year, month=date.month, day=date.day)
         td3 = tds[3].text.strip()
+        td3format = "%Hh %Mm" if "h " in td3 and "m" in td3 else "%Mm" if "m" in td3 else "%Hh"
         duration = int(
             datetime_to_timedelta(
                 datetime.strptime(
                     td3,
-                    "%Hh %Mm",
+                    td3format,
                 ),
             ).total_seconds(),
         )
@@ -285,5 +386,102 @@ def parse_schedule_html(*, html: str, date: datetime) -> Iterator[FerrySailing]:
             departure=departure,
             arrival=arrival,
             duration=duration,
+            notes=notes,
         )
-        yield sailing
+        sailings.append(sailing)
+    return sailings
+
+
+def parse_sailig_comment(comment: str) -> list[str]:
+    notes: list[str] = []
+    comment = comment.strip()
+    notes.append(comment)
+    pos = comment.find("Note:")
+    if pos > 0:
+        notes.append(comment[pos:])
+        comment = comment[:pos].strip()
+    if comment.startswith("Last "):
+        notes.append(comment)
+        comment = ""
+    notes[0] = comment  # replace original with truncated
+    return notes
+
+
+def get_seasonal_schedule_rows(soup: BeautifulSoup, date: datetime) -> Sequence[Tag] | None:
+    rows: Sequence[Tag] = []
+    form = soup.find("form", id="seasonalSchedulesForm")
+    if not isinstance(form, Tag):
+        return None
+    weekday_names = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
+    weekday = weekday_names[date.weekday()]
+    for thead in form.find_all("thead"):
+        if thead.text.lower().strip().startswith(weekday):
+            rows = [x for x in itertools.takewhile(lambda t: t.name != "thead", thead.next_siblings) if x.name == "tr"]
+            break
+    return rows
+
+
+def get_seasonal_schedule_daterange_index(hrefs: Sequence[str], date: datetime) -> int:
+    for i in range(len(hrefs)):
+        dates = get_seasonal_schedule_daterange_from_url(hrefs[i])
+        if dates and date.date() >= dates[0].date() and date.date() <= dates[1].date():
+            return i
+    return -1
+
+
+def get_seasonal_schedule_daterange_from_url(href: str) -> tuple[datetime, datetime] | None:
+    dates = href.replace("=", "-").replace("_", "-").split("-")[-2:]
+    expected_dates_count = 2
+    if (len(dates)) != expected_dates_count:
+        return None
+    date_from = datetime.strptime(dates[0], "%Y%m%d")
+    date_to = datetime.strptime(dates[1], "%Y%m%d")
+    return (date_from, date_to)
+
+
+def is_schedule_excluded_on_date(schedule_comment: str, date: datetime) -> bool:
+    if not schedule_comment:
+        return False
+    schedule_comment = schedule_comment.strip()
+    if schedule_comment.upper().startswith("ONLY"):
+        return not match_specific_schedule_date(schedule_comment, date)
+    if schedule_comment.upper().startswith(("EXCEPT", "NOT AVAILABLE")):
+        return match_specific_schedule_date(schedule_comment, date)
+    print("Unknown comment: " + schedule_comment)
+    return False
+
+
+def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool:
+    months: Sequence[str] = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
+    month: int | None = None
+    schedule_dates = schedule_dates.upper()
+    for c in [".", "&", " ON ", " ON:"]:
+        schedule_dates = schedule_dates.replace(c, ",")
+    tokens = [x.strip() for x in schedule_dates.split(",")]
+    tokens = [x for x in tokens if x and x not in ["ONLY", "EXCEPT", "NOT AVAILABLE"]]
+    for token in tokens:
+        if token in months:
+            month = months.index(token) + 1
+            continue
+        _date: datetime
+        if token.isnumeric():
+            if not month:
+                print(f"Failed to parse schedule dates: No month for {token} in '{schedule_dates}")
+                return False
+            _date = datetime(year=date.year, month=month, day=int(token))
+        else:
+            dt = token.split(" ")
+            expected_tokens_count = 2
+            if len(dt) == expected_tokens_count and dt[0].isnumeric() and dt[1] in months:
+                # 01 JAN, 02 JAN, 05 FEB, 06 FEB
+                _date = datetime(year=date.year, month=months.index(dt[1]) + 1, day=int(dt[0]))
+            elif len(dt) == expected_tokens_count and dt[1].isnumeric() and dt[0] in months:
+                # Jan 1, 2, Feb 5 & 6
+                month = months.index(dt[0]) + 1
+                _date = datetime(year=date.year, month=month, day=int(dt[1]))
+            else:
+                print(f"Failed to parse schedule dates: Unknown word '{token}' in '{schedule_dates}")
+                break
+        if date.month == _date.month and date.day == _date.day:
+            return True
+    return False

From 36d82036efa006d4892b52ba5bff7f098eca6cce Mon Sep 17 00:00:00 2001
From: mikepal2 <40579649+mikepal2@users.noreply.github.com>
Date: Thu, 30 May 2024 12:29:26 -0700
Subject: [PATCH 2/4] replace range(len(x)) with enumerate()

---
 src/ferry_planner/schedule.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ferry_planner/schedule.py b/src/ferry_planner/schedule.py
index 695983a..07bcb7c 100644
--- a/src/ferry_planner/schedule.py
+++ b/src/ferry_planner/schedule.py
@@ -422,8 +422,8 @@ def get_seasonal_schedule_rows(soup: BeautifulSoup, date: datetime) -> Sequence[
 
 
 def get_seasonal_schedule_daterange_index(hrefs: Sequence[str], date: datetime) -> int:
-    for i in range(len(hrefs)):
-        dates = get_seasonal_schedule_daterange_from_url(hrefs[i])
+    for i, href in enumerate(hrefs):
+        dates = get_seasonal_schedule_daterange_from_url(href)
         if dates and date.date() >= dates[0].date() and date.date() <= dates[1].date():
             return i
     return -1

From 9c22a30a02674fa54f2f23e8621e62910c22ef5e Mon Sep 17 00:00:00 2001
From: mikepal2 <40579649+mikepal2@users.noreply.github.com>
Date: Sun, 2 Jun 2024 18:25:45 -0700
Subject: [PATCH 3/4] use exceptions instead of None; use single async download
 method

---
 src/ferry_planner/schedule.py | 264 ++++++++++++++++------------------
 1 file changed, 121 insertions(+), 143 deletions(-)

diff --git a/src/ferry_planner/schedule.py b/src/ferry_planner/schedule.py
index 07bcb7c..e20329b 100644
--- a/src/ferry_planner/schedule.py
+++ b/src/ferry_planner/schedule.py
@@ -3,7 +3,7 @@
 import itertools
 import os
 import time
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from datetime import datetime, timedelta
 from pathlib import Path
 from threading import Thread
@@ -17,6 +17,13 @@
 from ferry_planner.location import LocationId
 from ferry_planner.utils import datetime_to_timedelta
 
+MONTHS = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
+WEEKDAY_NAMES = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
+NO_SAILINGS_MESSAGES = [
+    "Seasonal schedules have not been posted for these dates",
+    "Schedules for your selected date and route are currently unavailable",
+]
+
 
 class FerrySailing(BaseModel):
     departure: datetime
@@ -25,8 +32,8 @@ class FerrySailing(BaseModel):
     """Duration in seconds."""
     # TODO: price: float  # noqa: FIX002
     """Price in Canadian dollars (CAD)."""
-    notes: Sequence[str] | None
-    """Any notes/comments posted about this sailing"""
+    notes: tuple[str, ...] = ()
+    """Notes or comments posted about this sailing."""
 
     def __hash__(self) -> int:
         return hash((self.departure, self.arrival, self.duration, self.notes))
@@ -38,7 +45,7 @@ class FerrySchedule(BaseModel):
     destination: LocationId
     sailings: tuple[FerrySailing, ...]
     url: str
-    notes: Sequence[str] | None
+    notes: tuple[str, ...]
     """Any notes/comments posted about this schedule"""
 
 
@@ -54,14 +61,29 @@ def __call__(
 
 
 class HtmlParseResult:
-    redirect_url: str | None = None
-    sailings: Sequence[FerrySailing] | None = None
-    notes: list[str] | None = None
+    redirect_url: str = ""
+    sailings: tuple[FerrySailing, ...] = ()
+    notes: tuple[str, ...] = ()
+    """Any notes/comments/errors posted about this schedule"""
+
+    @classmethod
+    def redirect(cls, redirect_url: str) -> "HtmlParseResult":
+        result = HtmlParseResult()
+        result.redirect_url = redirect_url
+        return result
+
+    @classmethod
+    def from_sailings(cls, sailings: Sequence[FerrySailing], notes: Sequence[str]) -> "HtmlParseResult":
+        result = HtmlParseResult()
+        result.sailings = tuple(sailings)
+        result.notes = tuple(notes)
+        return result
+
 
-    def add_note(self, note: str) -> None:
-        if self.notes is None:
-            self.notes = []
-        self.notes.append(note)
+class DownloadScheduleError(Exception):
+    def __init__(self, url: str, msg: str, *args: Iterable) -> None:
+        self.url = url
+        super().__init__(f"Error downloading {url}: {msg}", *args)
 
 
 class ScheduleDB:
@@ -82,6 +104,9 @@ def __init__(  # noqa: PLR0913
         self._refresh_thread = Thread(target=self._refresh_task, daemon=True)
         self._mem_cache = {}
         self.cache_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
+        timeout = httpx.Timeout(30.0, pool=None)
+        limits = httpx.Limits(max_connections=5)
+        self._client = httpx.AsyncClient(timeout=timeout, limits=limits, follow_redirects=True)
 
     def _get_download_url(
         self,
@@ -144,48 +169,12 @@ def download_schedule(
         *,
         date: datetime,
     ) -> FerrySchedule | None:
-        url = self._get_download_url(origin_id, destination_id, date=date)
-        route = f"{origin_id}-{destination_id}"
-        print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}")
-        max_requests_count = 3
-        requests_count = 0
-        while requests_count < max_requests_count:
-            requests_count += 1
-            try:
-                response = httpx.get(url, follow_redirects=True, timeout=30.0)
-            except httpx.HTTPError as exc:
-                print(
-                    f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n"
-                    f"{exc!r}\n"
-                    f"{url}",
-                )
-                return None
-            if not httpx.codes.is_success(response.status_code):
-                print(
-                    f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}"
-                    f" status {response.status_code}",
-                )
-                return None
-            print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}")
-            result = parse_schedule_html(response, date)
-            if result.redirect_url:
-                url = result.redirect_url
-                continue
-            if result.sailings is None:
-                break
-            return FerrySchedule(
-                date=date,
-                origin=origin_id,
-                destination=destination_id,
-                sailings=tuple(result.sailings),
-                url=url,
-                notes=result.notes,
-            )
-        print(
-            f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}"
-            f" - too many redirects",
-        )
-        return None
+        coro = self.download_schedule_async(origin_id, destination_id, date=date)
+        try:
+            loop = asyncio.get_running_loop()
+            return loop.run_until_complete(coro)
+        except RuntimeError:
+            return asyncio.run(coro)
 
     async def download_schedule_async(
         self,
@@ -194,37 +183,46 @@ async def download_schedule_async(
         /,
         *,
         date: datetime,
-        client: httpx.AsyncClient,
     ) -> FerrySchedule | None:
+        try:
+            return await self._download_schedule_async(origin_id, destination_id, date=date)
+        except (DownloadScheduleError, httpx.HTTPError) as exc:
+            url = exc.request.url if isinstance(exc, httpx.HTTPError) else exc.url
+            print(
+                f"[{self.__class__.__name__}:ERROR] failed to download schedule: "
+                f"{origin_id}-{destination_id}:{date.date()}\n"
+                f"\t{exc!r}\n"
+                f"\tUrl: {url}",
+            )
+            return None
+
+    async def _download_schedule_async(
+        self,
+        origin_id: LocationId,
+        destination_id: LocationId,
+        /,
+        *,
+        date: datetime,
+    ) -> FerrySchedule:
         url = self._get_download_url(origin_id, destination_id, date=date)
         route = f"{origin_id}-{destination_id}"
         print(f"[{self.__class__.__name__}:INFO] fetching schedule: {route}:{date.date()}")
-        max_requests_count = 3
-        requests_count = 0
-        while requests_count < max_requests_count:
-            requests_count += 1
-            try:
-                response = await client.get(url, follow_redirects=True, timeout=30.0)
-            except httpx.HTTPError as exc:
-                print(
-                    f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}\n"
-                    f"{exc!r}\n"
-                    f"{url}",
-                )
-                return None
+        max_redirects_count = 3
+        redirects = []
+        while True:
+            response = await self._client.get(url)
             if not httpx.codes.is_success(response.status_code):
-                print(
-                    f"[{self.__class__.__name__}:ERROR] schedule not found: {route}:{date.date()}"
-                    f" status {response.status_code}",
-                )
-                return None
+                raise DownloadScheduleError(url, f"Status {response.status_code}")
             print(f"[{self.__class__.__name__}:INFO] fetched schedule: {route}:{date.date()}")
             result = parse_schedule_html(response, date)
             if result.redirect_url:
+                if len(redirects) > max_redirects_count:
+                    raise DownloadScheduleError(url, "Too many redirects")
+                if url in redirects:
+                    raise DownloadScheduleError(url, "Redirects loop")
                 url = result.redirect_url
+                redirects.append(url)
                 continue
-            if result.sailings is None:
-                break
             return FerrySchedule(
                 date=date,
                 origin=origin_id,
@@ -233,11 +231,6 @@ async def download_schedule_async(
                 url=url,
                 notes=result.notes,
             )
-        print(
-            f"[{self.__class__.__name__}:ERROR] failed to download schedule: {route}:{date.date()}"
-            f" - too many redirects",
-        )
-        return None
 
     async def _download_and_save_schedule(
         self,
@@ -246,13 +239,11 @@ async def _download_and_save_schedule(
         /,
         *,
         date: datetime,
-        client: httpx.AsyncClient,
     ) -> bool:
         schedule = await self.download_schedule_async(
             origin_id,
             destination_id,
             date=date,
-            client=client,
         )
         if schedule is not None:
             self.put(schedule)
@@ -272,28 +263,24 @@ async def refresh_cache(self) -> None:
         self._mem_cache = {}
         # download new schedules
         tasks = []
-        timeout = httpx.Timeout(30.0, pool=None)
-        limits = httpx.Limits(max_connections=5)
-        async with httpx.AsyncClient(timeout=timeout, limits=limits) as client:
-            for connection in self.ferry_connections:
-                for date in dates:
-                    filepath = self._get_filepath(
-                        connection.origin.id,
-                        connection.destination.id,
-                        date=date,
-                    )
-                    if not filepath.exists():
-                        tasks.append(
-                            asyncio.create_task(
-                                self._download_and_save_schedule(
-                                    connection.origin.id,
-                                    connection.destination.id,
-                                    date=date,
-                                    client=client,
-                                ),
+        for connection in self.ferry_connections:
+            for date in dates:
+                filepath = self._get_filepath(
+                    connection.origin.id,
+                    connection.destination.id,
+                    date=date,
+                )
+                if not filepath.exists():
+                    tasks.append(
+                        asyncio.create_task(
+                            self._download_and_save_schedule(
+                                connection.origin.id,
+                                connection.destination.id,
+                                date=date,
                             ),
-                        )
-            downloaded_schedules = sum(await asyncio.gather(*tasks))
+                        ),
+                    )
+        downloaded_schedules = sum(await asyncio.gather(*tasks))
         print(
             f"[{self.__class__.__name__}:INFO] finished refreshing cache, "
             f"downloaded {downloaded_schedules} schedules",
@@ -309,47 +296,40 @@ def _refresh_task(self) -> None:
 
 
 def parse_schedule_html(response: httpx.Response, date: datetime) -> HtmlParseResult:
-    result = HtmlParseResult()
     html = response.text.replace("\u2060", "")
     soup = BeautifulSoup(markup=html, features="html.parser")
     table_tag = soup.find("table", id="dailyScheduleTableOnward")
     daterange_tag = soup.find("div", id="dateRangeModal")  # for seasonal
-    rows: Sequence[Tag] | None = None
+    rows: Sequence[Tag] = []
     if table_tag and isinstance(table_tag, Tag) and table_tag.tbody:
         rows = table_tag.tbody.find_all("tr")
     elif daterange_tag and isinstance(daterange_tag, Tag):
         hrefs = [a["href"] for a in daterange_tag.find_all("a")]
         index = get_seasonal_schedule_daterange_index(hrefs, date)
         if index < 0:
-            pass  # date is out of range
-        else:
-            url = response.url.scheme + "://" + response.url.host + hrefs[index]
-            if index == 0 or url == str(response.url):
-                rows = get_seasonal_schedule_rows(soup, date)
-            else:
-                result.redirect_url = url
-                return result
-    result.sailings = parse_sailings_from_html_rows(rows, date)
-    if result.sailings is None:
-        for note in [
-            "Seasonal schedules have not been posted for these dates",
-            "Schedules for your selected date and route are currently unavailable",
-        ]:
-            if note in html:
-                result.add_note(note)
-                result.sailings = []
-                return result
-        print(f"No sailings found at {response.url}")
-    return result
-
-
-def parse_sailings_from_html_rows(rows: Sequence[Tag] | None, date: datetime) -> Sequence[FerrySailing] | None:
-    if rows is None:
-        return None
+            raise DownloadScheduleError(str(response.url), f"Date {date} is out of seasonal schedules range")
+        url = response.url.scheme + "://" + response.url.host + hrefs[index]
+        if index > 0 and url != str(response.url):
+            return HtmlParseResult.redirect(url)
+        rows = get_seasonal_schedule_rows(str(response.url), soup, date)
+    sailings = parse_sailings_from_html_rows(rows, date)
+    notes = []
+    if not sailings:
+        err = "No sailings found"
+        for msg in NO_SAILINGS_MESSAGES:
+            if msg in html:
+                err = msg
+                break
+        notes.append(err)
+        print(f"{err} at {response.url}")
+    return HtmlParseResult.from_sailings(sailings, notes)
+
+
+def parse_sailings_from_html_rows(rows: Sequence[Tag], date: datetime) -> Sequence[FerrySailing]:
     sailing_row_min_td_count = 3
-    sailings: Sequence[FerrySailing] = []
-    notes = None
+    sailings = []
     for row in rows:
+        notes = []
         tds = row.find_all("td")
         if (
             len(tds) < sailing_row_min_td_count
@@ -386,7 +366,7 @@ def parse_sailings_from_html_rows(rows: Sequence[Tag] | None, date: datetime) ->
             departure=departure,
             arrival=arrival,
             duration=duration,
-            notes=notes,
+            notes=tuple(notes),
         )
         sailings.append(sailing)
     return sailings
@@ -407,13 +387,12 @@ def parse_sailig_comment(comment: str) -> list[str]:
     return notes
 
 
-def get_seasonal_schedule_rows(soup: BeautifulSoup, date: datetime) -> Sequence[Tag] | None:
+def get_seasonal_schedule_rows(url: str, soup: BeautifulSoup, date: datetime) -> Sequence[Tag]:
     rows: Sequence[Tag] = []
     form = soup.find("form", id="seasonalSchedulesForm")
     if not isinstance(form, Tag):
-        return None
-    weekday_names = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
-    weekday = weekday_names[date.weekday()]
+        raise DownloadScheduleError(url, "seasonalSchedulesForm not found")
+    weekday = WEEKDAY_NAMES[date.weekday()]
     for thead in form.find_all("thead"):
         if thead.text.lower().strip().startswith(weekday):
             rows = [x for x in itertools.takewhile(lambda t: t.name != "thead", thead.next_siblings) if x.name == "tr"]
@@ -452,7 +431,6 @@ def is_schedule_excluded_on_date(schedule_comment: str, date: datetime) -> bool:
 
 
 def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool:
-    months: Sequence[str] = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
     month: int | None = None
     schedule_dates = schedule_dates.upper()
     for c in [".", "&", " ON ", " ON:"]:
@@ -460,8 +438,8 @@ def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool:
     tokens = [x.strip() for x in schedule_dates.split(",")]
     tokens = [x for x in tokens if x and x not in ["ONLY", "EXCEPT", "NOT AVAILABLE"]]
     for token in tokens:
-        if token in months:
-            month = months.index(token) + 1
+        if token in MONTHS:
+            month = MONTHS.index(token) + 1
             continue
         _date: datetime
         if token.isnumeric():
@@ -472,12 +450,12 @@ def match_specific_schedule_date(schedule_dates: str, date: datetime) -> bool:
         else:
             dt = token.split(" ")
             expected_tokens_count = 2
-            if len(dt) == expected_tokens_count and dt[0].isnumeric() and dt[1] in months:
+            if len(dt) == expected_tokens_count and dt[0].isnumeric() and dt[1] in MONTHS:
                 # 01 JAN, 02 JAN, 05 FEB, 06 FEB
-                _date = datetime(year=date.year, month=months.index(dt[1]) + 1, day=int(dt[0]))
-            elif len(dt) == expected_tokens_count and dt[1].isnumeric() and dt[0] in months:
+                _date = datetime(year=date.year, month=MONTHS.index(dt[1]) + 1, day=int(dt[0]))
+            elif len(dt) == expected_tokens_count and dt[1].isnumeric() and dt[0] in MONTHS:
                 # Jan 1, 2, Feb 5 & 6
-                month = months.index(dt[0]) + 1
+                month = MONTHS.index(dt[0]) + 1
                 _date = datetime(year=date.year, month=month, day=int(dt[1]))
             else:
                 print(f"Failed to parse schedule dates: Unknown word '{token}' in '{schedule_dates}")

From 870582ac9ae841908278cfe39f03186431311409 Mon Sep 17 00:00:00 2001
From: mikepal2 <40579649+mikepal2@users.noreply.github.com>
Date: Sun, 2 Jun 2024 20:57:52 -0700
Subject: [PATCH 4/4] Apply review suggestions

---
 src/ferry_planner/schedule.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/ferry_planner/schedule.py b/src/ferry_planner/schedule.py
index e20329b..e288dba 100644
--- a/src/ferry_planner/schedule.py
+++ b/src/ferry_planner/schedule.py
@@ -45,8 +45,8 @@ class FerrySchedule(BaseModel):
     destination: LocationId
     sailings: tuple[FerrySailing, ...]
     url: str
-    notes: tuple[str, ...]
-    """Any notes/comments posted about this schedule"""
+    notes: tuple[str, ...] = ()
+    """Notes or comments posted about this schedule."""
 
 
 class ScheduleGetter(Protocol):
@@ -64,7 +64,7 @@ class HtmlParseResult:
     redirect_url: str = ""
     sailings: tuple[FerrySailing, ...] = ()
     notes: tuple[str, ...] = ()
-    """Any notes/comments/errors posted about this schedule"""
+    """Notes or comments posted about this schedule."""
 
     @classmethod
     def redirect(cls, redirect_url: str) -> "HtmlParseResult":
@@ -339,7 +339,7 @@ def parse_sailings_from_html_rows(rows: Sequence[Tag], date: datetime) -> Sequen
             continue
         td1 = tds[1].text.strip().split("\n", maxsplit=1)
         if len(td1) > 1:
-            notes = parse_sailig_comment(td1[1])
+            notes = parse_sailing_comment(td1[1])
-            # assumiing dates are always in the first note
+            # assuming dates are always in the first note
             if is_schedule_excluded_on_date(notes[0], date):
                 continue
@@ -372,7 +372,7 @@ def parse_sailings_from_html_rows(rows: Sequence[Tag], date: datetime) -> Sequen
     return sailings
 
 
-def parse_sailig_comment(comment: str) -> list[str]:
+def parse_sailing_comment(comment: str) -> list[str]:
     notes: list[str] = []
     comment = comment.strip()
     notes.append(comment)