From fc93c11d3c45c91a5ebe2a969c3b1ded682e837e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakob=20K=C3=B6rber?= <56073945+jakobkoerber@users.noreply.github.com> Date: Wed, 11 Sep 2024 20:04:40 +0200 Subject: [PATCH 1/4] Refactor Studentenwerkmenuparser to Use Overview Page Instead of Detail Page --- src/menu_parser.py | 92 +++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 54 deletions(-) diff --git a/src/menu_parser.py b/src/menu_parser.py index 67c27e98a..85a907aa5 100644 --- a/src/menu_parser.py +++ b/src/menu_parser.py @@ -192,8 +192,8 @@ def __init__(self, students: float, staff: float, guests: float): @staticmethod def __get_self_service_prices( - base_price_type: SelfServiceBasePriceType, - price_per_unit_type: SelfServicePricePerUnitType, + base_price_type: SelfServiceBasePriceType, + price_per_unit_type: SelfServicePricePerUnitType, ) -> Prices: students: Price = Price( base_price_type.price[0], @@ -242,64 +242,48 @@ def __get_price(canteen: Canteen, dish: Tuple[str, str, str, str, str], dish_nam return StudentenwerkMenuParser.__get_self_service_prices(base_price_type, price_per_unit_type) base_url: str = "http://www.studierendenwerk-muenchen-oberbayern.de/mensa/speiseplan/speiseplan_{url_id}_-de.html" - base_url_with_date: str = ( - "http://www.studierendenwerk-muenchen-oberbayern.de/mensa/speiseplan/speiseplan_{date}_{url_id}_-de.html" - ) def parse(self, canteen: Canteen) -> Optional[Dict[datetime.date, Menu]]: menus = {} - for date in self.__get_available_dates(canteen): - page_link: str = self.base_url_with_date.format(url_id=canteen.url_id, date=date.strftime("%Y-%m-%d")) - page: requests.Response = requests.get(page_link, timeout=10.0) - if page.ok: - try: - tree: html.Element = html.fromstring(page.content) - menu = self.get_menu(tree, canteen, date) + page_link: str = self.base_url.format(url_id=canteen.url_id) + page: requests.Response = requests.get(page_link, timeout=10.0) + if page.ok: + try: + tree: html.Element = html.fromstring(page.content) + html_menus: List[html.Element] = self.__get_daily_menus_as_html(tree) + for html_menu in html_menus: + html_menu = html.fromstring(html.tostring(html_menu)) + menu = self.get_menu(html_menu, canteen) if menu: - menus[date] = menu + menus[menu.menu_date] = menu # pylint: disable=broad-except - except Exception as e: - print(f"Exception while parsing menu from {date}. Skipping current date. Exception args: {e.args}") - # pylint: enable=broad-except + except Exception as e: + print(f"Exception while parsing menu. Skipping current date. Exception args: {e.args}") + # pylint: enable=broad-except return menus - def get_menu(self, page: html.Element, canteen: Canteen, date: datetime.date) -> Optional[Menu]: - # get current menu - current_menu: html.Element = self.__get_daily_menus_as_html(page)[0] - # get html representation of menu - menu_html = html.fromstring(html.tostring(current_menu)) - + def get_menu(self, page: html.Element, canteen: Canteen) -> Optional[Menu]: + # extract date + date = self.extract_date_from_html(page) # parse dishes of current menu - dishes: List[Dish] = self.__parse_dishes(menu_html, canteen) + dishes: List[Dish] = self.__parse_dishes(page, canteen) # create menu object menu: Menu = Menu(date, dishes) return menu - def __get_available_dates(self, canteen: Canteen) -> List[datetime.date]: - page_link: str = self.base_url.format(url_id=canteen.url_id) - page: requests.Response = requests.get(page_link, timeout=10.0) - tree: html.Element = html.fromstring(page.content) - return self.get_available_dates_for_html(tree) - # public for testing - def get_available_dates_for_html(self, tree: html.Element) -> List[datetime.date]: - dates: List[datetime.date] = [] - date_strings: List[str] = tree.xpath("//div[@class='c-schedule__item']//strong/text()") - for date_str in date_strings: - # parse date - try: - date: datetime.date = util.parse_date(date_str) - except ValueError: - print(f"Warning: Error during parsing date from html page. Problematic date: {date_str}") - # continue and parse subsequent menus - continue - dates += [date] - return dates + def extract_date_from_html(self, tree: html.Element) -> Optional[datetime.date]: + date_str: str = tree.xpath("//div[@class='c-schedule__item']//strong/text()")[0] + try: + date: datetime.date = util.parse_date(date_str) + return date + except ValueError: + print(f"Warning: Error during parsing date from html page. Problematic date: {date_str}") @staticmethod - def __get_daily_menus_as_html(page): + def __get_daily_menus_as_html(tree: html.Element) -> List[html.Element]: # obtain all daily menus found in the passed html page by xpath query - daily_menus: page.xpath = page.xpath("//div[@class='c-schedule__item']") # type: ignore + daily_menus: List[html.Element] = tree.xpath("//div[@class='c-schedule__item']") # type: ignore return daily_menus @staticmethod @@ -344,12 +328,12 @@ def __parse_dishes(menu_html: html.Element, canteen: Canteen) -> List[Dish]: dish_markers_meetless, ) for ( - dish_name, - dish_type, - dish_marker_additional, - dish_marker_allergen, - dish_marker_type, - dish_marker_meetless, + dish_name, + dish_type, + dish_marker_additional, + dish_marker_allergen, + dish_marker_type, + dish_marker_meetless, ) in dishes_tup: dishes_dict[dish_name] = ( dish_type, @@ -551,7 +535,7 @@ def __get_label_str_and_price(self, column_index: int, line: str) -> Optional[Tu # However, according to # https://black.readthedocs.io/en/stable/faq.html#why-are-flake8-s-e203-and-w503-violated, # this is against PEP8 - line[estimated_column_end - delta : min(estimated_column_end + delta, len(line))], # noqa: E203 + line[estimated_column_end - delta: min(estimated_column_end + delta, len(line))], # noqa: E203 )[0] except IndexError: return None @@ -563,7 +547,7 @@ def __get_label_str_and_price(self, column_index: int, line: str) -> Optional[Tu # However, according to # https://black.readthedocs.io/en/stable/faq.html#why-are-flake8-s-e203-and-w503-violated, # this is against PEP8 - line[max(estimated_column_begin - delta, 0) : estimated_column_begin + delta], # noqa: E203 + line[max(estimated_column_begin - delta, 0): estimated_column_begin + delta], # noqa: E203 )[0] except IndexError: labels_str = "" @@ -692,7 +676,7 @@ def get_menus(self, text, year, week_number): positions4 = [ (max(a.start() - 3, 0), a.end()) for a in list(re.finditer(self.split_days_regex_closed, soup_line1)) - + list(re.finditer(self.split_days_regex_closed, soup_line2)) + + list(re.finditer(self.split_days_regex_closed, soup_line2)) ] if positions3: # Two lines "Tagessuppe siehe Aushang" @@ -718,7 +702,7 @@ def get_menus(self, text, year, week_number): lines_weekdays = {"mon": "", "tue": "", "wed": "", "thu": "", "fri": ""} # it must be lines[3:] instead of lines[2:] or else the menus would start with "Preis ab 0,90€" (from the # soups) instead of the first menu, if there is a day where the bistro is closed. - for line in lines[soup_line_index + 3 :]: # noqa: E203 + for line in lines[soup_line_index + 3:]: # noqa: E203 lines_weekdays["mon"] += " " + line[pos_mon:pos_tue].replace("\n", " ") lines_weekdays["tue"] += " " + line[pos_tue:pos_wed].replace("\n", " ") lines_weekdays["wed"] += " " + line[pos_wed:pos_thu].replace("\n", " ") From 4114cc37e3337a19613a2399b46255bba70017e3 Mon Sep 17 00:00:00 2001 From: Friendly-Banana <69007475+Friendly-Banana@users.noreply.github.com> Date: Thu, 19 Sep 2024 23:54:30 +0200 Subject: [PATCH 2/4] solve linting issues and fix test --- src/menu_parser.py | 59 ++++++++++++++++++------------------ src/test/test_menu_parser.py | 10 ++++-- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/src/menu_parser.py b/src/menu_parser.py index 85a907aa5..1acfb30de 100644 --- a/src/menu_parser.py +++ b/src/menu_parser.py @@ -29,7 +29,7 @@ class MenuParser(ABC): """ canteens: Set[Canteen] - _label_lookup: Dict[str, Set[Label]] + _label_subclasses: Dict[str, Set[Label]] # we use datetime %u, so we go from 1-7 weekday_positions: Dict[str, int] = {"mon": 1, "tue": 2, "wed": 3, "thu": 4, "fri": 5, "sat": 6, "sun": 7} @@ -54,7 +54,7 @@ def _parse_label(cls, labels_str: str) -> Set[Label]: for value in split_values: stripped = value.strip() if not stripped.isspace(): - labels |= cls._label_lookup.get(stripped, set()) + labels |= cls._label_subclasses.get(stripped, set()) Label.add_supertype_labels(labels) return labels @@ -112,7 +112,7 @@ def __init__(self, students: float, staff: float, guests: float): self.guests = guests self.unit = "100g" - _label_lookup: Dict[str, Set[Label]] = { + _label_subclasses: Dict[str, Set[Label]] = { "GQB": {Label.BAVARIA}, "MSC": {Label.MSC}, "1": {Label.DYESTUFF}, @@ -192,8 +192,8 @@ def __init__(self, students: float, staff: float, guests: float): @staticmethod def __get_self_service_prices( - base_price_type: SelfServiceBasePriceType, - price_per_unit_type: SelfServicePricePerUnitType, + base_price_type: SelfServiceBasePriceType, + price_per_unit_type: SelfServicePricePerUnitType, ) -> Prices: students: Price = Price( base_price_type.price[0], @@ -241,7 +241,7 @@ def __get_price(canteen: Canteen, dish: Tuple[str, str, str, str, str], dish_nam base_price_type = StudentenwerkMenuParser.SelfServiceBasePriceType.PIZZA_VEGIE return StudentenwerkMenuParser.__get_self_service_prices(base_price_type, price_per_unit_type) - base_url: str = "http://www.studierendenwerk-muenchen-oberbayern.de/mensa/speiseplan/speiseplan_{url_id}_-de.html" + base_url: str = "https://www.studierendenwerk-muenchen-oberbayern.de/mensa/speiseplan/speiseplan_{url_id}_-de.html" def parse(self, canteen: Canteen) -> Optional[Dict[datetime.date, Menu]]: menus = {} @@ -250,8 +250,10 @@ def parse(self, canteen: Canteen) -> Optional[Dict[datetime.date, Menu]]: if page.ok: try: tree: html.Element = html.fromstring(page.content) - html_menus: List[html.Element] = self.__get_daily_menus_as_html(tree) + html_menus: List[html.Element] = self.get_daily_menus_as_html(tree) for html_menu in html_menus: + # this solves some weird reference? issue where tree.xpath will subsequently always use + # the first element despite looping through seemingly separate elements html_menu = html.fromstring(html.tostring(html_menu)) menu = self.get_menu(html_menu, canteen) if menu: @@ -263,27 +265,27 @@ def parse(self, canteen: Canteen) -> Optional[Dict[datetime.date, Menu]]: return menus def get_menu(self, page: html.Element, canteen: Canteen) -> Optional[Menu]: - # extract date date = self.extract_date_from_html(page) - # parse dishes of current menu dishes: List[Dish] = self.__parse_dishes(page, canteen) - # create menu object menu: Menu = Menu(date, dishes) return menu # public for testing - def extract_date_from_html(self, tree: html.Element) -> Optional[datetime.date]: + @staticmethod + def extract_date_from_html(tree: html.Element) -> Optional[datetime.date]: date_str: str = tree.xpath("//div[@class='c-schedule__item']//strong/text()")[0] try: date: datetime.date = util.parse_date(date_str) return date except ValueError: - print(f"Warning: Error during parsing date from html page. Problematic date: {date_str}") + warn(f"Error during parsing date from html page. Problematic date: {date_str}") + return None + # public for testing @staticmethod - def __get_daily_menus_as_html(tree: html.Element) -> List[html.Element]: + def get_daily_menus_as_html(tree: html.Element) -> List[html.Element]: # obtain all daily menus found in the passed html page by xpath query - daily_menus: List[html.Element] = tree.xpath("//div[@class='c-schedule__item']") # type: ignore + daily_menus: List[html.Element] = tree.xpath("//div[@class='c-schedule__item']") return daily_menus @staticmethod @@ -328,12 +330,12 @@ def __parse_dishes(menu_html: html.Element, canteen: Canteen) -> List[Dish]: dish_markers_meetless, ) for ( - dish_name, - dish_type, - dish_marker_additional, - dish_marker_allergen, - dish_marker_type, - dish_marker_meetless, + dish_name, + dish_type, + dish_marker_additional, + dish_marker_allergen, + dish_marker_type, + dish_marker_meetless, ) in dishes_tup: dishes_dict[dish_name] = ( dish_type, @@ -389,8 +391,7 @@ class DishType(Enum): VEGETARIAN = auto() VEGAN = auto() - # if an label is a subclass of another label, - _label_lookup: Dict[str, Set[Label]] = { + _label_subclasses: Dict[str, Set[Label]] = { "a": {Label.GLUTEN}, "aW": {Label.WHEAT}, "aR": {Label.RYE}, @@ -535,7 +536,7 @@ def __get_label_str_and_price(self, column_index: int, line: str) -> Optional[Tu # However, according to # https://black.readthedocs.io/en/stable/faq.html#why-are-flake8-s-e203-and-w503-violated, # this is against PEP8 - line[estimated_column_end - delta: min(estimated_column_end + delta, len(line))], # noqa: E203 + line[estimated_column_end - delta : min(estimated_column_end + delta, len(line))], # noqa: E203 )[0] except IndexError: return None @@ -547,7 +548,7 @@ def __get_label_str_and_price(self, column_index: int, line: str) -> Optional[Tu # However, according to # https://black.readthedocs.io/en/stable/faq.html#why-are-flake8-s-e203-and-w503-violated, # this is against PEP8 - line[max(estimated_column_begin - delta, 0): estimated_column_begin + delta], # noqa: E203 + line[max(estimated_column_begin - delta, 0) : estimated_column_begin + delta], # noqa: E203 )[0] except IndexError: labels_str = "" @@ -676,7 +677,7 @@ def get_menus(self, text, year, week_number): positions4 = [ (max(a.start() - 3, 0), a.end()) for a in list(re.finditer(self.split_days_regex_closed, soup_line1)) - + list(re.finditer(self.split_days_regex_closed, soup_line2)) + + list(re.finditer(self.split_days_regex_closed, soup_line2)) ] if positions3: # Two lines "Tagessuppe siehe Aushang" @@ -702,7 +703,7 @@ def get_menus(self, text, year, week_number): lines_weekdays = {"mon": "", "tue": "", "wed": "", "thu": "", "fri": ""} # it must be lines[3:] instead of lines[2:] or else the menus would start with "Preis ab 0,90€" (from the # soups) instead of the first menu, if there is a day where the bistro is closed. - for line in lines[soup_line_index + 3:]: # noqa: E203 + for line in lines[soup_line_index + 3 :]: # noqa: E203 lines_weekdays["mon"] += " " + line[pos_mon:pos_tue].replace("\n", " ") lines_weekdays["tue"] += " " + line[pos_tue:pos_wed].replace("\n", " ") lines_weekdays["wed"] += " " + line[pos_wed:pos_thu].replace("\n", " ") @@ -743,7 +744,7 @@ def get_menus(self, text, year, week_number): try: price_obj = Price(float(price_str)) except ValueError: - print(f"Warning: Error during parsing price: {price_str}") + warn(f"Error during parsing price: {price_str}") dishes.append( Dish( dish_name.strip(), @@ -770,7 +771,7 @@ class MedizinerMensaMenuParser(MenuParser): labels_regex = r"(\s([A-C]|[E-H]|[K-P]|[R-Z]|[1-9])(,([A-C]|[E-H]|[K-P]|[R-Z]|[1-9]))*(\s|\Z))" price_regex = r"(\d+(,(\d){2})\s?€)" - _label_lookup: Dict[str, Set[Label]] = { + _label_subclasses: Dict[str, Set[Label]] = { "1": {Label.DYESTUFF}, "2": {Label.PRESERVATIVES}, "3": {Label.ANTIOXIDANTS}, @@ -970,7 +971,7 @@ class StraubingMensaMenuParser(MenuParser): url = "https://www.stwno.de/infomax/daten-extern/csv/HS-SR/{calendar_week}.csv" canteens = {Canteen.MENSA_STRAUBING} - _label_lookup: Dict[str, Set[Label]] = { + _label_subclasses: Dict[str, Set[Label]] = { "1": {Label.DYESTUFF}, "2": {Label.PRESERVATIVES}, "3": {Label.ANTIOXIDANTS}, diff --git a/src/test/test_menu_parser.py b/src/test/test_menu_parser.py index e98021e07..d6a9123d9 100644 --- a/src/test/test_menu_parser.py +++ b/src/test/test_menu_parser.py @@ -4,7 +4,7 @@ import tempfile import unittest from datetime import date -from typing import Dict, List +from typing import Dict from lxml import html # nosec: https://github.com/TUM-Dev/eat-api/issues/19 @@ -58,11 +58,15 @@ def test_get_all_dates(self) -> None: working_days.append(start_date) start_date += datetime.timedelta(days=1) + dates = [] tree = file_util.load_html( f"{self.base_path_canteen.format(canteen=Canteen.MENSA_GARCHING.canteen_id)}" f"/for-generation/overview.html", ) - dates: List[date] = self.studentenwerk_menu_parser.get_available_dates_for_html(tree) + menus = StudentenwerkMenuParser.get_daily_menus_as_html(tree) + for menu in menus: + html_menu = html.fromstring(html.tostring(menu)) + dates.append(self.studentenwerk_menu_parser.extract_date_from_html(html_menu)) self.assertEqual(dates, working_days) def test_studentenwerk(self) -> None: @@ -92,7 +96,7 @@ def __get_menus(self, canteen: Canteen) -> Dict[date, Menu]: f"{self.base_path_canteen.format(canteen=canteen.canteen_id)}/for-generation/{date_}.html", ) studentenwerk_menu_parser = StudentenwerkMenuParser() - menu = studentenwerk_menu_parser.get_menu(tree, canteen, date_) + menu = studentenwerk_menu_parser.get_menu(tree, canteen) if menu is not None: menus[date_] = menu return menus From 4e62829e63df37b7faa5071c4b7fe727a0e4931a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakob=20K=C3=B6rber?= <56073945+jakobkoerber@users.noreply.github.com> Date: Fri, 20 Sep 2024 22:03:12 +0200 Subject: [PATCH 3/4] Fix Linting --- scripts/reformat.py | 1 + src/entities.py | 1 + 2 files changed, 2 insertions(+) diff --git a/scripts/reformat.py b/scripts/reformat.py index 6c76cbe93..b7ee7e3b0 100755 --- a/scripts/reformat.py +++ b/scripts/reformat.py @@ -1,4 +1,5 @@ #!/bin/python3 +# pylint: disable=too-many-arguments import json import os import re diff --git a/src/entities.py b/src/entities.py index b541e1ae5..2cd22a4fb 100644 --- a/src/entities.py +++ b/src/entities.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +# pylint: disable=too-many-arguments # Postponed Evaluation of Annotations to allow using a class inside a class for annotations from __future__ import annotations From 1c83d406f106326f8f5a1fae26d55baf8c78f3c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakob=20K=C3=B6rber?= <56073945+jakobkoerber@users.noreply.github.com> Date: Fri, 20 Sep 2024 22:05:01 +0200 Subject: [PATCH 4/4] Fix Linting :) --- scripts/reformat.py | 2 +- src/entities.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/reformat.py b/scripts/reformat.py index b7ee7e3b0..f78ef9a72 100755 --- a/scripts/reformat.py +++ b/scripts/reformat.py @@ -1,5 +1,5 @@ #!/bin/python3 -# pylint: disable=too-many-arguments +# pylint: disable=too-many-positional-arguments import json import os import re diff --git a/src/entities.py b/src/entities.py index 2cd22a4fb..bdb35c3cf 100644 --- a/src/entities.py +++ b/src/entities.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# pylint: disable=too-many-arguments +# pylint: disable=too-many-positional-arguments # Postponed Evaluation of Annotations to allow using a class inside a class for annotations from __future__ import annotations