From fcf8e5c01fed9342818f0201742581a2c14266c0 Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Thu, 17 Nov 2022 21:15:17 +0100
Subject: [PATCH 1/3] Add exception URLMissingSchema and constant URL_SCHEMES

---
 scraper/constants.py  |  3 ++-
 scraper/exceptions.py | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/scraper/constants.py b/scraper/constants.py
index caf0be3b..d1f32041 100644
--- a/scraper/constants.py
+++ b/scraper/constants.py
@@ -5,7 +5,6 @@
 
 REQUEST_COOKIES = {"cookies_are": "working"}
 
-
 WEBSITE_COLORS = {
     "komplett": "orange",
     "proshop": "red",
@@ -23,3 +22,5 @@
     "newegg": "#f7c20a",
     "hifiklubben": "#231f20",
 }
+
+URL_SCHEMES = ("http://", "https://")
diff --git a/scraper/exceptions.py b/scraper/exceptions.py
index 0586c7b2..c96b60aa 100644
--- a/scraper/exceptions.py
+++ b/scraper/exceptions.py
@@ -1,3 +1,6 @@
+from scraper.constants import URL_SCHEMES
+
+
 class WebsiteNotSupported(Exception):
     def __init__(self, website_name: str, *args: object) -> None:
         super().__init__(*args)
@@ -5,3 +8,12 @@ def __init__(self, website_name: str, *args: object) -> None:
 
     def __str__(self) -> str:
         return f"Website '{self.website_name}' is currently not supported"
+
+
+class URLMissingSchema(Exception):
+    def __init__(self, url: str, *args: object) -> None:
+        super().__init__(*args)
+        self.url = url
+
+    def __str__(self) -> str:
+        return f"Missing schema in url '{self.url}'. Consider prefixing the url with one of the following schemes: {', '.join(URL_SCHEMES)}"

From f629f8a053f29aff94bf5825b987618c62d74f5e Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Thu, 17 Nov 2022 21:16:14 +0100
Subject: [PATCH 2/3] Update add_product.py - raise exception URLMissingSchema
 when url is missing schema

Add function is_missing_url_schema
---
 scraper/add_product.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/scraper/add_product.py b/scraper/add_product.py
index 7612a6c7..7d5d0194 100644
--- a/scraper/add_product.py
+++ b/scraper/add_product.py
@@ -1,16 +1,17 @@
 from typing import List
 import logging
-from scraper.exceptions import WebsiteNotSupported
+from scraper.exceptions import WebsiteNotSupported, URLMissingSchema
 from scraper.scrape import Scraper
 from scraper.filemanager import Filemanager
 from scraper.domains import get_website_name, SUPPORTED_DOMAINS
+from scraper.constants import URL_SCHEMES
 
 
 def add_products(categories: List[str], urls: List[str]):
     for category, url in zip(categories, urls):
         try:
             add_product(category, url)
-        except WebsiteNotSupported as err:
+        except (WebsiteNotSupported, URLMissingSchema) as err:
             logging.getLogger(__name__).error(err)
             print(err)
 
@@ -23,6 +24,9 @@ def add_product(category: str, url: str) -> None:
     if website_name not in SUPPORTED_DOMAINS.keys():
         raise WebsiteNotSupported(website_name)
 
+    if is_missing_url_schema(url):
+        raise URLMissingSchema(url)
+
     print(f"Adding product with category '{category}' and url '{url[0:min(50, len(url))]}'...")
     logger.info(f"Adding product with category '{category}' and url '{url}'")
 
@@ -98,3 +102,7 @@ def check_if_product_exists_csv(product: Scraper):
             return True
 
     return False
+
+
+def is_missing_url_schema(url: str) -> bool:
+    return not url.startswith(URL_SCHEMES)

From 9938cf67ddea82dcaa0e6376945781abededc749 Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Fri, 18 Nov 2022 23:50:14 +0100
Subject: [PATCH 3/3] Update line in README about url needing a schema

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index bdec2393..1028976c 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ This is equivalent to the above:
 python3 main.py -a -c -u
 ```
 
-**OBS**: The url must have the ```https://``` part.
+**OBS**: The url must have a schema like ```https://``` or ```http://```.
 
 **OBS**: If an error occurs when adding a product, it might be because the url has a ```&``` in it; if so, put quotation marks around the url. This should solve the problem. If that doesn't fix it, submit an issue.
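Below is a minimal, self-contained sketch of the validation flow this patch series introduces, assuming only what the diffs above show. `URL_SCHEMES`, `URLMissingSchema`, and `is_missing_url_schema` mirror the patches; the stripped-down `add_product` and the example category and urls are invented stand-ins for the real code in scraper/add_product.py.

```python
# Hypothetical standalone demo -- mirrors the patch's logic; the
# add_product stub and example urls below are invented for illustration.

URL_SCHEMES = ("http://", "https://")


class URLMissingSchema(Exception):
    def __init__(self, url: str, *args: object) -> None:
        super().__init__(*args)
        self.url = url

    def __str__(self) -> str:
        return (
            f"Missing schema in url '{self.url}'. Consider prefixing the url "
            f"with one of the following schemes: {', '.join(URL_SCHEMES)}"
        )


def is_missing_url_schema(url: str) -> bool:
    # str.startswith accepts a tuple, so each scheme is tried in turn
    return not url.startswith(URL_SCHEMES)


def add_product(category: str, url: str) -> None:
    # Stand-in for the real function: validate the url before scraping
    if is_missing_url_schema(url):
        raise URLMissingSchema(url)
    print(f"Adding product with category '{category}' and url '{url}'...")


if __name__ == "__main__":
    try:
        add_product("gpu", "www.example.com/product/123")  # rejected: no scheme
    except URLMissingSchema as err:
        print(err)

    add_product("gpu", "https://www.example.com/product/123")  # accepted
```

Checking with `str.startswith` against the `URL_SCHEMES` tuple ties the test to the start of the url, so a scheme appearing later in the string (for example inside a query parameter) is not mistaken for a valid prefix.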