diff --git a/README.md b/README.md index bdec2393..1028976c 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ This is equivalent to the above: python3 main.py -a -c <category> <category2> -u <url> <url2> ``` -**OBS**: The url must have the ```https://``` part.<br/> +**OBS**: The url must have a schema like: ```https://``` or ```http://```.<br/> **OBS**: If an error occures when adding a product, then the error might happen because the url has a ```&``` in it, when this happens then just put quotation marks around the url. This should solve the problem. If this doesn't solve the problem then summit a issue.<br/> <br/> diff --git a/scraper/add_product.py b/scraper/add_product.py index b6c99c34..37cc5028 100644 --- a/scraper/add_product.py +++ b/scraper/add_product.py @@ -1,16 +1,17 @@ from typing import List import logging -from scraper.exceptions import WebsiteNotSupported +from scraper.exceptions import WebsiteNotSupported, URLMissingSchema from scraper.scrape import Scraper from scraper.filemanager import Filemanager from scraper.domains import get_website_name, SUPPORTED_DOMAINS +from scraper.constants import URL_SCHEMES def add_products(categories: List[str], urls: List[str]) -> None: for category, url in zip(categories, urls): try: add_product(category, url) - except (WebsiteNotSupported, URLMissingSchema) as err: + except (WebsiteNotSupported, URLMissingSchema) as err: logging.getLogger(__name__).error(err) print(err) @@ -23,6 +24,9 @@ def add_product(category: str, url: str) -> None: if website_name not in SUPPORTED_DOMAINS.keys(): raise WebsiteNotSupported(website_name) + if is_missing_url_schema(url): + raise URLMissingSchema(url) + print(f"Adding product with category '{category}' and url '{url}'") logger.info(f"Adding product with category '{category}' and url '{url}'") @@ -98,3 +102,7 @@ def check_if_product_exists_csv(product: Scraper) -> bool: return True return False + + +def is_missing_url_schema(url: str) -> bool: + return not url.startswith(
URL_SCHEMES) diff --git a/scraper/constants.py b/scraper/constants.py index caf0be3b..d1f32041 100644 --- a/scraper/constants.py +++ b/scraper/constants.py @@ -5,7 +5,6 @@ REQUEST_COOKIES = {"cookies_are": "working"} - WEBSITE_COLORS = { "komplett": "orange", "proshop": "red", @@ -23,3 +22,5 @@ "newegg": "#f7c20a", "hifiklubben": "#231f20", } + +URL_SCHEMES = ("http://", "https://") diff --git a/scraper/exceptions.py b/scraper/exceptions.py index 0586c7b2..c96b60aa 100644 --- a/scraper/exceptions.py +++ b/scraper/exceptions.py @@ -1,3 +1,6 @@ +from scraper.constants import URL_SCHEMES + + class WebsiteNotSupported(Exception): def __init__(self, website_name: str, *args: object) -> None: super().__init__(*args) @@ -5,3 +8,12 @@ def __init__(self, website_name: str, *args: object) -> None: def __str__(self) -> str: return f"Website '{self.website_name}' is currently not supported" + + +class URLMissingSchema(Exception): + def __init__(self, url: str, *args: object) -> None: + super().__init__(*args) + self.url = url + + def __str__(self) -> str: + return f"Missing schema in url '{self.url}'. Consider prefixing the url with one of the following schemes: {', '.join(URL_SCHEMES)}"