diff --git a/main.py b/main.py
index 951e8714..da3d1596 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,4 @@
+from typing import List
 import threading
 import logging.config
 import logging
@@ -59,25 +60,36 @@ def scrape_with_threads() -> None:
     print("Scraping with threads...")
 
     request_delay = scraper.Config.get_request_delay()
+    products_df = scraper.Filemanager.get_products_data()
+    domain_grouped_products_df = scraper.get_products_df_grouped_by_domains(products_df)
+    grouped_products = scraper.get_products_grouped_by_domain(domain_grouped_products_df)
 
-    # Create instances of class "Scraper"
-    products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]
+    grouped_scraper_threads: List[List[threading.Thread]] = []
 
-    # Create threads
-    threads = [threading.Thread(target=product.scrape_info) for product in products]
+    # Create scraper threads and group by domain
+    for products in grouped_products.values():
+        scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
+        grouped_scraper_threads.append(scraper_threads)
 
-    # Start scraping on all threads
-    for thread in threads:
-        time.sleep(request_delay)
-        thread.start()
+    # Create master threads to manage scraper threads sequentially for each domain
+    master_threads = [
+        threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay])
+        for scraper_threads in grouped_scraper_threads
+    ]
 
-    # Wait for all threads to finish
-    for thread in threads:
-        thread.join()
+    # Start all master threads
+    for master_thread in master_threads:
+        master_thread.start()
+
+    # Wait for all master threads to finish
+    for master_thread in master_threads:
+        master_thread.join()
+
+    products_flatten = [product for products in grouped_products.values() for product in products]
 
     # Save scraped data for each product (sequentially)
-    for product in products:
+    for product in products_flatten:
         product.save_info()
diff --git a/scraper/__init__.py b/scraper/__init__.py
index 034befcf..6f835230 100644
--- a/scraper/__init__.py
+++ b/scraper/__init__.py
@@ -1,4 +1,4 @@
-from .scrape import Scraper
+from .scrape import Scraper, start_threads_sequentially
 from .arguments import argparse_setup
 from .add_product import add_products
 from .filemanager import Filemanager, Config
@@ -7,7 +7,8 @@
 from .delete_data import delete
 from .reset_data import reset
 from .search_data import search
-from .misc import print_latest_datapoints, print_all_products
+from .print_products import print_latest_datapoints, print_all_products
+from .misc import get_products_df_grouped_by_domains, get_products_grouped_by_domain
 
 
 __author__ = "Crinibus"
diff --git a/scraper/misc.py b/scraper/misc.py
index fff37502..5004b59f 100644
--- a/scraper/misc.py
+++ b/scraper/misc.py
@@ -1,64 +1,33 @@
-from typing import Iterator, List, Tuple
+import pandas as pd
+from pandas.core.groupby.generic import DataFrameGroupBy
 
-from scraper.filemanager import Filemanager
+from scraper.scrape import Scraper
+from scraper.domains import get_website_name
 
 
-def print_latest_datapoints(names: List[str], ids: List[str]) -> None:
-    records_data = Filemanager.get_record_data()
+def add_dataframe_column(df: pd.DataFrame, column_name: str, column_data: list[any]) -> pd.DataFrame:
+    df[column_name] = column_data
+    return df
 
-    if names:
-        print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----")
-        for name in names:
-            print(name.upper())
-            # iterate the different websites the product with the specified name is scraped from
-            for website_name, website_dict in get_product_info_with_name(name, records_data):
-                print_latest_datapoint(website_name, website_dict)
-            print()
 
-    if ids:
-        print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----")
-        for id in ids:
-            product_name, website_name, website_dict = get_product_info_with_id(id, records_data)
-            print(product_name.upper())
-            print_latest_datapoint(website_name, website_dict)
-            print()
+def group_df(df: pd.DataFrame, column_name: str, group_keys: bool) -> DataFrameGroupBy:
+    grouped_df = df.groupby(column_name, group_keys=group_keys)
+    return grouped_df
 
 
-def get_product_info_with_name(name: str, records_data: dict) -> Iterator[Tuple[str, str, dict]]:
-    for category_dict in records_data.values():
-        for product_name, product_dict in category_dict.items():
-            if not product_name.lower() == name.lower():
-                continue
-            for website_name, website_dict in product_dict.items():
-                yield website_name, website_dict
+def get_products_df_grouped_by_domains(products_df: pd.DataFrame) -> DataFrameGroupBy:
+    domain_names = [get_website_name(url) for url in products_df["url"]]
+    df = add_dataframe_column(products_df, "domain", domain_names)
+    grouped_df = group_df(df, "domain", True)
+    return grouped_df
 
 
-def get_product_info_with_id(id: str, records_data: dict) -> Tuple[str, str, dict]:
-    for category_dict in records_data.values():
-        for product_name, product_dict in category_dict.items():
-            for website_name, website_dict in product_dict.items():
-                if website_dict["info"]["id"] == id:
-                    return product_name, website_name, website_dict
+def get_products_grouped_by_domain(grouped_products_df: DataFrameGroupBy) -> dict[str, list[Scraper]]:
+    domains_dict: dict[str, list[Scraper]] = {}
-
-
-def print_latest_datapoint(website_name: str, website_dict: dict) -> None:
-    id = website_dict["info"]["id"]
-    currency = website_dict["info"]["currency"]
-    latest_datapoint = website_dict["datapoints"][-1]
-    date = latest_datapoint["date"]
-    price = latest_datapoint["price"]
-    print(f"> {website_name.capitalize()} - {id}\n  - {currency} {price}\n  - {date}")
-
-
-def print_all_products() -> None:
-    records_data = Filemanager.get_record_data()
-
-    print("\n----- SHOWING ALL PRODUCTS -----")
-    for category_name, category_dict in records_data.items():
-        print(category_name.upper())
-        for product_name, product_dict in category_dict.items():
-            print(f"  > {product_name.upper()}")
-            for website_name, website_dict in product_dict.items():
-                product_id = website_dict["info"]["id"]
-                print(f"    - {website_name.upper()} - {product_id}")
-        print()
+    for domain_name in grouped_products_df.groups:
+        group_products = grouped_products_df.get_group(domain_name)
+        domains_dict[domain_name] = [
+            Scraper(category, url) for category, url in zip(group_products["category"], group_products["url"])
+        ]
+
+    return domains_dict
diff --git a/scraper/print_products.py b/scraper/print_products.py
new file mode 100644
index 00000000..7f692c43
--- /dev/null
+++ b/scraper/print_products.py
@@ -0,0 +1,64 @@
+from typing import Iterator
+
+from scraper.filemanager import Filemanager
+
+
+def print_latest_datapoints(names: list[str], ids: list[str]) -> None:
+    records_data = Filemanager.get_record_data()
+
+    if names:
+        print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----")
+        for name in names:
+            print(name.upper())
+            # iterate the different websites the product with the specified name is scraped from
+            for website_name, website_dict in get_product_info_with_name(name, records_data):
+                print_latest_datapoint(website_name, website_dict)
+            print()
+
+    if ids:
+        print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----")
+        for id in ids:
+            product_name, website_name, website_dict = get_product_info_with_id(id, records_data)
+            print(product_name.upper())
+            print_latest_datapoint(website_name, website_dict)
+            print()
+
+
+def get_product_info_with_name(name: str, records_data: dict) -> Iterator[tuple[str, str, dict]]:
+    for category_dict in records_data.values():
+        for product_name, product_dict in category_dict.items():
+            if not product_name.lower() == name.lower():
+                continue
+            for website_name, website_dict in product_dict.items():
+                yield website_name, website_dict
+
+
+def get_product_info_with_id(id: str, records_data: dict) -> tuple[str, str, dict]:
+    for category_dict in records_data.values():
+        for product_name, product_dict in category_dict.items():
+            for website_name, website_dict in product_dict.items():
+                if website_dict["info"]["id"] == id:
+                    return product_name, website_name, website_dict
+
+
+def print_latest_datapoint(website_name: str, website_dict: dict) -> None:
+    id = website_dict["info"]["id"]
+    currency = website_dict["info"]["currency"]
+    latest_datapoint = website_dict["datapoints"][-1]
+    date = latest_datapoint["date"]
+    price = latest_datapoint["price"]
+    print(f"> {website_name.capitalize()} - {id}\n  - {currency} {price}\n  - {date}")
+
+
+def print_all_products() -> None:
+    records_data = Filemanager.get_record_data()
+
+    print("\n----- SHOWING ALL PRODUCTS -----")
+    for category_name, category_dict in records_data.items():
+        print(category_name.upper())
+        for product_name, product_dict in category_dict.items():
+            print(f"  > {product_name.upper()}")
+            for website_name, website_dict in product_dict.items():
+                product_id = website_dict["info"]["id"]
+                print(f"    - {website_name.upper()} - {product_id}")
+        print()
diff --git a/scraper/scrape.py b/scraper/scrape.py
index 88093532..881a420f 100644
--- a/scraper/scrape.py
+++ b/scraper/scrape.py
@@ -1,3 +1,5 @@
+import time
+import threading
 import logging
 from datetime import datetime
 from scraper.domains import BaseWebsiteHandler, get_website_handler
@@ -70,3 +72,10 @@ def add_product_datapoint(product_data: dict, price: float) -> None:
         latest_datapoint["price"] = price
     else:
         product_datapoints.append(new_datapoint)
+
+
+def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
+    for thread in threads:
+        thread.start()
+        thread.join()
+        time.sleep(request_delay)
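A minimal, self-contained sketch (not part of the patch) of the scheduling model these changes introduce: one master thread per domain runs that domain's scraper threads strictly one after another with `request_delay` seconds between requests, while different domains are scraped concurrently. Only `start_threads_sequentially` mirrors the helper added to `scraper/scrape.py`; the `fetch` function and the `grouped_urls` mapping are illustrative stand-ins for `Scraper.scrape_info` and the output of `get_products_grouped_by_domain`.

```python
import threading
import time

REQUEST_DELAY = 2  # illustrative value; the real delay comes from scraper.Config.get_request_delay()


def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
    # Same pattern as the helper in scraper/scrape.py: run one thread at a time,
    # pausing between requests so a single website is never hit in parallel.
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(request_delay)


def fetch(domain: str, url: str) -> None:
    # Stand-in for Scraper.scrape_info (hypothetical work function).
    print(f"[{domain}] scraping {url}")


# Hypothetical products grouped by domain, mirroring the shape returned by get_products_grouped_by_domain().
grouped_urls = {
    "example.com": ["https://example.com/product/1", "https://example.com/product/2"],
    "example.org": ["https://example.org/product/3"],
}

if __name__ == "__main__":
    master_threads = []
    for domain, urls in grouped_urls.items():
        scraper_threads = [threading.Thread(target=fetch, args=(domain, url)) for url in urls]
        master_threads.append(
            threading.Thread(target=start_threads_sequentially, args=(scraper_threads, REQUEST_DELAY))
        )

    # Domains run concurrently; requests within a domain run sequentially.
    for master_thread in master_threads:
        master_thread.start()
    for master_thread in master_threads:
        master_thread.join()
```

Because each master thread joins a scraper thread before sleeping, a domain never sees two concurrent requests, and the overall runtime is bounded by the slowest domain group rather than by the roughly summed delays that the previous single start-up loop imposed.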