diff --git a/main.py b/main.py
index da3d1596..4382183b 100644
--- a/main.py
+++ b/main.py
@@ -3,8 +3,11 @@
 import logging.config
 import logging
 import time
+import alive_progress
 
 import scraper
 
+alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)
+
 def main() -> None:
     args = scraper.argparse_setup()
@@ -49,11 +52,14 @@ def scrape() -> None:
     # Create instances of class "Scraper"
     products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]
 
-    # Scrape and save scraped data for each product (sequentially)
-    for product in products:
-        time.sleep(request_delay)
-        product.scrape_info()
-        product.save_info()
+    with alive_progress.alive_bar(len(products), title="Scraping") as bar:
+        # Scrape and save scraped data for each product (sequentially)
+        for product in products:
+            bar.text = f"-> {product.url}"
+            time.sleep(request_delay)
+            product.scrape_info()
+            product.save_info()
+            bar()
 
 
 def scrape_with_threads() -> None:
@@ -72,21 +78,22 @@ def scrape_with_threads() -> None:
         scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
         grouped_scraper_threads.append(scraper_threads)
 
-    # Create master threads to manage scraper threads sequentially for each domain
-    master_threads = [
-        threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay])
-        for scraper_threads in grouped_scraper_threads
-    ]
+    products_flatten = [product for products in grouped_products.values() for product in products]
 
-    # Start all master threads
-    for master_thread in master_threads:
-        master_thread.start()
+    with alive_progress.alive_bar(len(products_flatten), title="Scraping with threads") as progress_bar:
+        # Create master threads to manage scraper threads sequentially for each domain
+        master_threads = [
+            threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay, progress_bar])
+            for scraper_threads in grouped_scraper_threads
+        ]
 
-    # Wait for all master threads to finish
-    for master_thread in master_threads:
-        master_thread.join()
+        # Start all master threads
+        for master_thread in master_threads:
+            master_thread.start()
 
-    products_flatten = [product for products in grouped_products.values() for product in products]
+        # Wait for all master threads to finish
+        for master_thread in master_threads:
+            master_thread.join()
 
     # Save scraped data for each product (sequentially)
     for product in products_flatten:
diff --git a/requirements.txt b/requirements.txt
index cf44c310..0ee0a012 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ plotly>=4.12.0
 pandas>=1.1.3
 pytest>=7.1.2
 pytest-mock>=3.8.2
+alive-progress>=2.4.1
diff --git a/scraper/scrape.py b/scraper/scrape.py
index 881a420f..f8d40292 100644
--- a/scraper/scrape.py
+++ b/scraper/scrape.py
@@ -74,8 +74,11 @@ def add_product_datapoint(product_data: dict, price: float) -> None:
     product_datapoints.append(new_datapoint)
 
 
-def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
+def start_threads_sequentially(threads: list[threading.Thread], request_delay: int, progress_bar=None) -> None:
     for thread in threads:
         thread.start()
         thread.join()
         time.sleep(request_delay)
+
+        if progress_bar:
+            progress_bar()
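
A minimal, self-contained sketch of the pattern this patch wires in, runnable outside the repo. The `work` function, the `groups` dict, and the sleep durations are hypothetical stand-ins for scraper.Scraper, grouped_products, and request_delay; the config call and the bar handling mirror main.py and scraper/scrape.py above (see the alive-progress docs for the exact semantics of each set_global flag; dual_line puts bar.text on its own line below the bar):

    import threading
    import time

    import alive_progress

    # Same global bar config as main.py
    alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)


    def start_threads_sequentially(threads, request_delay, progress_bar=None):
        # One worker thread at a time per domain; tick the shared bar once per
        # finished thread, as the patched scraper.start_threads_sequentially does
        for thread in threads:
            thread.start()
            thread.join()
            time.sleep(request_delay)

            if progress_bar:
                progress_bar()


    def work(name):  # stand-in for Scraper.scrape_info
        time.sleep(0.2)


    groups = {"example.com": ["p1", "p2"], "example.org": ["p3"]}
    total = sum(len(names) for names in groups.values())  # bar total = number of products

    with alive_progress.alive_bar(total, title="Scraping with threads") as progress_bar:
        # One master thread per domain, all ticking the one shared bar,
        # just like the scrape_with_threads() change above
        masters = [
            threading.Thread(
                target=start_threads_sequentially,
                args=[[threading.Thread(target=work, args=[name]) for name in names], 0, progress_bar],
            )
            for names in groups.values()
        ]
        for master in masters:
            master.start()
        for master in masters:
            master.join()

Passing the bar into the existing helper as an optional trailing parameter keeps start_threads_sequentially backward compatible: callers that pass nothing get the old behavior, since the tick is guarded by `if progress_bar`.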