From 2757fac1bc44bfce95dfcf1ddf4b4a74eda5542a Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Thu, 6 Oct 2022 22:55:06 +0200
Subject: [PATCH 1/4] Add alive_progress

- Known issue with my current logging configs and alive_progress

---
 main.py          | 39 ++++++++++++++++++++++++++++-----------
 requirements.txt |  1 +
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/main.py b/main.py
index 7c0752e3..f5e14179 100644
--- a/main.py
+++ b/main.py
@@ -2,8 +2,11 @@ import logging.config
 import logging
 import time
+import alive_progress
 
 import scraper
 
+alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)
+
 
 def main():
     args = scraper.argparse_setup()
@@ -49,10 +52,19 @@ def scrape():
     products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]
 
     # Scrape and save scraped data for each product (sequentially)
-    for product in products:
-        time.sleep(request_delay)
-        product.scrape_info()
-        product.save_info()
+    # for product in products:
+    #     time.sleep(request_delay)
+    #     product.scrape_info()
+    #     product.save_info()
+
+    with alive_progress.alive_bar(len(products), title="Scraping") as bar:
+        # Scrape and save scraped data for each product (sequentially)
+        for product in products:
+            bar.text = f"-> {product.url}"
+            time.sleep(request_delay)
+            product.scrape_info()
+            product.save_info()
+            bar()
 
 
 def scrape_with_threads():
@@ -67,14 +79,16 @@ def scrape_with_threads():
     # Create threads
     threads = [threading.Thread(target=product.scrape_info) for product in products]
 
-    # Start scraping on all threads
-    for thread in threads:
-        time.sleep(request_delay)
-        thread.start()
+    with alive_progress.alive_bar(len(products), title="Scraping") as bar:
+        # Start scraping on all threads
+        for thread in threads:
+            time.sleep(request_delay)
+            thread.start()
 
-    # Wait for all threads to finish
-    for thread in threads:
-        thread.join()
+        # Wait for all threads to finish
+        for thread in threads:
+            thread.join()
+            bar()
 
     # Save scraped data for each product (sequentially)
     for product in products:
@@ -82,6 +96,9 @@
 
 
 if __name__ == "__main__":
+
+    # DON'T MERGE WITH MASTER BRANCH: KNOWN ISSUE: https://github.com/rsalmei/alive-progress/issues/155
+    # alive_progress crashes with the below logging config settings
     logging.config.fileConfig(
         fname=scraper.Filemanager.logging_ini_path,
         defaults={"logfilename": scraper.Filemanager.logfile_path},

diff --git a/requirements.txt b/requirements.txt
index cf44c310..0ee0a012 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ plotly>=4.12.0
 pandas>=1.1.3
 pytest>=7.1.2
 pytest-mock>=3.8.2
+alive-progress>=2.4.1

From d1619f39424e3a54b8dbe9c3a6b5b07843622915 Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Mon, 2 Jan 2023 23:30:37 +0100
Subject: [PATCH 2/4] With version 3.0 of alive-progress, there is no issue with my logging setup and alive-progress

---
 main.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/main.py b/main.py
index 8a359f77..bf83c4a6 100644
--- a/main.py
+++ b/main.py
@@ -96,9 +96,6 @@
 
 
 if __name__ == "__main__":
-
-    # DON'T MERGE WITH MASTER BRANCH: KNOWN ISSUE: https://github.com/rsalmei/alive-progress/issues/155
-    # alive_progress crashes with the below logging config settings
     logging.config.fileConfig(
         fname=scraper.Filemanager.logging_ini_path,
         defaults={"logfilename": scraper.Filemanager.logfile_path},
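Note: the following standalone sketch (not part of the patches) shows the alive_progress pattern PATCH 1 adopts — a one-time config_handler.set_global() call, then an alive_bar context manager whose yielded bar is ticked once per item. The scrape_one function and example URLs are illustrative stand-ins, not names from this repository:

    import time

    import alive_progress

    # One-time global defaults, mirroring the call added to main.py.
    alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)


    def scrape_one(url: str) -> None:
        """Illustrative stand-in for Scraper.scrape_info() + save_info()."""
        time.sleep(0.5)


    urls = ["https://example.com/a", "https://example.com/b", "https://example.com/c"]

    # alive_bar is told the expected total up front; the yielded `bar` is
    # called once per finished item, and `bar.text` sets the status line.
    with alive_progress.alive_bar(len(urls), title="Scraping") as bar:
        for url in urls:
            bar.text = f"-> {url}"
            scrape_one(url)
            bar()

With dual_line=True, the bar.text status renders on its own line beneath the bar, which is why the patch can afford to put a full per-product URL there.
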
From 41510f0d22af84ba129160f7d6d2a8bfec94b8f6 Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Mon, 30 Jan 2023 19:57:12 +0100
Subject: [PATCH 3/4] Update functions scrape_with_threads and start_threads_sequentially

Add alive_progress context manager to function scrape_with_threads

Add parameter to function start_threads_sequentially to pass an alive_progress bar to it

---
 main.py           | 25 +++++++++++++------------
 scraper/scrape.py |  5 ++++-
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/main.py b/main.py
index f750a1fb..d5b2515a 100644
--- a/main.py
+++ b/main.py
@@ -84,21 +84,22 @@ def scrape_with_threads() -> None:
         scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
         grouped_scraper_threads.append(scraper_threads)
 
-    # Create master threads to manage scraper threads sequentially for each domain
-    master_threads = [
-        threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay])
-        for scraper_threads in grouped_scraper_threads
-    ]
+    products_flatten = [product for products in grouped_products.values() for product in products]
 
-    # Start all master threads
-    for master_thread in master_threads:
-        master_thread.start()
+    with alive_progress.alive_bar(len(products_flatten), title="Scraping with threads") as progress_bar:
+        # Create master threads to manage scraper threads sequentially for each domain
+        master_threads = [
+            threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay, progress_bar])
+            for scraper_threads in grouped_scraper_threads
+        ]
 
-    # Wait for all master threads to finish
-    for master_thread in master_threads:
-        master_thread.join()
+        # Start all master threads
+        for master_thread in master_threads:
+            master_thread.start()
 
-    products_flatten = [product for products in grouped_products.values() for product in products]
+        # Wait for all master threads to finish
+        for master_thread in master_threads:
+            master_thread.join()
 
     # Save scraped data for each product (sequentially)
     for product in products_flatten:

diff --git a/scraper/scrape.py b/scraper/scrape.py
index 881a420f..f8d40292 100644
--- a/scraper/scrape.py
+++ b/scraper/scrape.py
@@ -74,8 +74,11 @@ def add_product_datapoint(product_data: dict, price: float) -> None:
     product_datapoints.append(new_datapoint)
 
 
-def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
+def start_threads_sequentially(threads: list[threading.Thread], request_delay: int, progress_bar=None) -> None:
     for thread in threads:
         thread.start()
         thread.join()
         time.sleep(request_delay)
+
+        if progress_bar:
+            progress_bar()

From 119f40ea41b0119cdb5bb6f9285b93bc34070994 Mon Sep 17 00:00:00 2001
From: Crinibus <57172157+Crinibus@users.noreply.github.com>
Date: Mon, 30 Jan 2023 19:58:16 +0100
Subject: [PATCH 4/4] Delete commented out code

---
 main.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/main.py b/main.py
index d5b2515a..4382183b 100644
--- a/main.py
+++ b/main.py
@@ -52,12 +52,6 @@ def scrape() -> None:
     # Create instances of class "Scraper"
     products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]
 
-    # Scrape and save scraped data for each product (sequentially)
-    # for product in products:
-    #     time.sleep(request_delay)
-    #     product.scrape_info()
-    #     product.save_info()
-
     with alive_progress.alive_bar(len(products), title="Scraping") as bar:
         # Scrape and save scraped data for each product (sequentially)
         for product in products:
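Note: a self-contained sketch (not part of the patches) of the pattern PATCH 3 introduces — the bar is handed to each master thread as an optional callable and ticked once per finished scraper thread. fake_scrape, the group sizes, and the zero delay are illustrative stand-ins; start_threads_sequentially mirrors the signature added in scraper/scrape.py:

    import threading
    import time

    import alive_progress


    def start_threads_sequentially(threads: list[threading.Thread], request_delay: int, progress_bar=None) -> None:
        # Same shape as the function in scraper/scrape.py after PATCH 3: run each
        # thread to completion before the next, ticking the shared bar per thread.
        for thread in threads:
            thread.start()
            thread.join()
            time.sleep(request_delay)

            if progress_bar:
                progress_bar()


    def fake_scrape() -> None:
        """Illustrative stand-in for Scraper.scrape_info."""
        time.sleep(0.2)


    # Two "domains" with three products each, represented as thread groups.
    grouped_threads = [[threading.Thread(target=fake_scrape) for _ in range(3)] for _ in range(2)]
    total_products = sum(len(group) for group in grouped_threads)

    with alive_progress.alive_bar(total_products, title="Scraping with threads") as progress_bar:
        # One master thread per group, all sharing the same bar.
        master_threads = [
            threading.Thread(target=start_threads_sequentially, args=[group, 0, progress_bar])
            for group in grouped_threads
        ]

        for master_thread in master_threads:
            master_thread.start()

        for master_thread in master_threads:
            master_thread.join()

Since every master thread ticks the same bar object, this leans on bar() being safe to call from multiple threads — the same assumption the patch itself makes.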