Merge pull request #182 from Crinibus/add-progress-bar
Add alive-progress progress bar when scraping
Crinibus authored Jan 30, 2023
2 parents 7fd8570 + 119f40e commit 6443514
Showing 3 changed files with 29 additions and 18 deletions.
41 changes: 24 additions & 17 deletions main.py
@@ -3,8 +3,11 @@
import logging.config
import logging
import time
+import alive_progress
import scraper

+alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)


def main() -> None:
    args = scraper.argparse_setup()
@@ -49,11 +52,14 @@ def scrape() -> None:
    # Create instances of class "Scraper"
    products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]

-    # Scrape and save scraped data for each product (sequentially)
-    for product in products:
-        time.sleep(request_delay)
-        product.scrape_info()
-        product.save_info()
+    with alive_progress.alive_bar(len(products), title="Scraping") as bar:
+        # Scrape and save scraped data for each product (sequentially)
+        for product in products:
+            bar.text = f"-> {product.url}"
+            time.sleep(request_delay)
+            product.scrape_info()
+            product.save_info()
+            bar()


def scrape_with_threads() -> None:
@@ -72,21 +78,22 @@ def scrape_with_threads() -> None:
    scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
    grouped_scraper_threads.append(scraper_threads)

-    # Create master threads to manage scraper threads sequentially for each domain
-    master_threads = [
-        threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay])
-        for scraper_threads in grouped_scraper_threads
-    ]
+    products_flatten = [product for products in grouped_products.values() for product in products]

-    # Start all master threads
-    for master_thread in master_threads:
-        master_thread.start()
+    with alive_progress.alive_bar(len(products_flatten), title="Scraping with threads") as progress_bar:
+        # Create master threads to manage scraper threads sequentially for each domain
+        master_threads = [
+            threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay, progress_bar])
+            for scraper_threads in grouped_scraper_threads
+        ]

-    # Wait for all master threads to finish
-    for master_thread in master_threads:
-        master_thread.join()
+        # Start all master threads
+        for master_thread in master_threads:
+            master_thread.start()

-    products_flatten = [product for products in grouped_products.values() for product in products]
+        # Wait for all master threads to finish
+        for master_thread in master_threads:
+            master_thread.join()

    # Save scraped data for each product (sequentially)
    for product in products_flatten:
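For context, here is a minimal standalone sketch of the sequential pattern this diff introduces. The Product class, URLs, and delay value are hypothetical stand-ins for the repository's scraper.Scraper objects; the alive_progress calls mirror the committed code:

import time

import alive_progress

# Same global settings as the commit: keep Ctrl-C from breaking the bar's
# rendering, reserve a second line for per-item text, classic theme, no stats.
alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)


class Product:  # hypothetical stand-in for scraper.Scraper
    def __init__(self, url: str) -> None:
        self.url = url

    def scrape_info(self) -> None:
        time.sleep(0.2)  # pretend to fetch and parse a product page


products = [Product("https://example.com/a"), Product("https://example.com/b")]
request_delay = 1  # seconds between requests

with alive_progress.alive_bar(len(products), title="Scraping") as bar:
    for product in products:
        bar.text = f"-> {product.url}"  # dual_line=True shows this below the bar
        time.sleep(request_delay)
        product.scrape_info()
        bar()  # advance the bar by one finished product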
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ plotly>=4.12.0
pandas>=1.1.3
pytest>=7.1.2
pytest-mock>=3.8.2
+alive-progress>=2.4.1
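To pick up the new dependency after pulling this commit, reinstalling from the pinned requirements file should suffice, assuming the usual pip workflow:

pip install -r requirements.txt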
5 changes: 4 additions & 1 deletion scraper/scrape.py
@@ -74,8 +74,11 @@ def add_product_datapoint(product_data: dict, price: float) -> None:
    product_datapoints.append(new_datapoint)


-def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
+def start_threads_sequentially(threads: list[threading.Thread], request_delay: int, progress_bar=None) -> None:
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(request_delay)

+        if progress_bar:
+            progress_bar()
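Putting the pieces together, a self-contained sketch of the updated helper: each master thread runs its domain's scrapers one at a time and ticks the shared bar handle once per finished thread, as the commit does. The fake_scrape worker and thread count are illustrative only:

import threading
import time

from alive_progress import alive_bar


def start_threads_sequentially(threads: list[threading.Thread], request_delay: int, progress_bar=None) -> None:
    # Run each thread to completion before starting the next, sleeping
    # between requests; tick the shared progress bar when one is given.
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(request_delay)

        if progress_bar:
            progress_bar()


def fake_scrape() -> None:  # hypothetical worker standing in for Scraper.scrape_info
    time.sleep(0.2)


threads = [threading.Thread(target=fake_scrape) for _ in range(4)]

with alive_bar(len(threads), title="Scraping with threads") as progress_bar:
    start_threads_sequentially(threads, request_delay=1, progress_bar=progress_bar)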
