Merge pull request #182 from Crinibus/add-progress-bar
Add alive-progress progress bar when scraping
Crinibus authored Jan 30, 2023
2 parents 7fd8570 + 119f40e commit 6443514
Showing 3 changed files with 29 additions and 18 deletions.
41 changes: 24 additions & 17 deletions main.py
@@ -3,8 +3,11 @@
import logging.config
import logging
import time
+import alive_progress
import scraper

+alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)


def main() -> None:
    args = scraper.argparse_setup()
@@ -49,11 +52,14 @@ def scrape() -> None:
    # Create instances of class "Scraper"
    products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]

-    # Scrape and save scraped data for each product (sequentially)
-    for product in products:
-        time.sleep(request_delay)
-        product.scrape_info()
-        product.save_info()
+    with alive_progress.alive_bar(len(products), title="Scraping") as bar:
+        # Scrape and save scraped data for each product (sequentially)
+        for product in products:
+            bar.text = f"-> {product.url}"
+            time.sleep(request_delay)
+            product.scrape_info()
+            product.save_info()
+            bar()


def scrape_with_threads() -> None:
@@ -72,21 +78,22 @@ def scrape_with_threads() -> None:
    scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
    grouped_scraper_threads.append(scraper_threads)

-    # Create master threads to manage scraper threads sequentially for each domain
-    master_threads = [
-        threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay])
-        for scraper_threads in grouped_scraper_threads
-    ]
+    products_flatten = [product for products in grouped_products.values() for product in products]

-    # Start all master threads
-    for master_thread in master_threads:
-        master_thread.start()
+    with alive_progress.alive_bar(len(products_flatten), title="Scraping with threads") as progress_bar:
+        # Create master threads to manage scraper threads sequentially for each domain
+        master_threads = [
+            threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay, progress_bar])
+            for scraper_threads in grouped_scraper_threads
+        ]

-    # Wait for all master threads to finish
-    for master_thread in master_threads:
-        master_thread.join()
+        # Start all master threads
+        for master_thread in master_threads:
+            master_thread.start()

-    products_flatten = [product for products in grouped_products.values() for product in products]
+        # Wait for all master threads to finish
+        for master_thread in master_threads:
+            master_thread.join()

    # Save scraped data for each product (sequentially)
    for product in products_flatten:
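For context, here is a minimal standalone sketch of the sequential pattern this diff introduces. The Product class, URLs, and delay value are hypothetical stand-ins for the repository's scraper.Scraper objects; the alive_progress calls mirror the committed code:

import time

import alive_progress

# Same global settings as the commit: keep Ctrl-C from breaking the bar's
# rendering, reserve a second line for per-item text, classic theme, no stats.
alive_progress.config_handler.set_global(ctrl_c=False, dual_line=True, theme="classic", stats=False)


class Product:  # hypothetical stand-in for scraper.Scraper
    def __init__(self, url: str) -> None:
        self.url = url

    def scrape_info(self) -> None:
        time.sleep(0.2)  # pretend to fetch and parse a product page


products = [Product("https://example.com/a"), Product("https://example.com/b")]
request_delay = 1  # seconds between requests

with alive_progress.alive_bar(len(products), title="Scraping") as bar:
    for product in products:
        bar.text = f"-> {product.url}"  # dual_line=True shows this below the bar
        time.sleep(request_delay)
        product.scrape_info()
        bar()  # advance the bar by one finished product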
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ plotly>=4.12.0
pandas>=1.1.3
pytest>=7.1.2
pytest-mock>=3.8.2
+alive-progress>=2.4.1
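To pick up the new dependency after pulling this commit, reinstalling from the pinned requirements file should suffice, assuming the usual pip workflow:

pip install -r requirements.txt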
5 changes: 4 additions & 1 deletion scraper/scrape.py
@@ -74,8 +74,11 @@ def add_product_datapoint(product_data: dict, price: float) -> None:
    product_datapoints.append(new_datapoint)


-def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
+def start_threads_sequentially(threads: list[threading.Thread], request_delay: int, progress_bar=None) -> None:
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(request_delay)

+        if progress_bar:
+            progress_bar()
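Putting the pieces together, a self-contained sketch of the updated helper: each master thread runs its domain's scrapers one at a time and ticks the shared bar handle once per finished thread, as the commit does. The fake_scrape worker and thread count are illustrative only:

import threading
import time

from alive_progress import alive_bar


def start_threads_sequentially(threads: list[threading.Thread], request_delay: int, progress_bar=None) -> None:
    # Run each thread to completion before starting the next, sleeping
    # between requests; tick the shared progress bar when one is given.
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(request_delay)

        if progress_bar:
            progress_bar()


def fake_scrape() -> None:  # hypothetical worker standing in for Scraper.scrape_info
    time.sleep(0.2)


threads = [threading.Thread(target=fake_scrape) for _ in range(4)]

with alive_bar(len(threads), title="Scraping with threads") as progress_bar:
    start_threads_sequentially(threads, request_delay=1, progress_bar=progress_bar)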
