
Merge pull request #192 from Crinibus/update-scrape-with-threads-group-products-df-by-domains

Optimize scraping with threads
Crinibus authored Jan 30, 2023
2 parents f778301 + 58e12dd commit 7fd8570
Showing 5 changed files with 123 additions and 68 deletions.
36 changes: 24 additions & 12 deletions main.py
@@ -1,3 +1,4 @@
+from typing import List
 import threading
 import logging.config
 import logging
@@ -59,25 +60,36 @@ def scrape_with_threads() -> None:
print("Scraping with threads...")

request_delay = scraper.Config.get_request_delay()

products_df = scraper.Filemanager.get_products_data()
domain_grouped_products_df = scraper.get_products_df_grouped_by_domains(products_df)
grouped_products = scraper.get_products_grouped_by_domain(domain_grouped_products_df)

# Create instances of class "Scraper"
products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]
grouped_scraper_threads: List[List[threading.Thread]] = []

# Create threads
threads = [threading.Thread(target=product.scrape_info) for product in products]
# Create scraper threads and group by domain
for products in grouped_products.values():
scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
grouped_scraper_threads.append(scraper_threads)

# Start scraping on all threads
for thread in threads:
time.sleep(request_delay)
thread.start()
# Create master threads to manage scraper threads sequentially for each domain
master_threads = [
threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay])
for scraper_threads in grouped_scraper_threads
]

# Wait for all threads to finish
for thread in threads:
thread.join()
# Start all master threads
for master_thread in master_threads:
master_thread.start()

# Wait for all master threads to finish
for master_thread in master_threads:
master_thread.join()

products_flatten = [product for products in grouped_products.values() for product in products]

# Save scraped data for each product (sequentially)
for product in products:
for product in products_flatten:
product.save_info()


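Note: to make the new control flow in scrape_with_threads easier to follow, here is a minimal, self-contained sketch of the same pattern. It is not the repository's code; the domains, product names, delay value and fake_scrape function are made up, and run_sequentially plays the role of scraper.start_threads_sequentially. One master thread per domain keeps requests to that domain sequential while the domains themselves run in parallel.

import threading
import time

REQUEST_DELAY = 1  # stand-in for scraper.Config.get_request_delay()


def fake_scrape(domain: str, product: str) -> None:
    # placeholder for Scraper.scrape_info()
    print(f"scraping {product} from {domain}")


def run_sequentially(threads: list, delay: int) -> None:
    # one request at a time per domain, with a pause between requests
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(delay)


grouped_products = {
    "example.com": ["gpu-1", "gpu-2"],
    "shop.test": ["ssd-1"],
}

# one list of scraper threads per domain
grouped_threads = [
    [threading.Thread(target=fake_scrape, args=(domain, product)) for product in products]
    for domain, products in grouped_products.items()
]

# one master thread per domain: domains run in parallel but stay sequential internally
master_threads = [
    threading.Thread(target=run_sequentially, args=(threads, REQUEST_DELAY)) for threads in grouped_threads
]

for master in master_threads:
    master.start()
for master in master_threads:
    master.join()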
5 changes: 3 additions & 2 deletions scraper/__init__.py
@@ -1,4 +1,4 @@
-from .scrape import Scraper
+from .scrape import Scraper, start_threads_sequentially
 from .arguments import argparse_setup
 from .add_product import add_products
 from .filemanager import Filemanager, Config
@@ -7,7 +7,8 @@
 from .delete_data import delete
 from .reset_data import reset
 from .search_data import search
-from .misc import print_latest_datapoints, print_all_products
+from .print_products import print_latest_datapoints, print_all_products
+from .misc import get_products_df_grouped_by_domains, get_products_grouped_by_domain


 __author__ = "Crinibus"
77 changes: 23 additions & 54 deletions scraper/misc.py
@@ -1,64 +1,33 @@
-from typing import Iterator, List, Tuple
+import pandas as pd
+from pandas.core.groupby.generic import DataFrameGroupBy

-from scraper.filemanager import Filemanager
+from scraper.scrape import Scraper
+from scraper.domains import get_website_name


-def print_latest_datapoints(names: List[str], ids: List[str]) -> None:
-    records_data = Filemanager.get_record_data()
+def add_dataframe_column(df: pd.DataFrame, column_name: str, column_data: list[any]) -> pd.DataFrame:
+    df[column_name] = column_data
+    return df

-    if names:
-        print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----")
-        for name in names:
-            print(name.upper())
-            # iterate the different websites the product with the specified name is scraped from
-            for website_name, website_dict in get_product_info_with_name(name, records_data):
-                print_latest_datapoint(website_name, website_dict)
-            print()

-    if ids:
-        print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----")
-        for id in ids:
-            product_name, website_name, website_dict = get_product_info_with_id(id, records_data)
-            print(product_name.upper())
-            print_latest_datapoint(website_name, website_dict)
-            print()
+def group_df(df: pd.DataFrame, column_name: str, group_keys: bool) -> DataFrameGroupBy:
+    grouped_df = df.groupby(column_name, group_keys=group_keys)
+    return grouped_df


-def get_product_info_with_name(name: str, records_data: dict) -> Iterator[Tuple[str, str, dict]]:
-    for category_dict in records_data.values():
-        for product_name, product_dict in category_dict.items():
-            if not product_name.lower() == name.lower():
-                continue
-            for website_name, website_dict in product_dict.items():
-                yield website_name, website_dict
+def get_products_df_grouped_by_domains(products_df: pd.DataFrame) -> DataFrameGroupBy:
+    domain_names = [get_website_name(url) for url in products_df["url"]]
+    df = add_dataframe_column(products_df, "domain", domain_names)
+    grouped_df = group_df(df, "domain", True)
+    return grouped_df


-def get_product_info_with_id(id: str, records_data: dict) -> Tuple[str, str, dict]:
-    for category_dict in records_data.values():
-        for product_name, product_dict in category_dict.items():
-            for website_name, website_dict in product_dict.items():
-                if website_dict["info"]["id"] == id:
-                    return product_name, website_name, website_dict
+def get_products_grouped_by_domain(grouped_products_df: DataFrameGroupBy) -> dict[str, list[Scraper]]:
+    domains_dict: dict[str, list[Scraper]] = {}

-
-def print_latest_datapoint(website_name: str, website_dict: dict) -> None:
-    id = website_dict["info"]["id"]
-    currency = website_dict["info"]["currency"]
-    latest_datapoint = website_dict["datapoints"][-1]
-    date = latest_datapoint["date"]
-    price = latest_datapoint["price"]
-    print(f"> {website_name.capitalize()} - {id}\n - {currency} {price}\n - {date}")
-
-
-def print_all_products() -> None:
-    records_data = Filemanager.get_record_data()
-
-    print("\n----- SHOWING ALL PRODUCTS -----")
-    for category_name, category_dict in records_data.items():
-        print(category_name.upper())
-        for product_name, product_dict in category_dict.items():
-            print(f" > {product_name.upper()}")
-            for website_name, website_dict in product_dict.items():
-                product_id = website_dict["info"]["id"]
-                print(f" - {website_name.upper()} - {product_id}")
-        print()
+    for domain_name in grouped_products_df.groups:
+        group_products = grouped_products_df.get_group(domain_name)
+        domains_dict[domain_name] = [
+            Scraper(category, url) for category, url in zip(group_products["category"], group_products["url"])
+        ]
+    return domains_dict
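Note: a rough usage sketch of the grouping pattern these helpers implement, with a made-up products DataFrame. urlparse stands in for scraper.domains.get_website_name (not shown in this diff), and the dict values are plain (category, url) pairs instead of Scraper instances, so the snippet runs on its own.

from urllib.parse import urlparse

import pandas as pd

products_df = pd.DataFrame(
    {
        "category": ["gpu", "gpu", "ssd"],
        "url": [
            "https://example.com/p/1",
            "https://example.com/p/2",
            "https://shop.test/p/3",
        ],
    }
)

# same idea as get_products_df_grouped_by_domains: derive a "domain" column and group on it
products_df["domain"] = [urlparse(url).netloc for url in products_df["url"]]
grouped_df = products_df.groupby("domain", group_keys=True)

# same idea as get_products_grouped_by_domain: one list of products per domain
products_by_domain = {
    domain: list(zip(grouped_df.get_group(domain)["category"], grouped_df.get_group(domain)["url"]))
    for domain in grouped_df.groups
}
print(products_by_domain)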
64 changes: 64 additions & 0 deletions scraper/print_products.py
@@ -0,0 +1,64 @@
+from typing import Iterator
+
+from scraper.filemanager import Filemanager
+
+
+def print_latest_datapoints(names: list[str], ids: list[str]) -> None:
+    records_data = Filemanager.get_record_data()
+
+    if names:
+        print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----")
+        for name in names:
+            print(name.upper())
+            # iterate the different websites the product with the specified name is scraped from
+            for website_name, website_dict in get_product_info_with_name(name, records_data):
+                print_latest_datapoint(website_name, website_dict)
+            print()
+
+    if ids:
+        print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----")
+        for id in ids:
+            product_name, website_name, website_dict = get_product_info_with_id(id, records_data)
+            print(product_name.upper())
+            print_latest_datapoint(website_name, website_dict)
+            print()
+
+
+def get_product_info_with_name(name: str, records_data: dict) -> Iterator[tuple[str, str, dict]]:
+    for category_dict in records_data.values():
+        for product_name, product_dict in category_dict.items():
+            if not product_name.lower() == name.lower():
+                continue
+            for website_name, website_dict in product_dict.items():
+                yield website_name, website_dict
+
+
+def get_product_info_with_id(id: str, records_data: dict) -> tuple[str, str, dict]:
+    for category_dict in records_data.values():
+        for product_name, product_dict in category_dict.items():
+            for website_name, website_dict in product_dict.items():
+                if website_dict["info"]["id"] == id:
+                    return product_name, website_name, website_dict
+
+
+def print_latest_datapoint(website_name: str, website_dict: dict) -> None:
+    id = website_dict["info"]["id"]
+    currency = website_dict["info"]["currency"]
+    latest_datapoint = website_dict["datapoints"][-1]
+    date = latest_datapoint["date"]
+    price = latest_datapoint["price"]
+    print(f"> {website_name.capitalize()} - {id}\n - {currency} {price}\n - {date}")
+
+
+def print_all_products() -> None:
+    records_data = Filemanager.get_record_data()
+
+    print("\n----- SHOWING ALL PRODUCTS -----")
+    for category_name, category_dict in records_data.items():
+        print(category_name.upper())
+        for product_name, product_dict in category_dict.items():
+            print(f" > {product_name.upper()}")
+            for website_name, website_dict in product_dict.items():
+                product_id = website_dict["info"]["id"]
+                print(f" - {website_name.upper()} - {product_id}")
+        print()
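Note: these printing helpers only read a handful of fields. Judging from the lookups above, Filemanager.get_record_data() is expected to return a nested dict shaped roughly like the following; the concrete category, product, website and price values here are invented for illustration.

records_data = {
    "gpu": {  # category
        "example product": {  # product name
            "example.com": {  # website name
                "info": {"id": "123", "currency": "USD"},
                "datapoints": [
                    {"date": "2023-01-29", "price": 499.0},
                    {"date": "2023-01-30", "price": 479.0},
                ],
            },
        },
    },
}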
9 changes: 9 additions & 0 deletions scraper/scrape.py
@@ -1,3 +1,5 @@
+import time
+import threading
 import logging
 from datetime import datetime
 from scraper.domains import BaseWebsiteHandler, get_website_handler
@@ -70,3 +72,10 @@ def add_product_datapoint(product_data: dict, price: float) -> None:
latest_datapoint["price"] = price
else:
product_datapoints.append(new_datapoint)


def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
for thread in threads:
thread.start()
thread.join()
time.sleep(request_delay)
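
Note: a standalone way to see what start_threads_sequentially does. The function body below repeats the logic added in scrape.py; the worker function and the delay value are made up. Each thread only starts after the previous one has finished and the request delay has passed, which is what keeps requests to a single domain spaced out.

import threading
import time


def start_threads_sequentially(threads: list, request_delay: int) -> None:
    # same logic as the function added in scrape.py
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(request_delay)


def worker(name: str) -> None:
    print(f"{time.strftime('%H:%M:%S')} scraping {name}")


threads = [threading.Thread(target=worker, args=(f"product-{i}",)) for i in range(3)]
start_threads_sequentially(threads, request_delay=1)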
