
Merge pull request #192 from Crinibus/update-scrape-with-threads-group-products-df-by-domains

Optimize scraping with threads
Crinibus authored Jan 30, 2023
2 parents f778301 + 58e12dd commit 7fd8570
Showing 5 changed files with 123 additions and 68 deletions.
36 changes: 24 additions & 12 deletions main.py
@@ -1,3 +1,4 @@
+from typing import List
 import threading
 import logging.config
 import logging
@@ -59,25 +60,36 @@ def scrape_with_threads() -> None:
print("Scraping with threads...")

request_delay = scraper.Config.get_request_delay()

products_df = scraper.Filemanager.get_products_data()
domain_grouped_products_df = scraper.get_products_df_grouped_by_domains(products_df)
grouped_products = scraper.get_products_grouped_by_domain(domain_grouped_products_df)

# Create instances of class "Scraper"
products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]
grouped_scraper_threads: List[List[threading.Thread]] = []

# Create threads
threads = [threading.Thread(target=product.scrape_info) for product in products]
# Create scraper threads and group by domain
for products in grouped_products.values():
scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
grouped_scraper_threads.append(scraper_threads)

# Start scraping on all threads
for thread in threads:
time.sleep(request_delay)
thread.start()
# Create master threads to manage scraper threads sequentially for each domain
master_threads = [
threading.Thread(target=scraper.start_threads_sequentially, args=[scraper_threads, request_delay])
for scraper_threads in grouped_scraper_threads
]

# Wait for all threads to finish
for thread in threads:
thread.join()
# Start all master threads
for master_thread in master_threads:
master_thread.start()

# Wait for all master threads to finish
for master_thread in master_threads:
master_thread.join()

products_flatten = [product for products in grouped_products.values() for product in products]

# Save scraped data for each product (sequentially)
for product in products:
for product in products_flatten:
product.save_info()


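Note: to make the new control flow in scrape_with_threads easier to follow, here is a minimal, self-contained sketch of the same pattern. It is not the repository's code; the domains, product names, delay value and fake_scrape function are made up, and run_sequentially plays the role of scraper.start_threads_sequentially. One master thread per domain keeps requests to that domain sequential while the domains themselves run in parallel.

import threading
import time

REQUEST_DELAY = 1  # stand-in for scraper.Config.get_request_delay()


def fake_scrape(domain: str, product: str) -> None:
    # placeholder for Scraper.scrape_info()
    print(f"scraping {product} from {domain}")


def run_sequentially(threads: list, delay: int) -> None:
    # one request at a time per domain, with a pause between requests
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(delay)


grouped_products = {
    "example.com": ["gpu-1", "gpu-2"],
    "shop.test": ["ssd-1"],
}

# one list of scraper threads per domain
grouped_threads = [
    [threading.Thread(target=fake_scrape, args=(domain, product)) for product in products]
    for domain, products in grouped_products.items()
]

# one master thread per domain: domains run in parallel but stay sequential internally
master_threads = [
    threading.Thread(target=run_sequentially, args=(threads, REQUEST_DELAY)) for threads in grouped_threads
]

for master in master_threads:
    master.start()
for master in master_threads:
    master.join()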
5 changes: 3 additions & 2 deletions scraper/__init__.py
@@ -1,4 +1,4 @@
-from .scrape import Scraper
+from .scrape import Scraper, start_threads_sequentially
 from .arguments import argparse_setup
 from .add_product import add_products
 from .filemanager import Filemanager, Config
@@ -7,7 +7,8 @@
 from .delete_data import delete
 from .reset_data import reset
 from .search_data import search
-from .misc import print_latest_datapoints, print_all_products
+from .print_products import print_latest_datapoints, print_all_products
+from .misc import get_products_df_grouped_by_domains, get_products_grouped_by_domain


 __author__ = "Crinibus"
77 changes: 23 additions & 54 deletions scraper/misc.py
@@ -1,64 +1,33 @@
-from typing import Iterator, List, Tuple
+import pandas as pd
+from pandas.core.groupby.generic import DataFrameGroupBy

-from scraper.filemanager import Filemanager
+from scraper.scrape import Scraper
+from scraper.domains import get_website_name


-def print_latest_datapoints(names: List[str], ids: List[str]) -> None:
-    records_data = Filemanager.get_record_data()
+def add_dataframe_column(df: pd.DataFrame, column_name: str, column_data: list[any]) -> pd.DataFrame:
+    df[column_name] = column_data
+    return df

-    if names:
-        print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----")
-        for name in names:
-            print(name.upper())
-            # iterate the different websites the product with the specified name is scraped from
-            for website_name, website_dict in get_product_info_with_name(name, records_data):
-                print_latest_datapoint(website_name, website_dict)
-            print()

-    if ids:
-        print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----")
-        for id in ids:
-            product_name, website_name, website_dict = get_product_info_with_id(id, records_data)
-            print(product_name.upper())
-            print_latest_datapoint(website_name, website_dict)
-            print()
+def group_df(df: pd.DataFrame, column_name: str, group_keys: bool) -> DataFrameGroupBy:
+    grouped_df = df.groupby(column_name, group_keys=group_keys)
+    return grouped_df


-def get_product_info_with_name(name: str, records_data: dict) -> Iterator[Tuple[str, str, dict]]:
-    for category_dict in records_data.values():
-        for product_name, product_dict in category_dict.items():
-            if not product_name.lower() == name.lower():
-                continue
-            for website_name, website_dict in product_dict.items():
-                yield website_name, website_dict
+def get_products_df_grouped_by_domains(products_df: pd.DataFrame) -> DataFrameGroupBy:
+    domain_names = [get_website_name(url) for url in products_df["url"]]
+    df = add_dataframe_column(products_df, "domain", domain_names)
+    grouped_df = group_df(df, "domain", True)
+    return grouped_df


-def get_product_info_with_id(id: str, records_data: dict) -> Tuple[str, str, dict]:
-    for category_dict in records_data.values():
-        for product_name, product_dict in category_dict.items():
-            for website_name, website_dict in product_dict.items():
-                if website_dict["info"]["id"] == id:
-                    return product_name, website_name, website_dict
+def get_products_grouped_by_domain(grouped_products_df: DataFrameGroupBy) -> dict[str, list[Scraper]]:
+    domains_dict: dict[str, list[Scraper]] = {}

-
-def print_latest_datapoint(website_name: str, website_dict: dict) -> None:
-    id = website_dict["info"]["id"]
-    currency = website_dict["info"]["currency"]
-    latest_datapoint = website_dict["datapoints"][-1]
-    date = latest_datapoint["date"]
-    price = latest_datapoint["price"]
-    print(f"> {website_name.capitalize()} - {id}\n - {currency} {price}\n - {date}")
-
-
-def print_all_products() -> None:
-    records_data = Filemanager.get_record_data()
-
-    print("\n----- SHOWING ALL PRODUCTS -----")
-    for category_name, category_dict in records_data.items():
-        print(category_name.upper())
-        for product_name, product_dict in category_dict.items():
-            print(f" > {product_name.upper()}")
-            for website_name, website_dict in product_dict.items():
-                product_id = website_dict["info"]["id"]
-                print(f" - {website_name.upper()} - {product_id}")
-        print()
+    for domain_name in grouped_products_df.groups:
+        group_products = grouped_products_df.get_group(domain_name)
+        domains_dict[domain_name] = [
+            Scraper(category, url) for category, url in zip(group_products["category"], group_products["url"])
+        ]
+    return domains_dict
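Note: a rough usage sketch of the grouping pattern these helpers implement, with a made-up products DataFrame. urlparse stands in for scraper.domains.get_website_name (not shown in this diff), and the dict values are plain (category, url) pairs instead of Scraper instances, so the snippet runs on its own.

from urllib.parse import urlparse

import pandas as pd

products_df = pd.DataFrame(
    {
        "category": ["gpu", "gpu", "ssd"],
        "url": [
            "https://example.com/p/1",
            "https://example.com/p/2",
            "https://shop.test/p/3",
        ],
    }
)

# same idea as get_products_df_grouped_by_domains: derive a "domain" column and group on it
products_df["domain"] = [urlparse(url).netloc for url in products_df["url"]]
grouped_df = products_df.groupby("domain", group_keys=True)

# same idea as get_products_grouped_by_domain: one list of products per domain
products_by_domain = {
    domain: list(zip(grouped_df.get_group(domain)["category"], grouped_df.get_group(domain)["url"]))
    for domain in grouped_df.groups
}
print(products_by_domain)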
64 changes: 64 additions & 0 deletions scraper/print_products.py
@@ -0,0 +1,64 @@
+from typing import Iterator
+
+from scraper.filemanager import Filemanager
+
+
+def print_latest_datapoints(names: list[str], ids: list[str]) -> None:
+    records_data = Filemanager.get_record_data()
+
+    if names:
+        print("\n----- SHOWING LATEST DATAPOINT FOR NAME(s) -----")
+        for name in names:
+            print(name.upper())
+            # iterate the different websites the product with the specified name is scraped from
+            for website_name, website_dict in get_product_info_with_name(name, records_data):
+                print_latest_datapoint(website_name, website_dict)
+            print()
+
+    if ids:
+        print("\n----- SHOWING LATEST DATAPOINT FOR ID(s) -----")
+        for id in ids:
+            product_name, website_name, website_dict = get_product_info_with_id(id, records_data)
+            print(product_name.upper())
+            print_latest_datapoint(website_name, website_dict)
+            print()
+
+
+def get_product_info_with_name(name: str, records_data: dict) -> Iterator[tuple[str, str, dict]]:
+    for category_dict in records_data.values():
+        for product_name, product_dict in category_dict.items():
+            if not product_name.lower() == name.lower():
+                continue
+            for website_name, website_dict in product_dict.items():
+                yield website_name, website_dict
+
+
+def get_product_info_with_id(id: str, records_data: dict) -> tuple[str, str, dict]:
+    for category_dict in records_data.values():
+        for product_name, product_dict in category_dict.items():
+            for website_name, website_dict in product_dict.items():
+                if website_dict["info"]["id"] == id:
+                    return product_name, website_name, website_dict
+
+
+def print_latest_datapoint(website_name: str, website_dict: dict) -> None:
+    id = website_dict["info"]["id"]
+    currency = website_dict["info"]["currency"]
+    latest_datapoint = website_dict["datapoints"][-1]
+    date = latest_datapoint["date"]
+    price = latest_datapoint["price"]
+    print(f"> {website_name.capitalize()} - {id}\n - {currency} {price}\n - {date}")
+
+
+def print_all_products() -> None:
+    records_data = Filemanager.get_record_data()
+
+    print("\n----- SHOWING ALL PRODUCTS -----")
+    for category_name, category_dict in records_data.items():
+        print(category_name.upper())
+        for product_name, product_dict in category_dict.items():
+            print(f" > {product_name.upper()}")
+            for website_name, website_dict in product_dict.items():
+                product_id = website_dict["info"]["id"]
+                print(f" - {website_name.upper()} - {product_id}")
+        print()
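Note: these printing helpers only read a handful of fields. Judging from the lookups above, Filemanager.get_record_data() is expected to return a nested dict shaped roughly like the following; the concrete category, product, website and price values here are invented for illustration.

records_data = {
    "gpu": {  # category
        "example product": {  # product name
            "example.com": {  # website name
                "info": {"id": "123", "currency": "USD"},
                "datapoints": [
                    {"date": "2023-01-29", "price": 499.0},
                    {"date": "2023-01-30", "price": 479.0},
                ],
            },
        },
    },
}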
9 changes: 9 additions & 0 deletions scraper/scrape.py
@@ -1,3 +1,5 @@
+import time
+import threading
 import logging
 from datetime import datetime
 from scraper.domains import BaseWebsiteHandler, get_website_handler
@@ -70,3 +72,10 @@ def add_product_datapoint(product_data: dict, price: float) -> None:
latest_datapoint["price"] = price
else:
product_datapoints.append(new_datapoint)


def start_threads_sequentially(threads: list[threading.Thread], request_delay: int) -> None:
for thread in threads:
thread.start()
thread.join()
time.sleep(request_delay)
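
Note: a standalone way to see what start_threads_sequentially does. The function body below repeats the logic added in scrape.py; the worker function and the delay value are made up. Each thread only starts after the previous one has finished and the request delay has passed, which is what keeps requests to a single domain spaced out.

import threading
import time


def start_threads_sequentially(threads: list, request_delay: int) -> None:
    # same logic as the function added in scrape.py
    for thread in threads:
        thread.start()
        thread.join()
        time.sleep(request_delay)


def worker(name: str) -> None:
    print(f"{time.strftime('%H:%M:%S')} scraping {name}")


threads = [threading.Thread(target=worker, args=(f"product-{i}",)) for i in range(3)]
start_threads_sequentially(threads, request_delay=1)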
