diff --git a/README.md b/README.md
index 82bdb4ad..2628117a 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
- [Intro](#intro)
- [Contributing](#contributing)
- [Installation](#installation)
-- [Start from scratch](#start-scratch)
+- [Delete data](#delete-data)
- [Scrape products](#scrape-products)
- [Add products](#add-products)
- [Websites to scrape from](#websites-to-scrape-from)
@@ -67,22 +67,47 @@ pip3 install -r requirements.txt
-## Start from scratch
+## Delete data
+
If you want to start from scratch with no data in the records.json and products.csv files, then just run the following command:
```
-python3 main.py --hard-reset
+python3 main.py --delete --all
+```
+
+You can also delete only specific products or categories by passing one or more values to --id, --name or --category:
+```
+python3 main.py --delete --id <id> [<id> ...]
+```
```
+python3 main.py --delete --name <name> [<name> ...]
+```
+```
+python3 main.py --delete --category <category> [<category> ...]
+```
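+
+For example, to delete every product in a category named "gpu" (a placeholder category name here):
+```
+python3 main.py --delete --category gpu
+```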
+
Then just add products like described [here](#add-products).
-If you just want to reset your data for each product, just delete all datapoints inside each product, then run this command:
+If you just want to reset the data for every product, deleting all datapoints inside every product, run this command:
```
-python3 main.py --reset
+python3 main.py --reset --all
```
This deletes the data inside each product, such as id, url and all datapoints.
+
+You can also reset only specific products, or all products in specific categories:
+```
+python3 main.py --reset --id <id> [<id> ...]
+```
+```
+python3 main.py --reset --name <name> [<name> ...]
+```
+```
+python3 main.py --reset --category <category> [<category> ...]
+```
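+
+For example, to reset a single product by its name (a placeholder product name here):
+```
+python3 main.py --reset --name "logitech z533"
+```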
+
diff --git a/main.py b/main.py
index 7c0752e3..b9f595f4 100644
--- a/main.py
+++ b/main.py
@@ -15,10 +15,7 @@ def main():
scraper.visualize_data(args.all, args.category, args.id, args.name, args.up_to_date, args.compare)
if args.reset:
- scraper.reset()
-
- if args.hard_reset:
- scraper.hard_reset()
+ scraper.reset(args.category, args.name, args.id, args.all)
if args.add:
scraper.add_products(args.category, args.url)
@@ -38,6 +35,9 @@ def main():
if args.print_all_products:
scraper.print_all_products()
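+    # Unlike --reset, --delete removes the matching products and categories entirely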
+ if args.delete:
+ scraper.delete(args.category, args.name, args.id, args.all)
+
def scrape():
print("Scraping...")
diff --git a/scraper/__init__.py b/scraper/__init__.py
index f20873e2..034befcf 100644
--- a/scraper/__init__.py
+++ b/scraper/__init__.py
@@ -4,7 +4,8 @@
from .filemanager import Filemanager, Config
from .visualize import visualize_data
from .clean_data import clean_records_data
-from .reset_data import reset, hard_reset
+from .delete_data import delete
+from .reset_data import reset
from .search_data import search
from .misc import print_latest_datapoints, print_all_products
diff --git a/scraper/arguments.py b/scraper/arguments.py
index 79910e8f..b3cff5b1 100644
--- a/scraper/arguments.py
+++ b/scraper/arguments.py
@@ -28,6 +28,7 @@ def argparse_setup() -> ArgumentParser.parse_args:
type=str,
nargs="*",
action="extend",
+ default=[],
)
parser.add_argument("-u", "--url", help="the url to the product", type=str, nargs="*", action="extend")
@@ -99,12 +100,6 @@ def argparse_setup() -> ArgumentParser.parse_args:
action="store_true",
)
- parser.add_argument(
- "--hard-reset",
- help="delete all content in records.json and products.csv",
- action="store_true",
- )
-
parser.add_argument(
"--clean-data",
help="clean data so unnecessary product datapoints is removed from records",
@@ -126,6 +121,13 @@ def argparse_setup() -> ArgumentParser.parse_args:
action="store_true",
)
+ parser.add_argument(
+ "--delete",
+ help="delete all or specific products or categories",
+ dest="delete",
+ action="store_true",
+ )
+
args = validate_arguments(parser)
return args
@@ -138,6 +140,10 @@ def validate_arguments(parser: ArgumentParser) -> None:
if args.add and args.visualize:
parser.error("Cannot use --add and --visualize at the same time")
+ if args.delete:
+ if args.all and any([args.category, args.name, args.id]):
+ parser.error("When using --delete and --all, then using --category, --name or --id does nothing")
+
if args.add:
if not args.category or not args.url:
parser.error("When using --add, then --category and --url is required")
diff --git a/scraper/delete_data.py b/scraper/delete_data.py
new file mode 100644
index 00000000..37a568be
--- /dev/null
+++ b/scraper/delete_data.py
@@ -0,0 +1,103 @@
+from typing import List, Tuple
+import logging
+import pandas as pd
+
+from scraper.filemanager import Filemanager
+
+
+def delete(categories: List[str], names: List[str], ids: List[str], all: bool) -> None:
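+    """Delete the given categories, product names and ids from the records; with all=True, clear records.json and products.csv entirely."""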
+ print("Deleting...")
+
+    if all:
+        print("Deleting all products and categories...")
+        logging.getLogger(__name__).info("Deleting all products and categories")
+        delete_all()
+        return
+
+    logging.getLogger(__name__).info(f"Deleting categories: {categories}, product names: {names} and product ids: {ids}")
+
+    # Only read the records once we know this is a partial delete
+    record_data = Filemanager.get_record_data()
+    delete_from_record_data(record_data, categories, names, ids)
+    Filemanager.save_record_data(record_data)
+
+
+def delete_all() -> None:
+ # Save an empty dictionary
+ Filemanager.save_record_data({})
+ Filemanager.clear_product_csv()
+
+
+def delete_from_record_data(
+ record_data: dict,
+ category_names_to_delete: List[str] = None,
+ names_to_delete: List[str] = None,
+ ids_to_delete: List[str] = None,
+) -> None:
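+    """Remove the matching categories, products and ids from record_data and drop their rows from products.csv."""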
+ category_names_to_delete = [] if category_names_to_delete is None else category_names_to_delete
+ names_to_delete = [] if names_to_delete is None else names_to_delete
+ ids_to_delete = [] if ids_to_delete is None else ids_to_delete
+
+ categories_to_delete, products_to_delete_names, products_to_delete_ids = get_categories_products_ids_to_delete(
+ record_data, category_names_to_delete, names_to_delete, ids_to_delete
+ )
+
+ products_df = Filemanager.get_products_data()
+
+ # Delete product ids
+ for product_to_delete_id in products_to_delete_ids:
+ category_name, product_name, website_name = product_to_delete_id
+ deleted_website_dict = record_data[category_name][product_name].pop(website_name)
+
+        # Delete the row in products_df whose short_url matches the url just removed from record_data
+ url_to_delete = deleted_website_dict["info"]["url"]
+ products_df = delete_dataframe_rows(products_df, "short_url", url_to_delete)
+
+ # Delete product names
+ for product_to_delete_name in products_to_delete_names:
+ category_name, product_name = product_to_delete_name
+ deleted_product_dict = record_data[category_name].pop(product_name)
+
+        # Delete the rows in products_df whose short_url matches a url of the product just removed from record_data
+ for deleted_website_dict in deleted_product_dict.values():
+ url_to_delete = deleted_website_dict["info"]["url"]
+ products_df = delete_dataframe_rows(products_df, "short_url", url_to_delete)
+
+ # Delete categories
+ for category_to_delete in categories_to_delete:
+ record_data.pop(category_to_delete)
+
+        # Delete the rows in products_df that belong to the deleted category
+ products_df = delete_dataframe_rows(products_df, "category", category_to_delete)
+
+ Filemanager.save_products_data(products_df)
+
+
+def get_categories_products_ids_to_delete(
+    record_data: dict, category_names_to_delete: List[str], names_to_delete: List[str], ids_to_delete: List[str]
+) -> Tuple[List[str], List[Tuple[str, str]], List[Tuple[str, str, str]]]:
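+    """Find the record_data "paths" for the categories, product names and ids that should be deleted."""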
+ categories_to_delete: List[str] = []
+ products_to_delete_names: List[Tuple[str, str]] = []
+ products_to_delete_ids: List[Tuple[str, str, str]] = []
+
+ # Find the "paths" for categories, product names and ids to delete
+ for category_name, category_dict in record_data.items():
+ if category_name in category_names_to_delete:
+ categories_to_delete.append(category_name)
+
+ for product_name, product_dict in category_dict.items():
+ if product_name in names_to_delete:
+ products_to_delete_names.append((category_name, product_name))
+
+ for website_name, website_dict in product_dict.items():
+ if website_dict["info"]["id"] in ids_to_delete:
+ products_to_delete_ids.append((category_name, product_name, website_name))
+
+ return categories_to_delete, products_to_delete_names, products_to_delete_ids
+
+
+def delete_dataframe_rows(products_df: pd.DataFrame, df_search_column: str, delete_value: str) -> pd.DataFrame:
+    # Return a new dataframe with only the rows whose value in df_search_column differs from delete_value
+    return products_df[products_df[df_search_column] != delete_value]
diff --git a/scraper/reset_data.py b/scraper/reset_data.py
index f5327e99..790d1590 100644
--- a/scraper/reset_data.py
+++ b/scraper/reset_data.py
@@ -1,26 +1,55 @@
+from typing import List
import logging
+
from scraper import Filemanager
-def reset():
+def reset(categories: List[str], names: List[str], ids: List[str], all: bool) -> None:
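+    """Reset the info and datapoints of the matching products; with all=True, reset every product."""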
print("Resetting data...")
- logging.getLogger(__name__).info("Resetting data")
- data = Filemanager.get_record_data()
+ record_data = Filemanager.get_record_data()
+
+ if all:
+ logging.getLogger(__name__).info("Resetting all products")
+ reset_all(record_data)
+ return
+
+ logging.getLogger(__name__).info(f"Resetting categories: {categories}, product names: {names} and product ids: {ids}")
+
+ for category_name, category_dict in record_data.items():
+ if category_name in categories:
+ reset_category(category_dict)
+ continue
+
+ for product_name, product_dict in category_dict.items():
+ if product_name in names:
+ reset_product(product_dict)
+ continue
+
+ for website_dict in product_dict.values():
+ if str(website_dict["info"]["id"]) in ids:
+ reset_product_website(website_dict)
+
+ Filemanager.save_record_data(record_data)
+
+
+def reset_all(record_data: dict) -> None:
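+    # Reset every category's products, then persist the records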
+ for category_dict in record_data.values():
+ reset_category(category_dict)
+
+ Filemanager.save_record_data(record_data)
+
- for category in data.values():
- for product in category.values():
- for website in product.values():
- website["info"] = {"id": "", "url": "", "currency": ""}
- website["datapoints"] = []
+def reset_category(category_dict: dict) -> None:
+ for product_dict in category_dict.values():
+        reset_product(product_dict)
+
- Filemanager.save_record_data(data)
+def reset_product(product_dict: dict) -> None:
+ for website_dict in product_dict.values():
+        reset_product_website(website_dict)
+
-def hard_reset():
- print("Hard resetting data...")
- logging.getLogger(__name__).info("Hard resetting data")
- data = {}
- Filemanager.save_record_data(data)
- Filemanager.clear_product_csv()
+def reset_product_website(website_dict: dict) -> None:
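+    # Clear the stored info and remove all datapoints for this website entry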
+ website_dict["info"] = {"id": "", "url": "", "currency": ""}
+ website_dict["datapoints"] = []