diff --git a/README.md b/README.md
index 82bdb4ad..2628117a 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 - [Intro](#intro)
 - [Contributing](#contributing)
 - [Installation](#installation)
-- [Start from scratch](#start-scratch)
+- [Delete data](#delete-data)
 - [Scrape products](#scrape-products)
 - [Add products](#add-products)
 - [Websites to scrape from](#websites-to-scrape-from)
@@ -67,22 +67,47 @@ pip3 install -r requirements.txt
-## Start from scratch
+## Delete data
+
 If you want to start from scratch with no data in the records.json and products.csv files, then just run the following command:
 ```
-python3 main.py --hard-reset
+python3 main.py --delete --all
+```
+
+You can also delete only specific products or categories:
+```
+python3 main.py --delete --id <id> [<id> ...]
+```
 ```
+python3 main.py --delete --name <name> [<name> ...]
+```
+```
+python3 main.py --delete --category <category> [<category> ...]
+```
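+
+For example, assuming your records.json has a category named `gpu` and a product with id `123` (both values are illustrative), the flags can be combined to delete both in one run:
+```
+python3 main.py --delete --category gpu --id 123
+```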
+
 Then just add products like described [here](#add-products).
 
-If you just want to reset your data for each product, just delete all datapoints inside each product, then run this command:
+If you just want to reset the data for every product, deleting all datapoints inside each product, then run this command:
 ```
-python3 main.py --reset
+python3 main.py --reset --all
 ```
 This deletes the data inside each product, such as id, url and all datapoints.
+
+You can also reset only specific products, or all products in specific categories:
+```
+python3 main.py --reset --id <id> [<id> ...]
+```
+```
+python3 main.py --reset --name <name> [<name> ...]
+```
+```
+python3 main.py --reset --category <category> [<category> ...]
+```
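+
+For example, to reset every product in two categories at once (the category names are placeholders for ones in your records.json):
+```
+python3 main.py --reset --category gpu cpu
+```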
diff --git a/main.py b/main.py
index 7c0752e3..b9f595f4 100644
--- a/main.py
+++ b/main.py
@@ -15,10 +15,7 @@ def main():
         scraper.visualize_data(args.all, args.category, args.id, args.name, args.up_to_date, args.compare)
 
     if args.reset:
-        scraper.reset()
-
-    if args.hard_reset:
-        scraper.hard_reset()
+        scraper.reset(args.category, args.name, args.id, args.all)
 
     if args.add:
         scraper.add_products(args.category, args.url)
@@ -38,6 +35,9 @@ def main():
     if args.print_all_products:
         scraper.print_all_products()
 
+    if args.delete:
+        scraper.delete(args.category, args.name, args.id, args.all)
+
 
 def scrape():
     print("Scraping...")
diff --git a/scraper/__init__.py b/scraper/__init__.py
index f20873e2..034befcf 100644
--- a/scraper/__init__.py
+++ b/scraper/__init__.py
@@ -4,7 +4,8 @@ from .filemanager import Filemanager, Config
 from .visualize import visualize_data
 from .clean_data import clean_records_data
-from .reset_data import reset, hard_reset
+from .delete_data import delete
+from .reset_data import reset
 from .search_data import search
 from .misc import print_latest_datapoints, print_all_products
diff --git a/scraper/arguments.py b/scraper/arguments.py
index 79910e8f..b3cff5b1 100644
--- a/scraper/arguments.py
+++ b/scraper/arguments.py
@@ -28,6 +28,7 @@ def argparse_setup() -> ArgumentParser.parse_args:
         type=str,
         nargs="*",
         action="extend",
+        default=[],
     )
 
     parser.add_argument("-u", "--url", help="the url to the product", type=str, nargs="*", action="extend")
@@ -99,12 +100,6 @@ def argparse_setup() -> ArgumentParser.parse_args:
         action="store_true",
     )
 
-    parser.add_argument(
-        "--hard-reset",
-        help="delete all content in records.json and products.csv",
-        action="store_true",
-    )
-
     parser.add_argument(
         "--clean-data",
         help="clean data so unnecessary product datapoints is removed from records",
         action="store_true",
     )
@@ -126,6 +121,13 @@ def argparse_setup() -> ArgumentParser.parse_args:
         action="store_true",
     )
 
+    parser.add_argument(
+        "--delete",
+        help="delete all or specific products or categories",
+        dest="delete",
+        action="store_true",
+    )
+
     args = validate_arguments(parser)
     return args
@@ -138,6 +140,10 @@ def validate_arguments(parser: ArgumentParser) -> None:
     if args.add and args.visualize:
         parser.error("Cannot use --add and --visualize at the same time")
 
+    if args.delete:
+        if args.all and any([args.category, args.name, args.id]):
+            parser.error("When using --delete and --all, using --category, --name or --id is not allowed")
+
     if args.add:
         if not args.category or not args.url:
             parser.error("When using --add, then --category and --url is required")
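The `default=[]` added above matters for the new code paths: without a default, argparse leaves an omitted `nargs="*"` flag as `None`, and the membership tests in the new `reset`/`delete` helpers (`category_name in categories`, `product_name in names`, and so on) would raise `TypeError`. The same concern applies to `--name` and `--id` if they keep a bare `nargs="*"` form. A minimal standalone sketch of the behavior (a throwaway parser, not the project's):

```python
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--category", type=str, nargs="*", action="extend", default=[])

# Flag omitted: the attribute is [] instead of None, so `x in args.category` is safe
print(parser.parse_args([]).category)  # []

# action="extend" accumulates values across repeated uses of the flag
print(parser.parse_args(["--category", "gpu", "--category", "cpu"]).category)  # ['gpu', 'cpu']
```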
diff --git a/scraper/delete_data.py b/scraper/delete_data.py
new file mode 100644
index 00000000..37a568be
--- /dev/null
+++ b/scraper/delete_data.py
@@ -0,0 +1,103 @@
+from typing import List, Optional, Tuple
+import logging
+import pandas as pd
+
+from scraper.filemanager import Filemanager
+
+
+def delete(categories: List[str], names: List[str], ids: List[str], all: bool) -> None:
+    print("Deleting...")
+
+    if all:
+        print("Deleting all products and categories...")
+        logging.getLogger(__name__).info("Deleting all products and categories")
+        delete_all()
+        return
+
+    record_data = Filemanager.get_record_data()
+    logging.getLogger(__name__).info(f"Deleting categories: {categories}, product names: {names} and product ids: {ids}")
+    delete_from_record_data(record_data, categories, names, ids)
+
+    Filemanager.save_record_data(record_data)
+
+
+def delete_all() -> None:
+    # Overwrite records.json with an empty dictionary and clear products.csv
+    Filemanager.save_record_data({})
+    Filemanager.clear_product_csv()
+
+
+def delete_from_record_data(
+    record_data: dict,
+    category_names_to_delete: Optional[List[str]] = None,
+    names_to_delete: Optional[List[str]] = None,
+    ids_to_delete: Optional[List[str]] = None,
+) -> None:
+    category_names_to_delete = [] if category_names_to_delete is None else category_names_to_delete
+    names_to_delete = [] if names_to_delete is None else names_to_delete
+    ids_to_delete = [] if ids_to_delete is None else ids_to_delete
+
+    categories_to_delete, products_to_delete_names, products_to_delete_ids = get_categories_products_ids_to_delete(
+        record_data, category_names_to_delete, names_to_delete, ids_to_delete
+    )
+
+    products_df = Filemanager.get_products_data()
+
+    # Delete products by id
+    for product_to_delete_id in products_to_delete_ids:
+        category_name, product_name, website_name = product_to_delete_id
+        deleted_website_dict = record_data[category_name][product_name].pop(website_name)
+
+        # Delete the row in products_df whose short_url matches the url just removed from record_data
+        url_to_delete = deleted_website_dict["info"]["url"]
+        products_df = delete_dataframe_rows(products_df, "short_url", url_to_delete)
+
+    # Delete products by name
+    for product_to_delete_name in products_to_delete_names:
+        category_name, product_name = product_to_delete_name
+        deleted_product_dict = record_data[category_name].pop(product_name)
+
+        # Delete the rows in products_df whose short_url matches the urls just removed from record_data
+        for deleted_website_dict in deleted_product_dict.values():
+            url_to_delete = deleted_website_dict["info"]["url"]
+            products_df = delete_dataframe_rows(products_df, "short_url", url_to_delete)
+
+    # Delete categories
+    for category_to_delete in categories_to_delete:
+        record_data.pop(category_to_delete)
+
+        # Delete the rows in products_df whose category matches the deleted category
+        products_df = delete_dataframe_rows(products_df, "category", category_to_delete)
+
+    Filemanager.save_products_data(products_df)
+
+
+def get_categories_products_ids_to_delete(
+    record_data: dict,
+    category_names_to_delete: List[str],
+    names_to_delete: List[str],
+    ids_to_delete: List[str],
+) -> Tuple[List[str], List[Tuple[str, str]], List[Tuple[str, str, str]]]:
+    categories_to_delete: List[str] = []
+    products_to_delete_names: List[Tuple[str, str]] = []
+    products_to_delete_ids: List[Tuple[str, str, str]] = []
+
+    # Find the "paths" to the categories, product names and ids that should be deleted
+    for category_name, category_dict in record_data.items():
+        if category_name in category_names_to_delete:
+            categories_to_delete.append(category_name)
+
+        for product_name, product_dict in category_dict.items():
+            if product_name in names_to_delete:
+                products_to_delete_names.append((category_name, product_name))
+
+            for website_name, website_dict in product_dict.items():
+                # Compare as strings for consistency with reset_data.py, since argparse ids are strings
+                if str(website_dict["info"]["id"]) in ids_to_delete:
+                    products_to_delete_ids.append((category_name, product_name, website_name))
+
+    return categories_to_delete, products_to_delete_names, products_to_delete_ids
+
+
+def delete_dataframe_rows(products_df: pd.DataFrame, df_search_column: str, delete_value: str) -> pd.DataFrame:
+    # Get the indexes of the rows to keep (those that do not match delete_value)
+    indexes_to_keep = products_df.index[products_df[df_search_column] != delete_value].tolist()
+    # Return a new dataframe containing only the rows to keep
+    new_products_df = products_df.loc[indexes_to_keep]
+    return new_products_df
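`delete_dataframe_rows` filters by collecting the indexes of the rows to keep rather than dropping matches in place. A throwaway illustration (the column names mirror the products.csv columns referenced above; the data is made up, and the `scraper` package is assumed to be importable):

```python
import pandas as pd

from scraper.delete_data import delete_dataframe_rows

products_df = pd.DataFrame(
    {
        "category": ["gpu", "gpu", "cpu"],
        "short_url": ["example.com/1", "example.com/2", "example.com/3"],
    }
)

# Every row whose category is "gpu" is dropped; the "cpu" row survives
remaining = delete_dataframe_rows(products_df, "category", "gpu")
print(remaining)
```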
diff --git a/scraper/reset_data.py b/scraper/reset_data.py
index f5327e99..790d1590 100644
--- a/scraper/reset_data.py
+++ b/scraper/reset_data.py
@@ -1,26 +1,55 @@
+from typing import List
 import logging
+
 from scraper import Filemanager
 
 
-def reset():
+def reset(categories: List[str], names: List[str], ids: List[str], all: bool) -> None:
     print("Resetting data...")
-    logging.getLogger(__name__).info("Resetting data")
 
-    data = Filemanager.get_record_data()
+    record_data = Filemanager.get_record_data()
+
+    if all:
+        logging.getLogger(__name__).info("Resetting all products")
+        reset_all(record_data)
+        return
+
+    logging.getLogger(__name__).info(f"Resetting categories: {categories}, product names: {names} and product ids: {ids}")
+
+    for category_name, category_dict in record_data.items():
+        if category_name in categories:
+            reset_category(category_dict)
+            continue
+
+        for product_name, product_dict in category_dict.items():
+            if product_name in names:
+                reset_product(product_dict)
+                continue
+
+            for website_dict in product_dict.values():
+                if str(website_dict["info"]["id"]) in ids:
+                    reset_product_website(website_dict)
+
+    Filemanager.save_record_data(record_data)
+
+
+def reset_all(record_data: dict) -> None:
+    for category_dict in record_data.values():
+        reset_category(category_dict)
+
+    Filemanager.save_record_data(record_data)
 
-    for category in data.values():
-        for product in category.values():
-            for website in product.values():
-                website["info"] = {"id": "", "url": "", "currency": ""}
-                website["datapoints"] = []
 
+def reset_category(category_dict: dict) -> None:
+    for product_dict in category_dict.values():
+        reset_product(product_dict)
 
-    Filemanager.save_record_data(data)
 
+def reset_product(product_dict: dict) -> None:
+    for website_dict in product_dict.values():
+        reset_product_website(website_dict)
 
-def hard_reset():
-    print("Hard resetting data...")
-    logging.getLogger(__name__).info("Hard resetting data")
-    data = {}
-    Filemanager.save_record_data(data)
-    Filemanager.clear_product_csv()
+def reset_product_website(website_dict: dict) -> None:
+    website_dict["info"] = {"id": "", "url": "", "currency": ""}
+    website_dict["datapoints"] = []
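For reference, a sketch of the nested records.json shape that both `delete` and `reset` walk (category → product name → website), pieced together from the lookups in the code above; all names and values are illustrative, and the datapoint fields are an assumption:

```python
from scraper.reset_data import reset_product_website

record_data = {
    "gpu": {  # category
        "example product": {  # product name
            "example_website": {  # website name
                "info": {"id": "123", "url": "https://example.com/product/123", "currency": "USD"},
                "datapoints": [{"date": "2021-01-01", "price": 699.0}],  # datapoint shape is assumed
            }
        }
    }
}

# Resetting one product-on-a-website blanks its info and empties its datapoints
website_dict = record_data["gpu"]["example product"]["example_website"]
reset_product_website(website_dict)
print(website_dict)
# {'info': {'id': '', 'url': '', 'currency': ''}, 'datapoints': []}
```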