Merge pull request #183 from Crinibus/add-argument-delete
Add argument --delete and update --reset
Crinibus authored Oct 16, 2022
2 parents af22bdd + 5424161 commit aa714ba
Showing 6 changed files with 195 additions and 31 deletions.
35 changes: 30 additions & 5 deletions README.md
@@ -2,7 +2,7 @@
- [Intro](#intro)
- [Contributing](#contributing)
- [Installation](#installation)
- [Start from scratch](#start-scratch)
- [Delete data](#delete-data)
- [Scrape products](#scrape-products)
- [Add products](#add-products)
- [Websites to scrape from](#websites-to-scrape-from)
@@ -67,22 +67,47 @@ pip3 install -r requirements.txt
<br/>


## Start from scratch <a name="start-scratch"></a>
## Delete data <a name="delete-data"></a>

If you want to start from scratch with no data in the records.json and products.csv files, then just run the following command:
```
python3 main.py --hard-reset
python3 main.py --delete --all
```

You can also delete only specific products or categories (these flags can be combined, as shown after the examples below):
```
python3 main.py --delete --id <id>
```
```
python3 main.py --delete --name <name>
```
```
python3 main.py --delete --category <category>
```
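
These selector flags can also be combined in a single command, and they appear to accept multiple values (the `nargs="*"` argument definitions in `scraper/arguments.py` suggest so). A hedged example with hypothetical placeholder values:
```
python3 main.py --delete --category <category1> <category2> --name <name>
```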


Then just add products as described [here](#add-products).
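
For quick reference, adding a product takes a category and one or more urls (the required flags follow from the `--add` validation in `scraper/arguments.py`; the placeholder values are hypothetical):
```
python3 main.py --add --category <category> --url <url>
```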

<br/>

If you just want to reset your data for each product, just delete all datapoints inside each product, then run this command:
If you just want to reset the data for every product, clearing all datapoints while keeping the products themselves, run this command:
```
python3 main.py --reset
python3 main.py --reset --all
```
This clears the data inside each product, such as its id, url and all datapoints; a sketch of the resulting structure is shown below.
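
As a rough sketch, a single website entry in records.json ends up looking like this after a reset (the empty `info` fields and `datapoints` list mirror `scraper/reset_data.py`; the category, product and website names are hypothetical):
```
{
    "example_category": {
        "example_product": {
            "example_website": {
                "info": {"id": "", "url": "", "currency": ""},
                "datapoints": []
            }
        }
    }
}
```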


You can also reset only specific products, or all products in specific categories:
```
python3 main.py --reset --id <id>
```
```
python3 main.py --reset --name <name>
```
```
python3 main.py --reset --category <category>
```

<br/>


8 changes: 4 additions & 4 deletions main.py
@@ -15,10 +15,7 @@ def main():
scraper.visualize_data(args.all, args.category, args.id, args.name, args.up_to_date, args.compare)

if args.reset:
scraper.reset()

if args.hard_reset:
scraper.hard_reset()
scraper.reset(args.category, args.name, args.id, args.all)

if args.add:
scraper.add_products(args.category, args.url)
@@ -38,6 +35,9 @@
if args.print_all_products:
scraper.print_all_products()

if args.delete:
scraper.delete(args.category, args.name, args.id, args.all)


def scrape():
print("Scraping...")
3 changes: 2 additions & 1 deletion scraper/__init__.py
@@ -4,7 +4,8 @@
from .filemanager import Filemanager, Config
from .visualize import visualize_data
from .clean_data import clean_records_data
from .reset_data import reset, hard_reset
from .delete_data import delete
from .reset_data import reset
from .search_data import search
from .misc import print_latest_datapoints, print_all_products

18 changes: 12 additions & 6 deletions scraper/arguments.py
@@ -28,6 +28,7 @@ def argparse_setup() -> ArgumentParser.parse_args:
type=str,
nargs="*",
action="extend",
default=[],
)

parser.add_argument("-u", "--url", help="the url to the product", type=str, nargs="*", action="extend")
@@ -99,12 +100,6 @@ def argparse_setup() -> ArgumentParser.parse_args:
action="store_true",
)

parser.add_argument(
"--hard-reset",
help="delete all content in records.json and products.csv",
action="store_true",
)

parser.add_argument(
"--clean-data",
help="clean data so unnecessary product datapoints is removed from records",
@@ -126,6 +121,13 @@
action="store_true",
)

parser.add_argument(
"--delete",
help="delete all or specific products or categories",
dest="delete",
action="store_true",
)

args = validate_arguments(parser)

return args
@@ -138,6 +140,10 @@ def validate_arguments(parser: ArgumentParser) -> None:
if args.add and args.visualize:
parser.error("Cannot use --add and --visualize at the same time")

if args.delete:
if args.all and any([args.category, args.name, args.id]):
parser.error("When using --delete and --all, then using --category, --name or --id does nothing")

if args.add:
if not args.category or not args.url:
parser.error("When using --add, then --category and --url is required")
103 changes: 103 additions & 0 deletions scraper/delete_data.py
@@ -0,0 +1,103 @@
from typing import List, Tuple
import logging
import pandas as pd

from scraper.filemanager import Filemanager


def delete(categories: List[str], names: List[str], ids: List[str], all: bool) -> None:
print("Deleting...")
record_data = Filemanager.get_record_data()

if all:
print("Deleting all products and categories...")
logging.getLogger(__name__).info("Deleting all products and categories")
delete_all()
return

logging.getLogger(__name__).info(f"Deleting categories: {categories}, product names: {names} and product ids: {ids}")
delete_from_record_data(record_data, categories, names, ids)

Filemanager.save_record_data(record_data)


def delete_all() -> None:
# Save an empty dictionary
Filemanager.save_record_data({})
Filemanager.clear_product_csv()


def delete_from_record_data(
record_data: dict,
category_names_to_delete: List[str] = None,
names_to_delete: List[str] = None,
ids_to_delete: List[str] = None,
) -> None:
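# Remove the matching categories, product names and ids from record_data in place,
# and drop the corresponding rows from the products DataFrame before saving it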
category_names_to_delete = [] if category_names_to_delete is None else category_names_to_delete
names_to_delete = [] if names_to_delete is None else names_to_delete
ids_to_delete = [] if ids_to_delete is None else ids_to_delete

categories_to_delete, products_to_delete_names, products_to_delete_ids = get_categories_products_ids_to_delete(
record_data, category_names_to_delete, names_to_delete, ids_to_delete
)

products_df = Filemanager.get_products_data()

# Delete product ids
for product_to_delete_id in products_to_delete_ids:
category_name, product_name, website_name = product_to_delete_id
deleted_website_dict = record_data[category_name][product_name].pop(website_name)

# Delete the row with short_url in products_df that match with the product's url that are deleted from record_data
url_to_delete = deleted_website_dict["info"]["url"]
products_df = delete_dataframe_rows(products_df, "short_url", url_to_delete)

# Delete product names
for product_to_delete_name in products_to_delete_names:
category_name, product_name = product_to_delete_name
deleted_product_dict = record_data[category_name].pop(product_name)

# Delete rows with short_url in products_df that match with the products urls that are deleted from record_data
for deleted_website_dict in deleted_product_dict.values():
url_to_delete = deleted_website_dict["info"]["url"]
products_df = delete_dataframe_rows(products_df, "short_url", url_to_delete)

# Delete categories
for category_to_delete in categories_to_delete:
record_data.pop(category_to_delete)

# Delete rows with category in products_df that match with the category to delete
products_df = delete_dataframe_rows(products_df, "category", category_to_delete)

Filemanager.save_products_data(products_df)


def get_categories_products_ids_to_delete(
record_data, category_names_to_delete, names_to_delete, ids_to_delete
) -> Tuple[List[str], List[Tuple[str, str]], List[Tuple[str, str, str]]]:
categories_to_delete: List[str] = []
products_to_delete_names: List[Tuple[str, str]] = []
products_to_delete_ids: List[Tuple[str, str, str]] = []

# Find the "paths" for categories, product names and ids to delete
for category_name, category_dict in record_data.items():
if category_name in category_names_to_delete:
categories_to_delete.append(category_name)

for product_name, product_dict in category_dict.items():
if product_name in names_to_delete:
products_to_delete_names.append((category_name, product_name))

for website_name, website_dict in product_dict.items():
if website_dict["info"]["id"] in ids_to_delete:
products_to_delete_ids.append((category_name, product_name, website_name))

return categories_to_delete, products_to_delete_names, products_to_delete_ids


def delete_dataframe_rows(products_df: pd.DataFrame, df_search_column: str, delete_value: str) -> pd.DataFrame:
# get the indexes to keep (not delete)
indexes_to_keep = products_df.index[products_df[df_search_column] != delete_value].tolist()
# get new dataframe with only the indexes to keep
new_products_df = products_df.loc[indexes_to_keep]
return new_products_df
59 changes: 44 additions & 15 deletions scraper/reset_data.py
@@ -1,26 +1,55 @@
from typing import List
import logging

from scraper import Filemanager


def reset():
def reset(categories: List[str], names: List[str], ids: List[str], all: bool) -> None:
print("Resetting data...")
logging.getLogger(__name__).info("Resetting data")

data = Filemanager.get_record_data()
record_data = Filemanager.get_record_data()

if all:
logging.getLogger(__name__).info("Resetting all products")
reset_all(record_data)
return

logging.getLogger(__name__).info(f"Resetting categories: {categories}, product names: {names} and product ids: {ids}")

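# Walk the nested record data: a category match resets every product in that category,
# a name match resets the whole product, and an id match resets a single website entry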
for category_name, category_dict in record_data.items():
if category_name in categories:
reset_category(category_dict)
continue

for product_name, product_dict in category_dict.items():
if product_name in names:
reset_product(product_dict)
continue

for website_dict in product_dict.values():
if str(website_dict["info"]["id"]) in ids:
reset_product_website(website_dict)

Filemanager.save_record_data(record_data)


def reset_all(record_data: dict) -> None:
for category_dict in record_data.values():
reset_category(category_dict)

Filemanager.save_record_data(record_data)


for category in data.values():
for product in category.values():
for website in product.values():
website["info"] = {"id": "", "url": "", "currency": ""}
website["datapoints"] = []
def reset_category(category_dict: dict) -> None:
for product_dict in category_dict.values():
reset_product(product_dict)

Filemanager.save_record_data(data)

def reset_product(product_dict: dict) -> None:
for website_dict in product_dict.values():
reset_product_website(website_dict)

def hard_reset():
print("Hard resetting data...")
logging.getLogger(__name__).info("Hard resetting data")

data = {}
Filemanager.save_record_data(data)
Filemanager.clear_product_csv()
def reset_product_website(website_dict: dict) -> None:
website_dict["info"] = {"id": "", "url": "", "currency": ""}
website_dict["datapoints"] = []
