-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' of https://github.com/SonyShrestha/VBP_Joint_Project
- Loading branch information
Showing
11 changed files
with
10,103 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import kaggle
import logging
import configparser
import os

# Configure logging
logging.basicConfig(level=logging.INFO)  # Set log level to INFO

# Create logger object
logger = logging.getLogger()

# Get the path to the parent parent directory (repo root relative to CWD)
config_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir))

# Specify the path to config file
config_file_path = os.path.join(config_dir, "config.ini")

config = configparser.ConfigParser()
config.read(config_file_path)


def download_kaggle_dataset(dataset_path, raw_data_dir):
    """Download and unzip the Supermart Grocery Sales dataset from Kaggle.

    The downloaded CSV is renamed to ``sm_retail_customers.csv`` inside
    ``raw_data_dir``.

    Args:
        dataset_path: Kaggle dataset identifier (``owner/dataset-name``),
            read from the ``SM_RETAIL_CUSTOMERS`` config section.
        raw_data_dir: Directory the archive is downloaded into and unzipped.

    Raises:
        FileNotFoundError: If the expected CSV is not present after download.
    """
    logger.info('-----------------------------------------------------')
    logger.info("Downloading customer data from kaggle dataset for Supermart Grocery Sales - Retail Analytics")
    kaggle.api.dataset_download_files(dataset=dataset_path, path=raw_data_dir, unzip=True)

    src = os.path.join(raw_data_dir, 'Supermart Grocery Sales - Retail Analytics Dataset.csv')
    dst = os.path.join(raw_data_dir, 'sm_retail_customers.csv')
    # Fail fast with a clear message rather than an opaque OSError from the rename.
    if not os.path.exists(src):
        raise FileNotFoundError(f"Expected downloaded file not found: {src}")
    # os.replace (unlike os.rename) overwrites an existing target on all platforms,
    # so re-running the script does not crash on Windows.
    os.replace(src, dst)


if __name__ == "__main__":
    dataset_path = config["SM_RETAIL_CUSTOMERS"]["dataset_path"]
    raw_data_dir = config["COMMON"]["raw_data_dir"]
    download_kaggle_dataset(dataset_path, raw_data_dir)
Binary file not shown.
68 changes: 68 additions & 0 deletions
68
landing_zone/synthetic/supermarket_products/supermarket_products.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import csv
import json
import random
import configparser
import os
import logging

# Configure logging: without basicConfig the root logger stays at WARNING with
# no handler, so the logger.info / logging.info calls below would be silently
# dropped. (Matches the sibling ingestion scripts in this project.)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Load configuration (config.ini lives three directories above the CWD)
config_path = os.path.join(os.getcwd(), '../../..', 'config.ini')
config = configparser.ConfigParser()
config.read(config_path)
logging.info(f'Configuration loaded from {config_path}')

# Base directory for files
raw_data_dir = config.get('COMMON', 'raw_data_dir')

# Generation parameters
number_of_products = config.getint('SUPERMARKET_PRODUCT_PARAMS', 'number_of_products')
quantity_min = config.getint('SUPERMARKET_PRODUCT_PARAMS', 'quantity_min')
quantity_max = config.getint('SUPERMARKET_PRODUCT_PARAMS', 'quantity_max')

# Input/output file paths under the raw data directory
products_json = os.path.join(raw_data_dir, 'flipkarts_products.json')
stores_csv = os.path.join(raw_data_dir, 'establishments_catalonia.csv')
output_csv = os.path.join(raw_data_dir, 'assigned_products.csv')

# Load the product catalogue (a JSON list of product dicts)
with open(products_json, 'r') as file:
    products = json.load(file)

# Load establishments and keep only the supermarket rows
supermarkets = []
with open(stores_csv, 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        if "supermercat" in row['Activity_description'].lower():
            supermarkets.append(row)

# Guard: random.sample raises a bare ValueError if asked for more products
# than exist; surface a message that points at the config instead.
if number_of_products > len(products):
    raise ValueError(
        f"number_of_products ({number_of_products}) exceeds the available "
        f"product catalogue size ({len(products)})"
    )

# Assign a random selection of products (with a random stock quantity) to
# every supermarket.
assigned_products = []
for supermarket in supermarkets:
    selected_products = random.sample(products, number_of_products)
    for product in selected_products:
        assigned_products.append({
            "store_id": supermarket['Id'],
            "store_name": supermarket['Commercial_name'],
            "product_id": product['product_id'],
            "product_name": product['name'],
            "manufacture_date": product['manufacturing_date'],
            "expiry_date": product['expiry_date'],
            # Random quantity within the configured [quantity_min, quantity_max] range
            "quantity": random.randint(quantity_min, quantity_max)
        })

# Output to CSV
try:
    with open(output_csv, 'w', newline='') as file:
        fieldnames = ['store_id', 'store_name', 'product_id', 'product_name', 'manufacture_date', 'expiry_date', 'quantity']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        # writerows: one C-level call instead of a per-row Python loop
        writer.writerows(assigned_products)
    logger.info(f"Data has been processed and output to {output_csv}.")
except Exception:
    logger.error("Failed to write data to CSV", exc_info=True)