From 12fb4e4ded3cef248b3d66a21911b65f28e094cb Mon Sep 17 00:00:00 2001 From: raj-4041 Date: Tue, 23 Sep 2025 15:28:33 +1000 Subject: [PATCH] time series deal prediction Signed-off-by: raj-4041 --- .../Raj's_Time_series_deal_pred.py | 4600 +++++++++++++++++ 1 file changed, 4600 insertions(+) create mode 100644 ML/Price-Prediction/Raj's_Time_series_deal_pred.py diff --git a/ML/Price-Prediction/Raj's_Time_series_deal_pred.py b/ML/Price-Prediction/Raj's_Time_series_deal_pred.py new file mode 100644 index 00000000..e67fb4d2 --- /dev/null +++ b/ML/Price-Prediction/Raj's_Time_series_deal_pred.py @@ -0,0 +1,4600 @@ +# -*- coding: utf-8 -*- +"""Untitled.ipynb + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1eKrTU7EnUkhJFvzTfmnu2bsxvN5X2wtC +""" + +# %% +import pandas as pd +import numpy as np +import re +from datetime import datetime, timedelta +import matplotlib.pyplot as plt +import seaborn as sns +import plotly.express as px +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from statsmodels.tsa.seasonal import seasonal_decompose +import warnings +warnings.filterwarnings('ignore') + +# %% +# CONFIGURATION AND DATA STRUCTURES +# ================================= + +# Function to parse weights to grams +def parse_to_grams(s): + """Convert weight strings to grams for standardization""" + if pd.isna(s): + return None + + s = str(s).strip().lower() + match = re.search(r'(\d+\.?\d*)\s*([a-zA-Z]+)', s) + + if not match: + if 'pack' in s or 'each' in s: + return 1 + return None + + num_str = match.group(1) + unit = match.group(2).lower() + + try: + num = float(num_str) + except ValueError: + if 'pack' in s or 'each' in s: + return 1 + return None + + # Unit conversions + if unit in ['g', 'gram', 'grams']: + return num + elif unit in ['kg', 'kilogram', 'kilograms']: + return num * 1000 + elif unit in ['ml', 'milliliter', 'milliliters']: + return num # assume 1g/ml + elif unit in ['l', 'liter', 'liters']: + return num * 1000 + elif unit in ['pack', 'each']: + return num + else: + return None + +# %% +# HOLIDAY AND EVENT DEFINITIONS +# ============================ + +# Major holidays affecting grocery pricing +holidays = { + 'New Year': [ + datetime(2023, 1, 1), datetime(2024, 1, 1), datetime(2025, 1, 1) + ], + 'Australia Day': [ + datetime(2023, 1, 26), datetime(2024, 1, 26), datetime(2025, 1, 26) + ], + 'Easter': [ + datetime(2023, 4, 7), datetime(2023, 4, 10), + datetime(2024, 3, 29), datetime(2024, 4, 1), + datetime(2025, 4, 18), datetime(2025, 4, 21) + ], + 'Anzac Day': [ + datetime(2023, 4, 25), datetime(2024, 4, 25), datetime(2025, 4, 25) + ], + 'Christmas': [ + datetime(2023, 12, 25), datetime(2023, 12, 26), + datetime(2024, 12, 25), datetime(2024, 12, 26), + datetime(2025, 12, 25), datetime(2025, 12, 26) + ], + 'Chinese New Year': [ + datetime(2023, 2, 22), datetime(2024, 2, 10), datetime(2025, 1, 29) + ], + 'Mother Day': [ + datetime(2023, 5, 14), datetime(2024, 5, 12), datetime(2025, 5, 11) + ], + 'Father Day': [ + datetime(2023, 9, 3), datetime(2024, 9, 1), datetime(2025, 9, 7) + ], + 'Labour Day': [ + datetime(2023, 10, 2), datetime(2024, 10, 7), datetime(2025, 10, 6) + ], + 'King Birthday': [ + datetime(2023, 6, 12), datetime(2024, 6, 10), datetime(2025, 6, 9) + ], + 'Melbourne Cup': [ + datetime(2023, 11, 7), datetime(2024, 11, 5), datetime(2025, 11, 4) + ], + 'Diwali': [ + datetime(2023, 11, 12), datetime(2024, 11, 1), datetime(2025, 10, 20) + ], + 'Halloween': [ + datetime(2023, 10, 31), datetime(2024, 10, 
31), datetime(2025, 10, 31) + ], + 'Valentine Day': [ + datetime(2023, 2, 14), datetime(2024, 2, 14), datetime(2025, 2, 14) + ], + 'Back to School': [ + datetime(2023, 1, 30), datetime(2023, 7, 24), + datetime(2024, 1, 29), datetime(2024, 7, 22), + datetime(2025, 1, 27), datetime(2025, 7, 21) + ], + 'School Holidays': [ + datetime(2023, 4, 10), datetime(2023, 7, 1), datetime(2023, 9, 25), datetime(2023, 12, 18), + datetime(2024, 3, 28), datetime(2024, 7, 6), datetime(2024, 9, 23), datetime(2024, 12, 16), + datetime(2025, 4, 14), datetime(2025, 7, 5), datetime(2025, 9, 22), datetime(2025, 12, 15) + ] +} + +# Supply chain disruption events +supply_chain_events = { + 'Suez Canal': [datetime(2023, 3, 25), datetime(2023, 3, 29)], + 'Shipping Delays': [datetime(2023, 8, 15), datetime(2023, 8, 25)], + 'Port Strikes': [datetime(2024, 2, 10), datetime(2024, 2, 20)], + 'Fuel Price Spike': [datetime(2024, 9, 1), datetime(2024, 9, 15)], + 'Container Shortage': [datetime(2023, 11, 1), datetime(2023, 11, 30)], + 'COVID Lockdown': [datetime(2023, 5, 1), datetime(2023, 5, 14)], + 'Truck Driver Strike': [datetime(2024, 6, 15), datetime(2024, 6, 25)], + 'Factory Fire': [datetime(2024, 11, 10), datetime(2024, 11, 20)] +} + +# Weather events affecting agriculture and supply +weather_events = { + 'Flood Queensland': [datetime(2023, 2, 15), datetime(2023, 3, 15)], + 'Drought NSW': [datetime(2023, 6, 1), datetime(2023, 8, 31)], + 'Cyclone WA': [datetime(2024, 1, 20), datetime(2024, 2, 5)], + 'Heatwave Victoria': [datetime(2024, 12, 15), datetime(2025, 1, 15)], + 'Frost Tasmania': [datetime(2023, 9, 1), datetime(2023, 9, 30)], + 'Bushfire NSW': [datetime(2024, 10, 1), datetime(2024, 10, 31)], + 'Heavy Rain Melbourne': [datetime(2024, 3, 10), datetime(2024, 3, 25)], + 'Extreme Heat Adelaide': [datetime(2025, 2, 1), datetime(2025, 2, 14)] +} + +# Disease/pest outbreaks +disease_events = { + 'Avian Flu': [datetime(2023, 7, 1), datetime(2023, 9, 30)], + 'Foot and Mouth Scare': [datetime(2024, 4, 1), datetime(2024, 4, 30)], + 'White Spot Prawns': [datetime(2023, 10, 15), datetime(2023, 11, 15)], + 'Banana Disease': [datetime(2024, 8, 1), datetime(2024, 9, 15)], + 'Citrus Canker': [datetime(2025, 3, 1), datetime(2025, 4, 15)] +} + +# %% +# PRICING MULTIPLIERS BY CATEGORY +# =============================== + +# FIXED: Realistic category multipliers (reduced by 50-70%) +category_multipliers = { + 'Meat & Seafood': { + 'Christmas': 1.3, 'Easter': 1.2, 'Summer': 1.1, 'Winter': 0.95, + 'Avian Flu': 1.25, 'Foot and Mouth Scare': 1.15, 'White Spot Prawns': 1.2, + 'Drought NSW': 1.1, 'Flood Queensland': 1.08, + 'supply_chain_base': 1.15, 'weather_base': 1.1, 'disease_base': 1.2, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Fruit & Vegetables': { + 'Summer': 0.85, 'Winter': 1.2, 'Christmas': 1.1, 'Chinese New Year': 1.15, + 'Flood Queensland': 1.4, 'Drought NSW': 1.3, 'Cyclone WA': 1.25, + 'Heatwave Victoria': 1.15, 'Frost Tasmania': 1.2, 'Heavy Rain Melbourne': 1.1, + 'Banana Disease': 1.5, 'Citrus Canker': 1.4, + 'supply_chain_base': 1.08, 'weather_base': 1.3, 'disease_base': 1.4, + 'shock_prob': 0.12, 'shock_var': 0.25 + }, + 'Dairy, Eggs & Fridge': { + 'Christmas': 1.15, 'Easter': 1.1, 'Winter': 1.05, 'Back to School': 1.08, + 'Avian Flu': 1.3, 'Drought NSW': 1.15, 'Extreme Heat Adelaide': 1.1, + 'supply_chain_base': 1.1, 'weather_base': 1.08, 'disease_base': 1.25, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Bakery': { + 'Christmas': 1.2, 'Easter': 1.1, 'School Holidays': 1.08, 'Winter': 1.03, + 'supply_chain_base': 
1.05, 'weather_base': 1.03, + 'shock_prob': 0.05, 'shock_var': 0.08 + }, + 'Pantry': { + 'COVID Lockdown': 1.25, 'School Holidays': 1.1, 'Back to School': 1.15, + 'Container Shortage': 1.1, 'Shipping Delays': 1.08, + 'supply_chain_base': 1.08, 'weather_base': 1.03, + 'shock_prob': 0.05, 'shock_var': 0.1 + }, + 'Health & Beauty': { + 'New Year': 1.25, 'Valentine Day': 1.1, 'Winter': 1.08, + 'supply_chain_base': 1.03, 'shock_prob': 0.03, 'shock_var': 0.05 + }, + 'Drinks': { + 'Summer': 1.3, 'Christmas': 1.35, 'Australia Day': 1.15, 'Melbourne Cup': 1.1, + 'Heatwave Victoria': 1.2, 'Extreme Heat Adelaide': 1.18, + 'supply_chain_base': 1.08, 'weather_base': 1.1, + 'shock_prob': 0.05, 'shock_var': 0.1 + }, + 'Frozen': { + 'Summer': 1.5, 'Heatwave Victoria': 1.3, 'Extreme Heat Adelaide': 1.25, + 'Christmas': 1.3, 'School Holidays': 1.2, + 'supply_chain_base': 1.1, 'weather_base': 1.25, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Deli': { + 'Christmas': 1.4, 'Easter': 1.25, 'Melbourne Cup': 1.18, 'King Birthday': 1.1, + 'supply_chain_base': 1.08, 'shock_prob': 0.06, 'shock_var': 0.12 + }, + 'Household': { + 'Back to School': 1.15, 'Spring': 1.1, 'COVID Lockdown': 1.2, + 'supply_chain_base': 1.05, 'shock_prob': 0.04, 'shock_var': 0.08 + } +} + +# %% +# SUBCATEGORY-SPECIFIC SEASONAL EFFECTS +# ==================================== + +# FIXED: Realistic subcategory seasonal effects (reduced) +subcat_seasonal_effects = { + 'Fruit': {'Summer': 0.7, 'Winter': 1.4}, # Reduced from 0.4/2.5 + 'Vegetables (Leafy/Salad)': {'Summer': 1.15, 'Winter': 0.95}, # Reduced from 1.4/0.9 + 'Vegetables (Root/Onion/Garlic)': {'Winter': 0.9, 'Summer': 1.08}, # Reduced from 0.8/1.2 + 'Vegetables (Fruiting)': {'Summer': 0.8, 'Winter': 1.2}, # Reduced from 0.6/1.5 + 'Lamb': {'Easter': 1.4, 'Christmas': 1.25}, # Reduced from 2.0/1.6 + 'Turkey': {'Christmas': 1.8, 'Easter': 1.05}, # Reduced from 3.0/1.2 + 'Fish': {'Christmas': 1.5, 'Easter': 1.3, 'Summer': 1.1}, # Reduced from 2.4/1.8/1.3 + 'Prawns': {'Christmas': 2.0, 'Chinese New Year': 1.6}, # Reduced from 3.5/2.2 + 'Ice Cream': {'Summer': 2.2, 'Heatwave Victoria': 1.8, 'Winter': 0.5}, # Reduced from 4.0/3.0/0.3 + 'Frozen Vegetables': {'Winter': 1.1, 'COVID Lockdown': 1.2} # Reduced from 1.3/1.5 +} + +# %% +# SUBCATEGORY PROMOTION PROBABILITIES +# ================================== + +# Dictionary of subcat apply probabilities +subcat_apply_probs = { + 'Pork': 0.4, 'Beef': 0.4, 'Chicken': 0.4, 'Prawns': 0.35, 'Pantry/Other': 0.15, + 'Lamb': 0.35, 'Mixed Meat': 0.3, 'Salmon': 0.35, 'Fish': 0.35, 'Turkey': 0.3, + 'Tuna': 0.3, 'Kangaroo': 0.3, 'Seafood': 0.35, 'Plant-Based': 0.25, 'Veal': 0.3, + 'Duck': 0.3, 'Trout': 0.35, 'Mussels': 0.35, 'Venison': 0.3, 'Wallaby': 0.3, + 'Crab': 0.35, 'Fruit': 0.25, 'Other Items (F&V Section)': 0.2, + 'Vegetables (Fruiting)': 0.25, 'Vegetables (Root/Onion/Garlic)': 0.2, + 'Vegetables (Stem/Flower/Pod)': 0.2, 'Vegetables (Leafy/Salad)': 0.25, + 'Mushrooms': 0.25, 'Herbs/Sprouts': 0.2, 'Value-Added Produce': 0.2, + 'Nuts/Seeds/Dried Fruit': 0.25, 'Yoghurt Specialty': 0.25, 'Butter Standard': 0.2, + 'Milk Specialty': 0.2, 'Cheese Standard': 0.2, 'Cheese Specialty': 0.25, + 'Butter Specialty': 0.2, 'Yoghurt Standard': 0.2, 'Outsider': 0.15, + 'Cream Standard': 0.2, 'Milk Standard': 0.15, 'Eggs Standard': 0.2, + 'Bread Loaves': 0.15, 'Wraps & Flatbreads': 0.2, 'Cakes & Slices': 0.3, + 'Rolls & Buns': 0.2, 'Savoury Bakery Items': 0.25, 'Sourdough & Artisan Breads': 0.25, + 'Sweet Pastries & Donuts': 0.3, 'Biscuits & Cookies': 0.3, 'Pancakes, 
Waffles & Crepes': 0.25, + 'Muffins & Cupcakes': 0.3, 'Seafood (Processed/Cooked)': 0.35, 'Bacon': 0.35, + 'Ham': 0.35, 'Platters/Kits': 0.3, 'Chicken (Processed/Cooked)': 0.35, 'Pantry': 0.15, + 'Salami/Pepperoni/Chorizo': 0.25, 'Crackers/Breadsticks': 0.25, 'Antipasto/Olives/Pickles': 0.25, + 'Turkey (Processed/Cooked)': 0.3, 'Beef (Processed/Cooked)': 0.35, 'Frankfurts/Sausages': 0.3, + 'Pork (Processed/Cooked)': 0.35, 'Cheese': 0.2, 'Bakery': 0.25, 'Confectionery': 0.3, + 'Other Deli': 0.2, 'Prepared Meals': 0.25, 'Dips/Pate': 0.25, 'Snacks (Sweet)': 0.3, + 'Canned Goods': 0.15, 'Meal Kits/Bases/Instant Meals': 0.25, 'Breakfast Cereals': 0.25, + 'Pasta/Rice/Noodles/Grains': 0.15, 'Snacks (Savoury)': 0.3, 'Beverages (Shelf-Stable)': 0.25, + 'Spreads/Oils/Condiments': 0.2, 'Baking Mixes': 0.2, 'Baking Ingredients': 0.2, + 'Other Pantry Items': 0.15, 'Juice/Smoothie': 0.25, 'Functional/Health Drink': 0.25, + 'Other Drinks': 0.2, 'Soft Drink/Mixer': 0.3, 'Water': 0.15, 'Milk': 0.15, + 'Non-Drink Item': 0.15, 'Tea': 0.2, 'Coffee': 0.25, 'Alcoholic Beverages (Low/No Alc)': 0.3, + 'Frozen Chips': 0.3, 'Ice Cream': 0.3, 'Frozen Desserts': 0.3, 'Frozen Meat': 0.3, + 'Frozen Poultry': 0.3, 'Frozen Fruits': 0.25, 'Frozen Vegetables': 0.25, + 'Frozen Pastry': 0.25, 'Frozen Meals': 0.25, 'Frozen Seafood': 0.3, 'Other Frozen': 0.2, + 'Stationery': 0.1, 'Dishwashing': 0.15, 'Bags': 0.15, 'Laundry Care': 0.2, + 'Kitchenware & Food Storage': 0.2, 'Paper Products': 0.15, 'Cleaning Solutions & Wipes': 0.2, + 'Cleaning Tools & Accessories': 0.15, 'Home Maintenance & General': 0.15, + 'Air Care & Pest Control': 0.2, 'Vitamins & Supplements': 0.2, 'Skincare': 0.25, + 'Wash Products': 0.2, 'First Aid & Wellness': 0.15, 'Health & Medicines': 0.15, + 'Feminine & Incontinence Care': 0.15, 'Oral Care': 0.2, 'Deodorants & Antiperspirants': 0.25, + 'Hair Care': 0.2, 'Shaving & Hair Removal': 0.2, 'First Aid & Wellness Accessories': 0.15, + 'Deodorants & Body Sprays': 0.25, 'Medicines & Health Treatments': 0.15, + "Shaving & Men's Grooming": 0.2, +} + +# %% +# CORE PRICING AND ANALYSIS FUNCTIONS +# =================================== + +def get_fortnight_col(date): + """Get fortnight column name for discount lookup""" + month_abbr = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + month_idx = date.month - 1 + month_name = month_abbr[month_idx] + half = '01' if date.day <= 15 else '02' + return f"{month_name}-{half}" + +def get_season(date): + """Get season for Southern Hemisphere""" + month = date.month + if month in [12, 1, 2]: + return 'Summer' + elif month in [3, 4, 5]: + return 'Autumn' + elif month in [6, 7, 8]: + return 'Winter' + else: + return 'Spring' + +def is_event_active(date, event_dates, days_impact=7): + """Check if date is within impact period of an event""" + for event_date in event_dates: + if isinstance(event_date, list): + if len(event_date) == 2: + start_date, end_date = event_date + if start_date <= date <= end_date + timedelta(days=days_impact): + return True + else: + if abs((date - event_date).days) <= days_impact: + return True + return False + +def apply_enhanced_factors(date, category, subcat, base_price, base_discount_pct): + """Apply comprehensive pricing factors including all real-world events - FIXED VERSION""" + multipliers = category_multipliers.get(category, {}) + subcat_effects = subcat_seasonal_effects.get(subcat, {}) + + mult = 1.0 + event_description = [] + + # Seasonal effects + season = get_season(date) + if season in multipliers: + 
seasonal_mult = multipliers[season] + mult *= seasonal_mult + if seasonal_mult != 1.0: + event_description.append(f"{season} effect: {seasonal_mult:.2f}x") + + # Subcat seasonal effects (override category if more specific) + if season in subcat_effects: + subcat_mult = subcat_effects[season] + mult = mult / multipliers.get(season, 1.0) * subcat_mult + event_description.append(f"{subcat} {season}: {subcat_mult:.2f}x") + + # Holiday effects - REDUCED PROBABILITY + for holiday, dates in holidays.items(): + if is_event_active(date, dates, days_impact=14): + # Only apply if random chance (reduce from 100% to 40%) + if np.random.rand() < 0.4: + if holiday in multipliers: + holiday_mult = multipliers[holiday] + mult *= holiday_mult + event_description.append(f"{holiday}: {holiday_mult:.2f}x") + elif holiday in subcat_effects: + holiday_mult = subcat_effects[holiday] + mult *= holiday_mult + event_description.append(f"{subcat} {holiday}: {holiday_mult:.2f}x") + + # Supply chain disruptions - REDUCED PROBABILITY + for event, dates in supply_chain_events.items(): + if is_event_active(date, dates, days_impact=21): + # Only apply if random chance (reduce from 100% to 20%) + if np.random.rand() < 0.2: + supply_mult = multipliers.get('supply_chain_base', 1.08) + if event in multipliers: + supply_mult = multipliers[event] + mult *= supply_mult + event_description.append(f"Supply chain ({event}): {supply_mult:.2f}x") + + # Weather events - CATEGORY-SPECIFIC AND REDUCED PROBABILITY + for event, dates in weather_events.items(): + if is_event_active(date, dates, days_impact=30): + # Only apply to relevant categories and reduce probability + weather_categories = ['Fruit & Vegetables', 'Meat & Seafood', 'Dairy, Eggs & Fridge'] + if category in weather_categories and np.random.rand() < 0.25: + weather_mult = multipliers.get('weather_base', 1.1) + if event in multipliers: + weather_mult = multipliers[event] + mult *= weather_mult + event_description.append(f"Weather ({event}): {weather_mult:.2f}x") + + # Disease/pest events - CATEGORY-SPECIFIC AND REDUCED PROBABILITY + for event, dates in disease_events.items(): + if is_event_active(date, dates, days_impact=60): + # Only apply to relevant categories + disease_categories = { + 'Avian Flu': ['Meat & Seafood', 'Dairy, Eggs & Fridge'], + 'Foot and Mouth Scare': ['Meat & Seafood', 'Dairy, Eggs & Fridge'], + 'White Spot Prawns': ['Meat & Seafood'], + 'Banana Disease': ['Fruit & Vegetables'], + 'Citrus Canker': ['Fruit & Vegetables'] + } + relevant_categories = disease_categories.get(event, []) + if category in relevant_categories and np.random.rand() < 0.3: + disease_mult = multipliers.get('disease_base', 1.2) + if event in multipliers: + disease_mult = multipliers[event] + mult *= disease_mult + event_description.append(f"Disease ({event}): {disease_mult:.2f}x") + + # Random market shocks - REDUCED PROBABILITY + shock_prob = multipliers.get('shock_prob', 0.05) + if np.random.rand() < shock_prob: + shock = np.random.normal(0, multipliers.get('shock_var', 0.1)) + shock_mult = 1 + shock + mult *= shock_mult + if abs(shock) > 0.05: # Only log significant shocks + event_description.append(f"Market shock: {shock_mult:.2f}x") + + # Inflation trend (1.5-3% annually) - REDUCED + current_date = datetime(2025, 8, 19) + years_back = (current_date - date).days / 365.25 + inflation_rate = np.random.uniform(0.015, 0.03) # Reduced from 0.02-0.04 + trend_mult = (1 + inflation_rate) ** years_back + mult *= trend_mult + + # Competitor effects (random promotions) - REDUCED PROBABILITY + 
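# Modeled as an occasional 2-8% price cut (5% chance per product-week), independent of the promo_flag discounts applied downstream +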
if np.random.rand() < 0.05: # Reduced from 0.1 + competitor_effect = np.random.uniform(0.92, 0.98) # Reduced impact + mult *= competitor_effect + event_description.append(f"Competitor pressure: {competitor_effect:.2f}x") + + # CRITICAL FIX: Enforce realistic bounds + mult = np.clip(mult, 0.5, 2.5) # Prevent extreme multipliers + + # Calculate final prices + normal_price = base_price * mult + + # CRITICAL FIX: Ensure minimum price + normal_price = max(normal_price, base_price * 0.3) # Never less than 30% of base + + # Discount adjustments with realistic bounds + if mult > 1.2: + discount_multiplier = 1.1 # Reduced from 1.2 + elif mult < 0.9: + discount_multiplier = 0.9 # Reduced from 0.8 + else: + discount_multiplier = 1.0 + + adjusted_discount = base_discount_pct * discount_multiplier + adjusted_discount = max(0, min(0.7, adjusted_discount)) # Max 70% discount + + return normal_price, adjusted_discount, event_description + +# %% +# HISTORICAL DATA GENERATION +# ========================= + +def generate_initial_discounts(): + """Generate initial discount data (first code block functionality)""" + + # Load the main dataset + df = pd.read_csv('/content/coles_new.csv') + df.columns = df.columns.str.strip() + + # Load the fortnightly discounts dataset + discounts_df = pd.read_csv('/content/fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] # Fortnight cols like Jan-01 + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Set the current date + current_date = datetime(2025, 8, 5) + df['date'] = current_date + + # Parse weights to grams + df['grams'] = df['weights'].apply(parse_to_grams) + + # Compute price per 100g + price_col = 'item_price' + if price_col in df.columns: + df['price_per_100g'] = df.apply( + lambda row: (row[price_col] / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + axis=1 + ) + + # Get current fortnight column + fortnight_col = get_fortnight_col(current_date) + + # Initialize discount columns + df['promo_flag'] = 0 + df['discount_pct'] = 0.0 + df['discounted_price'] = df[price_col] + + # Apply discounts per item + for idx, row in df.iterrows(): + subcat = row['subcat'] + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + base_discount_pct = rule.get(fortnight_col, 0) / 100 + apply_prob = subcat_apply_probs.get(subcat, 0.2) + promo_flag = (base_discount_pct > 0) and (np.random.rand() < apply_prob) + + if promo_flag: + discount_pct = base_discount_pct + np.random.normal(0, 0.05) + discount_pct = np.clip(discount_pct, 0, 0.9) + df.at[idx, 'promo_flag'] = 1 + df.at[idx, 'discount_pct'] = discount_pct + df.at[idx, 'discounted_price'] = row[price_col] * (1 - discount_pct) + + # Save to new CSV + df.to_csv('coles_with_discounts.csv', index=False) + print("Initial discount data generated and saved to 'coles_with_discounts.csv'") + + return df + +def generate_enhanced_historical_data(): + """Main function to generate enhanced historical data""" + + # Load the main dataset + df = pd.read_csv('/content/coles_new.csv') + 
df.columns = df.columns.str.strip() + + # Load the fortnightly discounts dataset + discounts_df = pd.read_csv('/content/fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] # Fortnight cols like Jan-01 + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Set the current date + current_date = datetime(2025, 8, 19) + + # Generate historical dates (104 weeks = 2 years) + dates = pd.date_range(end=current_date, periods=104, freq='W') + + # Parse weights and calculate price per 100g + df['grams'] = df['weights'].apply(parse_to_grams) + df['price_per_100g'] = df.apply( + lambda row: (row['item_price'] / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + axis=1 + ) + + print(f"Generating enhanced historical data for {len(df)} products over {len(dates)} weeks...") + + all_historical_data = [] + + for idx, (_, row) in enumerate(df.iterrows()): + if idx % 1000 == 0: + print(f"Processing product {idx+1}/{len(df)}") + + subcat = row['subcat'] + category = row['category'] + base_price = row['item_price'] + + # Get discount rule for this subcat + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + apply_prob = subcat_apply_probs.get(subcat, 0.2) + + for date in dates: + # Get base discount for this fortnight + fortnight = get_fortnight_col(date) + base_discount_pct = float(rule.get(fortnight, 0)) / 100 + + # Apply enhanced factors + normal_price, discount_pct, events = apply_enhanced_factors( + date, category, subcat, base_price, base_discount_pct + ) + + # Determine if promotion is active + promo_flag = (base_discount_pct > 0) and (np.random.rand() < apply_prob) + if promo_flag: + discount_pct = discount_pct + np.random.normal(0, 0.05) + discount_pct = np.clip(discount_pct, 0, 0.9) + final_discount = discount_pct + else: + final_discount = 0 + + discounted_price = normal_price * (1 - final_discount) + + # Store the record + record = { + 'date': date, + 'product_code': row['product_code'], + 'category': category, + 'essential_flag': row['essential_flag'], + 'item_name': row['item_name'], + 'subcat': subcat, + 'weights': row['weights'], + 'unit_price': row.get('unit_price', 0), + 'brand_name': row['brand_name'], + 'grams': row['grams'], + 'price_per_100g': (discounted_price / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + 'normal_price': round(normal_price, 2), + 'promo_flag': int(promo_flag), + 'discount_pct': round(final_discount, 4), + 'discounted_price': round(discounted_price, 2), + 'price_multiplier': round(normal_price / base_price, 3), + 'events_active': '; '.join(events) if events else 'None' + } + + all_historical_data.append(record) + + # Convert to DataFrame + historical_df = pd.DataFrame(all_historical_data) + + print(f"Generated {len(historical_df):,} historical records") + return historical_df + +# %% +# SIMPLIFIED CATEGORY-BASED GENERATION (ALTERNATIVE APPROACH) +# ========================================================== + +def generate_synthetic_for_category(category_df, dates): + """Generate 
synthetic data for a specific category""" + synthetic_dfs = [] + + for _, row in category_df.iterrows(): + subcat = row['subcat'] + category = row['category'] + base_price = row['item_price'] + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + apply_prob = subcat_apply_probs.get(subcat, 0.2) + + item_df = pd.DataFrame({'date': dates}) + item_df['product_code'] = row['product_code'] + item_df['item_name'] = row['item_name'] + item_df['brand_name'] = row['brand_name'] + item_df['weights'] = row['weights'] + item_df['grams'] = row['grams'] + item_df['price_per_100g'] = row['price_per_100g'] + item_df['subcat'] = subcat + item_df['category'] = category + + item_df['normal_price'] = np.nan + item_df['discount_pct'] = 0.0 + item_df['promo_flag'] = 0 + item_df['discounted_price'] = np.nan + + for i, date in enumerate(dates): + fortnight = get_fortnight_col(date) + base_discount_pct = float(rule.get(fortnight, 0)) / 100 + normal_price, discount_pct = apply_enhanced_factors(date, category, subcat, base_price, base_discount_pct)[:2] + + promo_flag = (discount_pct > 0) and (np.random.rand() < apply_prob) + item_df.at[i, 'normal_price'] = normal_price + item_df.at[i, 'promo_flag'] = 1 if promo_flag else 0 + item_df.at[i, 'discount_pct'] = discount_pct if promo_flag else 0 + item_df.at[i, 'discounted_price'] = normal_price * (1 - item_df.at[i, 'discount_pct']) + + synthetic_dfs.append(item_df) + return pd.concat(synthetic_dfs, ignore_index=True) + +# %% +# EXPLORATORY DATA ANALYSIS FUNCTIONS +# =================================== + +def create_essential_eda(df): + """Create 5 essential EDA plots""" + + print("Creating Essential EDA Plots...") + + # Setup modern style + sns.set_style("whitegrid") + plt.rcParams['figure.figsize'] = (12, 6) + plt.rcParams['font.size'] = 12 + + # 1. Time Series Analysis - Average Prices by Category + monthly_avg = df.groupby([df['date'].dt.to_period('M'), 'category'])['discounted_price'].mean().reset_index() + monthly_avg['date'] = monthly_avg['date'].dt.to_timestamp() + + fig1 = px.line( + monthly_avg, + x='date', + y='discounted_price', + color='category', + title='1. Average Monthly Prices by Category Over Time', + labels={'discounted_price': 'Average Price (AUD)', 'date': 'Date'} + ) + fig1.update_layout(height=600, hovermode='x unified') + fig1.show() + + # 2. Event Impact Analysis + event_impact = df.copy() + event_impact['has_events'] = event_impact['events_active'] != 'None' + comparison = event_impact.groupby(['category', 'has_events'])['price_multiplier'].mean().reset_index() + comparison['event_status'] = comparison['has_events'].map({True: 'With Events', False: 'Normal'}) + + fig2 = px.bar( + comparison, + x='category', + y='price_multiplier', + color='event_status', + title='2. Price Impact: Normal vs Event Periods', + labels={'price_multiplier': 'Average Price Multiplier'}, + barmode='group' + ) + fig2.update_layout(height=600) + fig2.update_xaxes(tickangle=45) + fig2.show() + + # 3. 
Seasonal Patterns Heatmap + df['month'] = df['date'].dt.month + seasonal_data = df.groupby(['category', 'month'])['discounted_price'].mean().reset_index() + seasonal_pivot = seasonal_data.pivot(index='category', columns='month', values='discounted_price') + + fig3 = go.Figure(data=go.Heatmap( + z=seasonal_pivot.values, + x=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], + y=seasonal_pivot.index, + colorscale='RdYlBu_r', + text=np.round(seasonal_pivot.values, 2), + texttemplate="%{text}", + textfont={"size": 10}, + hoverongaps=False + )) + fig3.update_layout( + title='3. Seasonal Price Patterns by Category', + height=600, + xaxis_title="Month", + yaxis_title="Category" + ) + fig3.show() + + # 4. Price Volatility by Category + volatility = df.groupby(['category', df['date'].dt.to_period('M')])['discounted_price'].agg(['mean', 'std']).reset_index() + volatility['date'] = volatility['date'].dt.to_timestamp() + volatility['cv'] = volatility['std'] / volatility['mean'] * 100 # Coefficient of variation + + fig4 = px.line( + volatility, + x='date', + y='cv', + color='category', + title='4. Price Volatility (Coefficient of Variation) by Category', + labels={'cv': 'Coefficient of Variation (%)', 'date': 'Date'} + ) + fig4.update_layout(height=600) + fig4.show() + + # 5. Distribution Overview + fig5 = make_subplots( + rows=2, cols=2, + subplot_titles=('Price Distribution', 'Discount Distribution', 'Promotion Rate by Category', 'Price Multiplier Distribution'), + specs=[[{'type': 'histogram'}, {'type': 'histogram'}], + [{'type': 'bar'}, {'type': 'histogram'}]] + ) + + # Price distribution + fig5.add_trace( + go.Histogram(x=df['discounted_price'], nbinsx=50, name='Price Distribution'), + row=1, col=1 + ) + + # Discount distribution (only when discount > 0) + discount_data = df[df['discount_pct'] > 0]['discount_pct'] * 100 + fig5.add_trace( + go.Histogram(x=discount_data, nbinsx=30, name='Discount Distribution'), + row=1, col=2 + ) + + # Promotion rates by category + promo_rates = df.groupby('category')['promo_flag'].mean() * 100 + fig5.add_trace( + go.Bar(x=promo_rates.index, y=promo_rates.values, name='Promotion Rate %'), + row=2, col=1 + ) + + # Price multiplier distribution + fig5.add_trace( + go.Histogram(x=df['price_multiplier'], nbinsx=50, name='Price Multiplier'), + row=2, col=2 + ) + + fig5.update_layout( + title="5. 
Data Distribution Overview", + height=800, + showlegend=False + ) + fig5.update_xaxes(tickangle=45, row=2, col=1) + fig5.show() + + # Summary Statistics + print("\nSUMMARY STATISTICS:") + print("="*50) + + for category in df['category'].unique(): + cat_data = df[df['category'] == category] + print(f"\n{category}:") + print(f" • Records: {len(cat_data):,}") + print(f" • Avg Price: ${cat_data['discounted_price'].mean():.2f}") + print(f" • Price Range: ${cat_data['discounted_price'].min():.2f} - ${cat_data['discounted_price'].max():.2f}") + print(f" • Promotion Rate: {cat_data['promo_flag'].mean()*100:.1f}%") + print(f" • Avg Discount: {cat_data['discount_pct'].mean()*100:.1f}%") + print(f" • Avg Price Multiplier: {cat_data['price_multiplier'].mean():.2f}x") + +def analyze_time_series_decomposition(df, category='Meat & Seafood'): + """Perform time series decomposition for a specific category""" + + print(f"\nTime Series Decomposition Analysis for {category}") + + # Aggregate data by date for the category + ts_data = df[df['category'] == category].groupby('date')['discounted_price'].mean() + ts_data = ts_data.asfreq('W', method='ffill') + + if len(ts_data) >= 52: # Need at least 1 year + try: + decomposition = seasonal_decompose(ts_data, model='multiplicative', period=52) + + fig = make_subplots( + rows=4, cols=1, + subplot_titles=('Original', 'Trend', 'Seasonal', 'Residual'), + vertical_spacing=0.08 + ) + + fig.add_trace(go.Scatter(x=ts_data.index, y=ts_data.values, + mode='lines', name='Original'), row=1, col=1) + fig.add_trace(go.Scatter(x=decomposition.trend.index, y=decomposition.trend.values, + mode='lines', name='Trend'), row=2, col=1) + fig.add_trace(go.Scatter(x=decomposition.seasonal.index, y=decomposition.seasonal.values, + mode='lines', name='Seasonal'), row=3, col=1) + fig.add_trace(go.Scatter(x=decomposition.resid.index, y=decomposition.resid.values, + mode='lines', name='Residual'), row=4, col=1) + + fig.update_layout( + title=f'Time Series Decomposition - {category}', + height=1000, + showlegend=False + ) + fig.show() + + print(f"Decomposition completed for {category}") + return decomposition + + except Exception as e: + print(f"Decomposition failed: {e}") + return None + else: + print(f"Insufficient data for decomposition ({len(ts_data)} weeks)") + return None + +# %% +# MAIN EXECUTION FUNCTIONS +# ======================== + +def run_complete_analysis(): + """Run the complete enhanced historical data generation and analysis""" + + print("Starting Enhanced Historical Data Generation") + print("="*60) + + # Generate enhanced historical data + historical_df = generate_enhanced_historical_data() + + # Save to CSV + output_file = 'enhanced_historical_data.csv' + historical_df.to_csv(output_file, index=False) + print(f"Saved to {output_file}") + + # Create essential EDA plots + print("\nCreating Essential EDA Plots...") + create_essential_eda(historical_df) + + # Event analysis summary + event_records = historical_df[historical_df['events_active'] != 'None'] + if len(event_records) > 0: + print(f"\nEvent Impact Summary:") + print(f" • {len(event_records):,} records affected by events ({len(event_records)/len(historical_df)*100:.1f}%)") + + # Count events + all_events = [] + for events in event_records['events_active']: + all_events.extend([e.split(':')[0].strip() for e in events.split(';')]) + + from collections import Counter + event_counts = Counter(all_events) + print("\nTop 10 Most Frequent Events:") + for event, count in event_counts.most_common(10): + print(f" • {event}: 
{count:,} occurrences") + + print(f"\nAnalysis Complete!") + print(f"Generated {len(historical_df):,} records for ARIMA/LSTM modeling") + print(f"Date range: {historical_df['date'].min().date()} to {historical_df['date'].max().date()}") + + return historical_df + +def run_category_based_generation(): + """Alternative approach: Generate data by category""" + + # Load data + df = pd.read_csv('coles_with_discounts.csv') + df.columns = df.columns.str.strip() + + # Load discounts dict + discounts_df = pd.read_csv('/content/fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + global discounts_dict + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Current date and historical dates + current_date = datetime(2025, 8, 19) + dates = pd.date_range(end=current_date, periods=104, freq='W') + + # Unique categories from dataset + unique_categories = df['category'].unique() + + # Process categories + all_category_data = [] + for cat in unique_categories: + cat_df = df[df['category'] == cat] + if not cat_df.empty: + synth_cat = generate_synthetic_for_category(cat_df, dates) + synth_cat.to_csv(f'synthetic_{cat.replace(" ", "_").replace("&", "and")}.csv', index=False) + all_category_data.append(synth_cat) + print(f"Generated for {cat}: {len(synth_cat)} rows") + + # Combine all categories + full_synth = pd.concat(all_category_data, ignore_index=True) + full_synth.to_csv('synthetic_historical_data_v6.csv', index=False) + + print(f"Complete dataset saved: {len(full_synth)} records") + return full_synth + +# %% +# DATA INSPECTION AND EDA OUTPUT CELL +# =================================== + +def inspect_and_analyze_data(): + """ + Complete data inspection and EDA output to see how the data is performing + This function will show you everything about your dataset + """ + + print("=" * 80) + print("GROCERY PRICE ANALYSIS - COMPLETE DATA INSPECTION") + print("=" * 80) + + try: + # Try to load existing enhanced data first + print("\n1. LOADING DATA...") + try: + df = pd.read_csv('enhanced_historical_data.csv') + df['date'] = pd.to_datetime(df['date']) + print("✓ Loaded existing enhanced historical data") + data_source = "Enhanced Historical Data" + except FileNotFoundError: + try: + df = pd.read_csv('synthetic_historical_data_v6.csv') + df['date'] = pd.to_datetime(df['date']) + print("✓ Loaded existing synthetic data") + data_source = "Synthetic Historical Data" + except FileNotFoundError: + print("⚠ No existing data found. Generating new data...") + df = run_complete_analysis() + data_source = "Newly Generated Data" + + print(f"Data Source: {data_source}") + print(f"Data Shape: {df.shape[0]:,} rows × {df.shape[1]} columns") + + # 2. DATA QUALITY ASSESSMENT + print("\n" + "=" * 80) + print("2. 
DATA QUALITY ASSESSMENT") + print("=" * 80) + + print(f"\nDATE RANGE:") + print(f" • Start Date: {df['date'].min().strftime('%Y-%m-%d')}") + print(f" • End Date: {df['date'].max().strftime('%Y-%m-%d')}") + print(f" • Total Weeks: {df['date'].nunique()}") + print(f" • Date Coverage: {(df['date'].max() - df['date'].min()).days} days") + + print(f"\nMISSING VALUES:") + missing = df.isnull().sum() + missing_pct = (missing / len(df) * 100).round(2) + for col in df.columns: + if missing[col] > 0: + print(f" • {col}: {missing[col]:,} ({missing_pct[col]}%)") + + print(f"\nDATA TYPES:") + for col, dtype in df.dtypes.items(): + unique_count = df[col].nunique() + print(f" • {col}: {dtype} ({unique_count:,} unique values)") + + print(f"\nCATEGORY BREAKDOWN:") + category_counts = df['category'].value_counts() + for cat, count in category_counts.items(): + pct = (count / len(df) * 100) + print(f" • {cat}: {count:,} records ({pct:.1f}%)") + + # 3. PRICE ANALYSIS + print("\n" + "=" * 80) + print("3. PRICE ANALYSIS") + print("=" * 80) + + print(f"\nOVERALL PRICE STATISTICS:") + price_stats = df['discounted_price'].describe() + print(f" • Mean Price: ${price_stats['mean']:.2f}") + print(f" • Median Price: ${price_stats['50%']:.2f}") + print(f" • Price Range: ${price_stats['min']:.2f} - ${price_stats['max']:.2f}") + print(f" • Standard Deviation: ${price_stats['std']:.2f}") + print(f" • 25th Percentile: ${price_stats['25%']:.2f}") + print(f" • 75th Percentile: ${price_stats['75%']:.2f}") + + print(f"\nPRICE BY CATEGORY:") + for category in df['category'].unique(): + cat_data = df[df['category'] == category]['discounted_price'] + print(f" • {category}:") + print(f" - Mean: ${cat_data.mean():.2f}") + print(f" - Median: ${cat_data.median():.2f}") + print(f" - Range: ${cat_data.min():.2f} - ${cat_data.max():.2f}") + + # 4. PROMOTION AND DISCOUNT ANALYSIS + print("\n" + "=" * 80) + print("4. PROMOTION AND DISCOUNT ANALYSIS") + print("=" * 80) + + overall_promo_rate = df['promo_flag'].mean() * 100 + print(f"\nOVERALL PROMOTION RATE: {overall_promo_rate:.1f}%") + + promoted_items = df[df['promo_flag'] == 1] + if len(promoted_items) > 0: + avg_discount = promoted_items['discount_pct'].mean() * 100 + print(f"AVERAGE DISCOUNT (when promoted): {avg_discount:.1f}%") + + print(f"\nPROMOTION RATES BY CATEGORY:") + for category in df['category'].unique(): + cat_promo_rate = df[df['category'] == category]['promo_flag'].mean() * 100 + cat_avg_discount = df[(df['category'] == category) & (df['promo_flag'] == 1)]['discount_pct'].mean() * 100 + print(f" • {category}: {cat_promo_rate:.1f}% promo rate, {cat_avg_discount:.1f}% avg discount") + + # 5. EVENT IMPACT ANALYSIS + print("\n" + "=" * 80) + print("5. 
EVENT IMPACT ANALYSIS") + print("=" * 80) + + if 'events_active' in df.columns: + event_records = df[df['events_active'] != 'None'] + event_impact_rate = len(event_records) / len(df) * 100 + print(f"\nEVENT IMPACT RATE: {event_impact_rate:.1f}% of records affected by events") + + if len(event_records) > 0: + # Event frequency analysis + all_events = [] + for events in event_records['events_active']: + if pd.notna(events) and events != 'None': + all_events.extend([e.split(':')[0].strip() for e in str(events).split(';')]) + + if all_events: + from collections import Counter + event_counts = Counter(all_events) + print(f"\nTOP 10 MOST FREQUENT EVENTS:") + for event, count in event_counts.most_common(10): + pct = (count / len(df) * 100) + print(f" • {event}: {count:,} occurrences ({pct:.2f}%)") + + # Price impact of events + normal_prices = df[df['events_active'] == 'None']['discounted_price'].mean() + event_prices = event_records['discounted_price'].mean() + price_impact = ((event_prices - normal_prices) / normal_prices * 100) + print(f"\nPRICE IMPACT OF EVENTS:") + print(f" • Normal periods avg price: ${normal_prices:.2f}") + print(f" • Event periods avg price: ${event_prices:.2f}") + print(f" • Price increase during events: {price_impact:.1f}%") + + # 6. TEMPORAL PATTERNS + print("\n" + "=" * 80) + print("6. TEMPORAL PATTERNS") + print("=" * 80) + + df['month'] = df['date'].dt.month + df['year'] = df['date'].dt.year + df['quarter'] = df['date'].dt.quarter + + print(f"\nMONTHLY PRICE PATTERNS:") + monthly_prices = df.groupby('month')['discounted_price'].mean() + month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', + 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + for month, price in monthly_prices.items(): + print(f" • {month_names[month-1]}: ${price:.2f}") + + print(f"\nQUARTERLY TRENDS:") + quarterly_prices = df.groupby('quarter')['discounted_price'].mean() + for quarter, price in quarterly_prices.items(): + print(f" • Q{quarter}: ${price:.2f}") + + # 7. CREATE ALL VISUALIZATIONS + print("\n" + "=" * 80) + print("7. CREATING VISUALIZATIONS") + print("=" * 80) + + create_essential_eda(df) + + # 8. ADDITIONAL INSIGHTS + print("\n" + "=" * 80) + print("8. KEY INSIGHTS AND RECOMMENDATIONS") + print("=" * 80) + + # Price volatility analysis + price_volatility = df.groupby('category')['discounted_price'].std().sort_values(ascending=False) + print(f"\nMOST VOLATILE CATEGORIES (by price std dev):") + for category, volatility in price_volatility.head().items(): + print(f" • {category}: ${volatility:.2f} std dev") + + # Best promotion opportunities + low_promo_categories = df.groupby('category')['promo_flag'].mean().sort_values().head() + print(f"\nCATEGORIES WITH LOWEST PROMOTION RATES (opportunities):") + for category, rate in low_promo_categories.items(): + print(f" • {category}: {rate*100:.1f}% promotion rate") + + # Seasonal opportunities + summer_winter_diff = df[df['month'].isin([12, 1, 2])]['discounted_price'].mean() - df[df['month'].isin([6, 7, 8])]['discounted_price'].mean() + print(f"\nSEASONAL PRICE DIFFERENCE:") + print(f" • Summer vs Winter avg price difference: ${summer_winter_diff:.2f}") + + print("\n" + "=" * 80) + print("9. 
DATA QUALITY SUMMARY") + print("=" * 80) + + # Data completeness score + completeness = (1 - df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100 + print(f"\nDATA COMPLETENESS: {completeness:.1f}%") + + # Data distribution health + price_outliers = len(df[df['discounted_price'] > df['discounted_price'].quantile(0.99)]) + print(f"PRICE OUTLIERS (>99th percentile): {price_outliers:,} records ({price_outliers/len(df)*100:.2f}%)") + + # Time series regularity + date_gaps = df['date'].drop_duplicates().sort_values().diff().dropna() + regular_intervals = (date_gaps == pd.Timedelta(days=7)).mean() * 100 + print(f"TIME SERIES REGULARITY: {regular_intervals:.1f}% regular weekly intervals") + + print(f"\n" + "=" * 80) + print("ANALYSIS COMPLETE!") + print("=" * 80) + print(f"Dataset is ready for ARIMA/LSTM modeling") + print(f"Key features available: prices, promotions, events, seasonality") + print(f"Recommended next steps:") + print(f" 1. Use price_multiplier and events_active as external regressors") + print(f" 2. Consider category-specific models due to different volatility patterns") + print(f" 3. Leverage seasonal decomposition for feature engineering") + + return df + + except Exception as e: + print(f" Error during analysis: {str(e)}") + import traceback + traceback.print_exc() + return None + +# %% +# RUN COMPLETE INSPECTION AND ANALYSIS +# ==================================== + +# Execute this cell to see all data performance metrics and visualizations +df_analyzed = inspect_and_analyze_data() + +# If you want to run time series decomposition on specific categories: +if df_analyzed is not None and len(df_analyzed) > 0: + print("\n" + "="*80) + print("BONUS: TIME SERIES DECOMPOSITION") + print("="*80) + + # Run decomposition for top 3 categories + top_categories = df_analyzed['category'].value_counts().head(3).index + + for category in top_categories: + print(f"\nAnalyzing {category}...") + decomp_result = analyze_time_series_decomposition(df_analyzed, category) + if decomp_result is not None: + print(f"✓ Decomposition completed for {category}") + else: + print(f"⚠ Could not decompose {category} - insufficient data") + +print("\n🎉 Complete analysis finished! All outputs are displayed above.") + +# %% +# EXECUTION INSTRUCTIONS +# ====================== + +print("Code loaded and ready to run!") +print("\nMain execution cell above will show you:") +print(" • Complete data quality assessment") +print(" • Price analysis by category") +print(" • Promotion and discount patterns") +print(" • Event impact analysis") +print(" • Temporal patterns and seasonality") +print(" • All EDA visualizations") +print(" • Key insights and recommendations") +print("\nAlternative execution options:") +print("1. historical_data = run_complete_analysis()") +print("2. category_data = run_category_based_generation()") +print("3. 
initial_data = generate_initial_discounts()") + +# Enhanced Grocery Price Analysis with Seasonal and Event-Based Modeling +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.17.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 + +# %% +import pandas as pd +import numpy as np +import re +from datetime import datetime, timedelta +import matplotlib.pyplot as plt +import seaborn as sns +import plotly.express as px +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from statsmodels.tsa.seasonal import seasonal_decompose +import warnings +warnings.filterwarnings('ignore') + +# %% +# CONFIGURATION AND DATA STRUCTURES +# ================================= + +# Function to parse weights to grams +def parse_to_grams(s): + """Convert weight strings to grams for standardization""" + if pd.isna(s): + return None + + s = str(s).strip().lower() + match = re.search(r'(\d+\.?\d*)\s*([a-zA-Z]+)', s) + + if not match: + if 'pack' in s or 'each' in s: + return 1 + return None + + num_str = match.group(1) + unit = match.group(2).lower() + + try: + num = float(num_str) + except ValueError: + if 'pack' in s or 'each' in s: + return 1 + return None + + # Unit conversions + if unit in ['g', 'gram', 'grams']: + return num + elif unit in ['kg', 'kilogram', 'kilograms']: + return num * 1000 + elif unit in ['ml', 'milliliter', 'milliliters']: + return num # assume 1g/ml + elif unit in ['l', 'liter', 'liters']: + return num * 1000 + elif unit in ['pack', 'each']: + return num + else: + return None + +# %% +# HOLIDAY AND EVENT DEFINITIONS +# ============================ + +# Major holidays affecting grocery pricing +holidays = { + 'New Year': [ + datetime(2023, 1, 1), datetime(2024, 1, 1), datetime(2025, 1, 1) + ], + 'Australia Day': [ + datetime(2023, 1, 26), datetime(2024, 1, 26), datetime(2025, 1, 26) + ], + 'Easter': [ + datetime(2023, 4, 7), datetime(2023, 4, 10), + datetime(2024, 3, 29), datetime(2024, 4, 1), + datetime(2025, 4, 18), datetime(2025, 4, 21) + ], + 'Anzac Day': [ + datetime(2023, 4, 25), datetime(2024, 4, 25), datetime(2025, 4, 25) + ], + 'Christmas': [ + datetime(2023, 12, 25), datetime(2023, 12, 26), + datetime(2024, 12, 25), datetime(2024, 12, 26), + datetime(2025, 12, 25), datetime(2025, 12, 26) + ], + 'Chinese New Year': [ + datetime(2023, 2, 22), datetime(2024, 2, 10), datetime(2025, 1, 29) + ], + 'Mother Day': [ + datetime(2023, 5, 14), datetime(2024, 5, 12), datetime(2025, 5, 11) + ], + 'Father Day': [ + datetime(2023, 9, 3), datetime(2024, 9, 1), datetime(2025, 9, 7) + ], + 'Labour Day': [ + datetime(2023, 10, 2), datetime(2024, 10, 7), datetime(2025, 10, 6) + ], + 'King Birthday': [ + datetime(2023, 6, 12), datetime(2024, 6, 10), datetime(2025, 6, 9) + ], + 'Melbourne Cup': [ + datetime(2023, 11, 7), datetime(2024, 11, 5), datetime(2025, 11, 4) + ], + 'Diwali': [ + datetime(2023, 11, 12), datetime(2024, 11, 1), datetime(2025, 10, 20) + ], + 'Halloween': [ + datetime(2023, 10, 31), datetime(2024, 10, 31), datetime(2025, 10, 31) + ], + 'Valentine Day': [ + datetime(2023, 2, 14), datetime(2024, 2, 14), datetime(2025, 2, 14) + ], + 'Back to School': [ + datetime(2023, 1, 30), datetime(2023, 7, 24), + datetime(2024, 1, 29), datetime(2024, 7, 22), + datetime(2025, 1, 27), datetime(2025, 7, 21) + ], + 'School Holidays': [ + datetime(2023, 4, 10), datetime(2023, 7, 1), datetime(2023, 9, 25), datetime(2023, 12, 18), + datetime(2024, 3, 28), 
datetime(2024, 7, 6), datetime(2024, 9, 23), datetime(2024, 12, 16), + datetime(2025, 4, 14), datetime(2025, 7, 5), datetime(2025, 9, 22), datetime(2025, 12, 15) + ] +} + +# Supply chain disruption events +supply_chain_events = { + 'Suez Canal': [datetime(2023, 3, 25), datetime(2023, 3, 29)], + 'Shipping Delays': [datetime(2023, 8, 15), datetime(2023, 8, 25)], + 'Port Strikes': [datetime(2024, 2, 10), datetime(2024, 2, 20)], + 'Fuel Price Spike': [datetime(2024, 9, 1), datetime(2024, 9, 15)], + 'Container Shortage': [datetime(2023, 11, 1), datetime(2023, 11, 30)], + 'COVID Lockdown': [datetime(2023, 5, 1), datetime(2023, 5, 14)], + 'Truck Driver Strike': [datetime(2024, 6, 15), datetime(2024, 6, 25)], + 'Factory Fire': [datetime(2024, 11, 10), datetime(2024, 11, 20)] +} + +# Weather events affecting agriculture and supply +weather_events = { + 'Flood Queensland': [datetime(2023, 2, 15), datetime(2023, 3, 15)], + 'Drought NSW': [datetime(2023, 6, 1), datetime(2023, 8, 31)], + 'Cyclone WA': [datetime(2024, 1, 20), datetime(2024, 2, 5)], + 'Heatwave Victoria': [datetime(2024, 12, 15), datetime(2025, 1, 15)], + 'Frost Tasmania': [datetime(2023, 9, 1), datetime(2023, 9, 30)], + 'Bushfire NSW': [datetime(2024, 10, 1), datetime(2024, 10, 31)], + 'Heavy Rain Melbourne': [datetime(2024, 3, 10), datetime(2024, 3, 25)], + 'Extreme Heat Adelaide': [datetime(2025, 2, 1), datetime(2025, 2, 14)] +} + +# Disease/pest outbreaks +disease_events = { + 'Avian Flu': [datetime(2023, 7, 1), datetime(2023, 9, 30)], + 'Foot and Mouth Scare': [datetime(2024, 4, 1), datetime(2024, 4, 30)], + 'White Spot Prawns': [datetime(2023, 10, 15), datetime(2023, 11, 15)], + 'Banana Disease': [datetime(2024, 8, 1), datetime(2024, 9, 15)], + 'Citrus Canker': [datetime(2025, 3, 1), datetime(2025, 4, 15)] +} + +# %% +# PRICING MULTIPLIERS BY CATEGORY +# =============================== + +# FIXED: Realistic category multipliers (reduced by 50-70%) +category_multipliers = { + 'Meat & Seafood': { + 'Christmas': 1.3, 'Easter': 1.2, 'Summer': 1.1, 'Winter': 0.95, + 'Avian Flu': 1.25, 'Foot and Mouth Scare': 1.15, 'White Spot Prawns': 1.2, + 'Drought NSW': 1.1, 'Flood Queensland': 1.08, + 'supply_chain_base': 1.15, 'weather_base': 1.1, 'disease_base': 1.2, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Fruit & Vegetables': { + 'Summer': 0.85, 'Winter': 1.2, 'Christmas': 1.1, 'Chinese New Year': 1.15, + 'Flood Queensland': 1.4, 'Drought NSW': 1.3, 'Cyclone WA': 1.25, + 'Heatwave Victoria': 1.15, 'Frost Tasmania': 1.2, 'Heavy Rain Melbourne': 1.1, + 'Banana Disease': 1.5, 'Citrus Canker': 1.4, + 'supply_chain_base': 1.08, 'weather_base': 1.3, 'disease_base': 1.4, + 'shock_prob': 0.12, 'shock_var': 0.25 + }, + 'Dairy, Eggs & Fridge': { + 'Christmas': 1.15, 'Easter': 1.1, 'Winter': 1.05, 'Back to School': 1.08, + 'Avian Flu': 1.3, 'Drought NSW': 1.15, 'Extreme Heat Adelaide': 1.1, + 'supply_chain_base': 1.1, 'weather_base': 1.08, 'disease_base': 1.25, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Bakery': { + 'Christmas': 1.2, 'Easter': 1.1, 'School Holidays': 1.08, 'Winter': 1.03, + 'supply_chain_base': 1.05, 'weather_base': 1.03, + 'shock_prob': 0.05, 'shock_var': 0.08 + }, + 'Pantry': { + 'COVID Lockdown': 1.25, 'School Holidays': 1.1, 'Back to School': 1.15, + 'Container Shortage': 1.1, 'Shipping Delays': 1.08, + 'supply_chain_base': 1.08, 'weather_base': 1.03, + 'shock_prob': 0.05, 'shock_var': 0.1 + }, + 'Health & Beauty': { + 'New Year': 1.25, 'Valentine Day': 1.1, 'Winter': 1.08, + 'supply_chain_base': 1.03, 'shock_prob': 0.03, 
'shock_var': 0.05 + }, + 'Drinks': { + 'Summer': 1.3, 'Christmas': 1.35, 'Australia Day': 1.15, 'Melbourne Cup': 1.1, + 'Heatwave Victoria': 1.2, 'Extreme Heat Adelaide': 1.18, + 'supply_chain_base': 1.08, 'weather_base': 1.1, + 'shock_prob': 0.05, 'shock_var': 0.1 + }, + 'Frozen': { + 'Summer': 1.5, 'Heatwave Victoria': 1.3, 'Extreme Heat Adelaide': 1.25, + 'Christmas': 1.3, 'School Holidays': 1.2, + 'supply_chain_base': 1.1, 'weather_base': 1.25, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Deli': { + 'Christmas': 1.4, 'Easter': 1.25, 'Melbourne Cup': 1.18, 'King Birthday': 1.1, + 'supply_chain_base': 1.08, 'shock_prob': 0.06, 'shock_var': 0.12 + }, + 'Household': { + 'Back to School': 1.15, 'Spring': 1.1, 'COVID Lockdown': 1.2, + 'supply_chain_base': 1.05, 'shock_prob': 0.04, 'shock_var': 0.08 + } +} + +# %% +# SUBCATEGORY-SPECIFIC SEASONAL EFFECTS +# ==================================== + +# FIXED: Realistic subcategory seasonal effects (reduced) +subcat_seasonal_effects = { + 'Fruit': {'Summer': 0.7, 'Winter': 1.4}, # Reduced from 0.4/2.5 + 'Vegetables (Leafy/Salad)': {'Summer': 1.15, 'Winter': 0.95}, # Reduced from 1.4/0.9 + 'Vegetables (Root/Onion/Garlic)': {'Winter': 0.9, 'Summer': 1.08}, # Reduced from 0.8/1.2 + 'Vegetables (Fruiting)': {'Summer': 0.8, 'Winter': 1.2}, # Reduced from 0.6/1.5 + 'Lamb': {'Easter': 1.4, 'Christmas': 1.25}, # Reduced from 2.0/1.6 + 'Turkey': {'Christmas': 1.8, 'Easter': 1.05}, # Reduced from 3.0/1.2 + 'Fish': {'Christmas': 1.5, 'Easter': 1.3, 'Summer': 1.1}, # Reduced from 2.4/1.8/1.3 + 'Prawns': {'Christmas': 2.0, 'Chinese New Year': 1.6}, # Reduced from 3.5/2.2 + 'Ice Cream': {'Summer': 2.2, 'Heatwave Victoria': 1.8, 'Winter': 0.5}, # Reduced from 4.0/3.0/0.3 + 'Frozen Vegetables': {'Winter': 1.1, 'COVID Lockdown': 1.2} # Reduced from 1.3/1.5 +} + +# %% +# SUBCATEGORY PROMOTION PROBABILITIES +# ================================== + +# Dictionary of subcat apply probabilities +subcat_apply_probs = { + 'Pork': 0.4, 'Beef': 0.4, 'Chicken': 0.4, 'Prawns': 0.35, 'Pantry/Other': 0.15, + 'Lamb': 0.35, 'Mixed Meat': 0.3, 'Salmon': 0.35, 'Fish': 0.35, 'Turkey': 0.3, + 'Tuna': 0.3, 'Kangaroo': 0.3, 'Seafood': 0.35, 'Plant-Based': 0.25, 'Veal': 0.3, + 'Duck': 0.3, 'Trout': 0.35, 'Mussels': 0.35, 'Venison': 0.3, 'Wallaby': 0.3, + 'Crab': 0.35, 'Fruit': 0.25, 'Other Items (F&V Section)': 0.2, + 'Vegetables (Fruiting)': 0.25, 'Vegetables (Root/Onion/Garlic)': 0.2, + 'Vegetables (Stem/Flower/Pod)': 0.2, 'Vegetables (Leafy/Salad)': 0.25, + 'Mushrooms': 0.25, 'Herbs/Sprouts': 0.2, 'Value-Added Produce': 0.2, + 'Nuts/Seeds/Dried Fruit': 0.25, 'Yoghurt Specialty': 0.25, 'Butter Standard': 0.2, + 'Milk Specialty': 0.2, 'Cheese Standard': 0.2, 'Cheese Specialty': 0.25, + 'Butter Specialty': 0.2, 'Yoghurt Standard': 0.2, 'Outsider': 0.15, + 'Cream Standard': 0.2, 'Milk Standard': 0.15, 'Eggs Standard': 0.2, + 'Bread Loaves': 0.15, 'Wraps & Flatbreads': 0.2, 'Cakes & Slices': 0.3, + 'Rolls & Buns': 0.2, 'Savoury Bakery Items': 0.25, 'Sourdough & Artisan Breads': 0.25, + 'Sweet Pastries & Donuts': 0.3, 'Biscuits & Cookies': 0.3, 'Pancakes, Waffles & Crepes': 0.25, + 'Muffins & Cupcakes': 0.3, 'Seafood (Processed/Cooked)': 0.35, 'Bacon': 0.35, + 'Ham': 0.35, 'Platters/Kits': 0.3, 'Chicken (Processed/Cooked)': 0.35, 'Pantry': 0.15, + 'Salami/Pepperoni/Chorizo': 0.25, 'Crackers/Breadsticks': 0.25, 'Antipasto/Olives/Pickles': 0.25, + 'Turkey (Processed/Cooked)': 0.3, 'Beef (Processed/Cooked)': 0.35, 'Frankfurts/Sausages': 0.3, + 'Pork (Processed/Cooked)': 0.35, 'Cheese': 0.2, 
'Bakery': 0.25, 'Confectionery': 0.3, + 'Other Deli': 0.2, 'Prepared Meals': 0.25, 'Dips/Pate': 0.25, 'Snacks (Sweet)': 0.3, + 'Canned Goods': 0.15, 'Meal Kits/Bases/Instant Meals': 0.25, 'Breakfast Cereals': 0.25, + 'Pasta/Rice/Noodles/Grains': 0.15, 'Snacks (Savoury)': 0.3, 'Beverages (Shelf-Stable)': 0.25, + 'Spreads/Oils/Condiments': 0.2, 'Baking Mixes': 0.2, 'Baking Ingredients': 0.2, + 'Other Pantry Items': 0.15, 'Juice/Smoothie': 0.25, 'Functional/Health Drink': 0.25, + 'Other Drinks': 0.2, 'Soft Drink/Mixer': 0.3, 'Water': 0.15, 'Milk': 0.15, + 'Non-Drink Item': 0.15, 'Tea': 0.2, 'Coffee': 0.25, 'Alcoholic Beverages (Low/No Alc)': 0.3, + 'Frozen Chips': 0.3, 'Ice Cream': 0.3, 'Frozen Desserts': 0.3, 'Frozen Meat': 0.3, + 'Frozen Poultry': 0.3, 'Frozen Fruits': 0.25, 'Frozen Vegetables': 0.25, + 'Frozen Pastry': 0.25, 'Frozen Meals': 0.25, 'Frozen Seafood': 0.3, 'Other Frozen': 0.2, + 'Stationery': 0.1, 'Dishwashing': 0.15, 'Bags': 0.15, 'Laundry Care': 0.2, + 'Kitchenware & Food Storage': 0.2, 'Paper Products': 0.15, 'Cleaning Solutions & Wipes': 0.2, + 'Cleaning Tools & Accessories': 0.15, 'Home Maintenance & General': 0.15, + 'Air Care & Pest Control': 0.2, 'Vitamins & Supplements': 0.2, 'Skincare': 0.25, + 'Wash Products': 0.2, 'First Aid & Wellness': 0.15, 'Health & Medicines': 0.15, + 'Feminine & Incontinence Care': 0.15, 'Oral Care': 0.2, 'Deodorants & Antiperspirants': 0.25, + 'Hair Care': 0.2, 'Shaving & Hair Removal': 0.2, 'First Aid & Wellness Accessories': 0.15, + 'Deodorants & Body Sprays': 0.25, 'Medicines & Health Treatments': 0.15, + "Shaving & Men's Grooming": 0.2, +} + +# %% +# CORE PRICING AND ANALYSIS FUNCTIONS +# =================================== + +def get_fortnight_col(date): + """Get fortnight column name for discount lookup""" + month_abbr = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + month_idx = date.month - 1 + month_name = month_abbr[month_idx] + half = '01' if date.day <= 15 else '02' + return f"{month_name}-{half}" + +def get_season(date): + """Get season for Southern Hemisphere""" + month = date.month + if month in [12, 1, 2]: + return 'Summer' + elif month in [3, 4, 5]: + return 'Autumn' + elif month in [6, 7, 8]: + return 'Winter' + else: + return 'Spring' + +def is_event_active(date, event_dates, days_impact=7): + """Check if date is within impact period of an event""" + for event_date in event_dates: + if isinstance(event_date, list): + if len(event_date) == 2: + start_date, end_date = event_date + if start_date <= date <= end_date + timedelta(days=days_impact): + return True + else: + if abs((date - event_date).days) <= days_impact: + return True + return False + +def apply_enhanced_factors(date, category, subcat, base_price, base_discount_pct): + """Apply comprehensive pricing factors including all real-world events - FIXED VERSION""" + multipliers = category_multipliers.get(category, {}) + subcat_effects = subcat_seasonal_effects.get(subcat, {}) + + mult = 1.0 + event_description = [] + + # Seasonal effects + season = get_season(date) + if season in multipliers: + seasonal_mult = multipliers[season] + mult *= seasonal_mult + if seasonal_mult != 1.0: + event_description.append(f"{season} effect: {seasonal_mult:.2f}x") + + # Subcat seasonal effects (override category if more specific) + if season in subcat_effects: + subcat_mult = subcat_effects[season] + mult = mult / multipliers.get(season, 1.0) * subcat_mult + event_description.append(f"{subcat} {season}: {subcat_mult:.2f}x") + + # Holiday effects - 
REDUCED PROBABILITY + for holiday, dates in holidays.items(): + if is_event_active(date, dates, days_impact=14): + # Only apply if random chance (reduce from 100% to 40%) + if np.random.rand() < 0.4: + if holiday in multipliers: + holiday_mult = multipliers[holiday] + mult *= holiday_mult + event_description.append(f"{holiday}: {holiday_mult:.2f}x") + elif holiday in subcat_effects: + holiday_mult = subcat_effects[holiday] + mult *= holiday_mult + event_description.append(f"{subcat} {holiday}: {holiday_mult:.2f}x") + + # Supply chain disruptions - REDUCED PROBABILITY + for event, dates in supply_chain_events.items(): + if is_event_active(date, dates, days_impact=21): + # Only apply if random chance (reduce from 100% to 20%) + if np.random.rand() < 0.2: + supply_mult = multipliers.get('supply_chain_base', 1.08) + if event in multipliers: + supply_mult = multipliers[event] + mult *= supply_mult + event_description.append(f"Supply chain ({event}): {supply_mult:.2f}x") + + # Weather events - CATEGORY-SPECIFIC AND REDUCED PROBABILITY + for event, dates in weather_events.items(): + if is_event_active(date, dates, days_impact=30): + # Only apply to relevant categories and reduce probability + weather_categories = ['Fruit & Vegetables', 'Meat & Seafood', 'Dairy, Eggs & Fridge'] + if category in weather_categories and np.random.rand() < 0.25: + weather_mult = multipliers.get('weather_base', 1.1) + if event in multipliers: + weather_mult = multipliers[event] + mult *= weather_mult + event_description.append(f"Weather ({event}): {weather_mult:.2f}x") + + # Disease/pest events - CATEGORY-SPECIFIC AND REDUCED PROBABILITY + for event, dates in disease_events.items(): + if is_event_active(date, dates, days_impact=60): + # Only apply to relevant categories + disease_categories = { + 'Avian Flu': ['Meat & Seafood', 'Dairy, Eggs & Fridge'], + 'Foot and Mouth Scare': ['Meat & Seafood', 'Dairy, Eggs & Fridge'], + 'White Spot Prawns': ['Meat & Seafood'], + 'Banana Disease': ['Fruit & Vegetables'], + 'Citrus Canker': ['Fruit & Vegetables'] + } + relevant_categories = disease_categories.get(event, []) + if category in relevant_categories and np.random.rand() < 0.3: + disease_mult = multipliers.get('disease_base', 1.2) + if event in multipliers: + disease_mult = multipliers[event] + mult *= disease_mult + event_description.append(f"Disease ({event}): {disease_mult:.2f}x") + + # Random market shocks - REDUCED PROBABILITY + shock_prob = multipliers.get('shock_prob', 0.05) + if np.random.rand() < shock_prob: + shock = np.random.normal(0, multipliers.get('shock_var', 0.1)) + shock_mult = 1 + shock + mult *= shock_mult + if abs(shock) > 0.05: # Only log significant shocks + event_description.append(f"Market shock: {shock_mult:.2f}x") + + # Inflation trend (1.5-3% annually) - REDUCED + current_date = datetime(2025, 8, 19) + years_back = (current_date - date).days / 365.25 + inflation_rate = np.random.uniform(0.015, 0.03) # Reduced from 0.02-0.04 + trend_mult = (1 + inflation_rate) ** years_back + mult *= trend_mult + + # Competitor effects (random promotions) - REDUCED PROBABILITY + if np.random.rand() < 0.05: # Reduced from 0.1 + competitor_effect = np.random.uniform(0.92, 0.98) # Reduced impact + mult *= competitor_effect + event_description.append(f"Competitor pressure: {competitor_effect:.2f}x") + + # CRITICAL FIX: Enforce realistic bounds + mult = np.clip(mult, 0.5, 2.5) # Prevent extreme multipliers + + # Calculate final prices + normal_price = base_price * mult + + # CRITICAL FIX: Ensure minimum price + 
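+    # The 0.3 floor below is a heuristic guard: together with the
+    # np.clip(mult, 0.5, 2.5) applied above, it keeps synthetic shelf prices
+    # within a plausible band around the catalogue base price.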
normal_price = max(normal_price, base_price * 0.3) # Never less than 30% of base + + # Discount adjustments with realistic bounds + if mult > 1.2: + discount_multiplier = 1.1 # Reduced from 1.2 + elif mult < 0.9: + discount_multiplier = 0.9 # Reduced from 0.8 + else: + discount_multiplier = 1.0 + + adjusted_discount = base_discount_pct * discount_multiplier + adjusted_discount = max(0, min(0.7, adjusted_discount)) # Max 70% discount + + return normal_price, adjusted_discount, event_description + +# %% +# HISTORICAL DATA GENERATION +# ========================= + +def generate_initial_discounts(): + """Generate initial discount data (first code block functionality)""" + + # Load the main dataset + df = pd.read_csv('/Users/rajpatel/Desktop/coles_new.csv') + df.columns = df.columns.str.strip() + + # Load the fortnightly discounts dataset + discounts_df = pd.read_csv('fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] # Fortnight cols like Jan-01 + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Set the current date + current_date = datetime(2025, 8, 5) + df['date'] = current_date + + # Parse weights to grams + df['grams'] = df['weights'].apply(parse_to_grams) + + # Compute price per 100g + price_col = 'item_price' + if price_col in df.columns: + df['price_per_100g'] = df.apply( + lambda row: (row[price_col] / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + axis=1 + ) + + # Get current fortnight column + fortnight_col = get_fortnight_col(current_date) + + # Initialize discount columns + df['promo_flag'] = 0 + df['discount_pct'] = 0.0 + df['discounted_price'] = df[price_col] + + # Apply discounts per item + for idx, row in df.iterrows(): + subcat = row['subcat'] + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + base_discount_pct = rule.get(fortnight_col, 0) / 100 + apply_prob = subcat_apply_probs.get(subcat, 0.2) + promo_flag = (base_discount_pct > 0) and (np.random.rand() < apply_prob) + + if promo_flag: + discount_pct = base_discount_pct + np.random.normal(0, 0.05) + discount_pct = np.clip(discount_pct, 0, 0.9) + df.at[idx, 'promo_flag'] = 1 + df.at[idx, 'discount_pct'] = discount_pct + df.at[idx, 'discounted_price'] = row[price_col] * (1 - discount_pct) + + # Save to new CSV + df.to_csv('coles_with_discounts.csv', index=False) + print("Initial discount data generated and saved to 'coles_with_discounts.csv'") + + return df + +def generate_enhanced_historical_data(): + """Main function to generate enhanced historical data""" + + # Load the main dataset + df = pd.read_csv('/Users/rajpatel/Desktop/coles_new.csv') + df.columns = df.columns.str.strip() + + # Load the fortnightly discounts dataset + discounts_df = pd.read_csv('fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] # Fortnight cols like Jan-01 + for 
col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Set the current date + current_date = datetime(2025, 8, 19) + + # Generate historical dates (104 weeks = 2 years) + dates = pd.date_range(end=current_date, periods=104, freq='W') + + # Parse weights and calculate price per 100g + df['grams'] = df['weights'].apply(parse_to_grams) + df['price_per_100g'] = df.apply( + lambda row: (row['item_price'] / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + axis=1 + ) + + print(f"Generating enhanced historical data for {len(df)} products over {len(dates)} weeks...") + + all_historical_data = [] + + for idx, (_, row) in enumerate(df.iterrows()): + if idx % 1000 == 0: + print(f"Processing product {idx+1}/{len(df)}") + + subcat = row['subcat'] + category = row['category'] + base_price = row['item_price'] + + # Get discount rule for this subcat + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + apply_prob = subcat_apply_probs.get(subcat, 0.2) + + for date in dates: + # Get base discount for this fortnight + fortnight = get_fortnight_col(date) + base_discount_pct = float(rule.get(fortnight, 0)) / 100 + + # Apply enhanced factors + normal_price, discount_pct, events = apply_enhanced_factors( + date, category, subcat, base_price, base_discount_pct + ) + + # Determine if promotion is active + promo_flag = (base_discount_pct > 0) and (np.random.rand() < apply_prob) + if promo_flag: + discount_pct = discount_pct + np.random.normal(0, 0.05) + discount_pct = np.clip(discount_pct, 0, 0.9) + final_discount = discount_pct + else: + final_discount = 0 + + discounted_price = normal_price * (1 - final_discount) + + # Store the record + record = { + 'date': date, + 'product_code': row['product_code'], + 'category': category, + 'essential_flag': row['essential_flag'], + 'item_name': row['item_name'], + 'subcat': subcat, + 'weights': row['weights'], + 'unit_price': row.get('unit_price', 0), + 'brand_name': row['brand_name'], + 'grams': row['grams'], + 'price_per_100g': (discounted_price / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + 'normal_price': round(normal_price, 2), + 'promo_flag': int(promo_flag), + 'discount_pct': round(final_discount, 4), + 'discounted_price': round(discounted_price, 2), + 'price_multiplier': round(normal_price / base_price, 3), + 'events_active': '; '.join(events) if events else 'None' + } + + all_historical_data.append(record) + + # Convert to DataFrame + historical_df = pd.DataFrame(all_historical_data) + + print(f"Generated {len(historical_df):,} historical records") + return historical_df + +# %% +# SIMPLIFIED CATEGORY-BASED GENERATION (ALTERNATIVE APPROACH) +# ========================================================== + +def generate_synthetic_for_category(category_df, dates): + """Generate synthetic data for a specific category""" + synthetic_dfs = [] + + for _, row in category_df.iterrows(): + subcat = row['subcat'] + category = row['category'] + base_price = row['item_price'] + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + apply_prob = subcat_apply_probs.get(subcat, 0.2) + + item_df = pd.DataFrame({'date': dates}) + item_df['product_code'] = row['product_code'] + 
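+        # Static catalogue attributes are repeated on every weekly row; only the
+        # price, promo and discount columns vary over time.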
item_df['item_name'] = row['item_name'] + item_df['brand_name'] = row['brand_name'] + item_df['weights'] = row['weights'] + item_df['grams'] = row['grams'] + item_df['price_per_100g'] = row['price_per_100g'] + item_df['subcat'] = subcat + item_df['category'] = category + + item_df['normal_price'] = np.nan + item_df['discount_pct'] = 0.0 + item_df['promo_flag'] = 0 + item_df['discounted_price'] = np.nan + + for i, date in enumerate(dates): + fortnight = get_fortnight_col(date) + base_discount_pct = float(rule.get(fortnight, 0)) / 100 + normal_price, discount_pct = apply_enhanced_factors(date, category, subcat, base_price, base_discount_pct)[:2] + + promo_flag = (discount_pct > 0) and (np.random.rand() < apply_prob) + item_df.at[i, 'normal_price'] = normal_price + item_df.at[i, 'promo_flag'] = 1 if promo_flag else 0 + item_df.at[i, 'discount_pct'] = discount_pct if promo_flag else 0 + item_df.at[i, 'discounted_price'] = normal_price * (1 - item_df.at[i, 'discount_pct']) + + synthetic_dfs.append(item_df) + return pd.concat(synthetic_dfs, ignore_index=True) + +# %% +# EXPLORATORY DATA ANALYSIS FUNCTIONS +# =================================== + +def create_essential_eda(df): + """Create 5 essential EDA plots""" + + print("Creating Essential EDA Plots...") + + # Setup modern style + sns.set_style("whitegrid") + plt.rcParams['figure.figsize'] = (12, 6) + plt.rcParams['font.size'] = 12 + + # 1. Time Series Analysis - Average Prices by Category + monthly_avg = df.groupby([df['date'].dt.to_period('M'), 'category'])['discounted_price'].mean().reset_index() + monthly_avg['date'] = monthly_avg['date'].dt.to_timestamp() + + fig1 = px.line( + monthly_avg, + x='date', + y='discounted_price', + color='category', + title='1. Average Monthly Prices by Category Over Time', + labels={'discounted_price': 'Average Price (AUD)', 'date': 'Date'} + ) + fig1.update_layout(height=600, hovermode='x unified') + fig1.show() + + # 2. Event Impact Analysis + event_impact = df.copy() + event_impact['has_events'] = event_impact['events_active'] != 'None' + comparison = event_impact.groupby(['category', 'has_events'])['price_multiplier'].mean().reset_index() + comparison['event_status'] = comparison['has_events'].map({True: 'With Events', False: 'Normal'}) + + fig2 = px.bar( + comparison, + x='category', + y='price_multiplier', + color='event_status', + title='2. Price Impact: Normal vs Event Periods', + labels={'price_multiplier': 'Average Price Multiplier'}, + barmode='group' + ) + fig2.update_layout(height=600) + fig2.update_xaxes(tickangle=45) + fig2.show() + + # 3. Seasonal Patterns Heatmap + df['month'] = df['date'].dt.month + seasonal_data = df.groupby(['category', 'month'])['discounted_price'].mean().reset_index() + seasonal_pivot = seasonal_data.pivot(index='category', columns='month', values='discounted_price') + + fig3 = go.Figure(data=go.Heatmap( + z=seasonal_pivot.values, + x=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], + y=seasonal_pivot.index, + colorscale='RdYlBu_r', + text=np.round(seasonal_pivot.values, 2), + texttemplate="%{text}", + textfont={"size": 10}, + hoverongaps=False + )) + fig3.update_layout( + title='3. Seasonal Price Patterns by Category', + height=600, + xaxis_title="Month", + yaxis_title="Category" + ) + fig3.show() + + # 4. 
Price Volatility by Category + volatility = df.groupby(['category', df['date'].dt.to_period('M')])['discounted_price'].agg(['mean', 'std']).reset_index() + volatility['date'] = volatility['date'].dt.to_timestamp() + volatility['cv'] = volatility['std'] / volatility['mean'] * 100 # Coefficient of variation + + fig4 = px.line( + volatility, + x='date', + y='cv', + color='category', + title='4. Price Volatility (Coefficient of Variation) by Category', + labels={'cv': 'Coefficient of Variation (%)', 'date': 'Date'} + ) + fig4.update_layout(height=600) + fig4.show() + + # 5. Distribution Overview + fig5 = make_subplots( + rows=2, cols=2, + subplot_titles=('Price Distribution', 'Discount Distribution', 'Promotion Rate by Category', 'Price Multiplier Distribution'), + specs=[[{'type': 'histogram'}, {'type': 'histogram'}], + [{'type': 'bar'}, {'type': 'histogram'}]] + ) + + # Price distribution + fig5.add_trace( + go.Histogram(x=df['discounted_price'], nbinsx=50, name='Price Distribution'), + row=1, col=1 + ) + + # Discount distribution (only when discount > 0) + discount_data = df[df['discount_pct'] > 0]['discount_pct'] * 100 + fig5.add_trace( + go.Histogram(x=discount_data, nbinsx=30, name='Discount Distribution'), + row=1, col=2 + ) + + # Promotion rates by category + promo_rates = df.groupby('category')['promo_flag'].mean() * 100 + fig5.add_trace( + go.Bar(x=promo_rates.index, y=promo_rates.values, name='Promotion Rate %'), + row=2, col=1 + ) + + # Price multiplier distribution + fig5.add_trace( + go.Histogram(x=df['price_multiplier'], nbinsx=50, name='Price Multiplier'), + row=2, col=2 + ) + + fig5.update_layout( + title="5. Data Distribution Overview", + height=800, + showlegend=False + ) + fig5.update_xaxes(tickangle=45, row=2, col=1) + fig5.show() + + # Summary Statistics + print("\nSUMMARY STATISTICS:") + print("="*50) + + for category in df['category'].unique(): + cat_data = df[df['category'] == category] + print(f"\n{category}:") + print(f" • Records: {len(cat_data):,}") + print(f" • Avg Price: ${cat_data['discounted_price'].mean():.2f}") + print(f" • Price Range: ${cat_data['discounted_price'].min():.2f} - ${cat_data['discounted_price'].max():.2f}") + print(f" • Promotion Rate: {cat_data['promo_flag'].mean()*100:.1f}%") + print(f" • Avg Discount: {cat_data['discount_pct'].mean()*100:.1f}%") + print(f" • Avg Price Multiplier: {cat_data['price_multiplier'].mean():.2f}x") + +def analyze_time_series_decomposition(df, category='Meat & Seafood'): + """Perform time series decomposition for a specific category""" + + print(f"\nTime Series Decomposition Analysis for {category}") + + # Aggregate data by date for the category + ts_data = df[df['category'] == category].groupby('date')['discounted_price'].mean() + ts_data = ts_data.asfreq('W', method='ffill') + + if len(ts_data) >= 52: # Need at least 1 year + try: + decomposition = seasonal_decompose(ts_data, model='multiplicative', period=52) + + fig = make_subplots( + rows=4, cols=1, + subplot_titles=('Original', 'Trend', 'Seasonal', 'Residual'), + vertical_spacing=0.08 + ) + + fig.add_trace(go.Scatter(x=ts_data.index, y=ts_data.values, + mode='lines', name='Original'), row=1, col=1) + fig.add_trace(go.Scatter(x=decomposition.trend.index, y=decomposition.trend.values, + mode='lines', name='Trend'), row=2, col=1) + fig.add_trace(go.Scatter(x=decomposition.seasonal.index, y=decomposition.seasonal.values, + mode='lines', name='Seasonal'), row=3, col=1) + fig.add_trace(go.Scatter(x=decomposition.resid.index, y=decomposition.resid.values, + 
mode='lines', name='Residual'), row=4, col=1) + + fig.update_layout( + title=f'Time Series Decomposition - {category}', + height=1000, + showlegend=False + ) + fig.show() + + print(f"Decomposition completed for {category}") + return decomposition + + except Exception as e: + print(f"Decomposition failed: {e}") + return None + else: + print(f"Insufficient data for decomposition ({len(ts_data)} weeks)") + return None + +# %% +# MAIN EXECUTION FUNCTIONS +# ======================== + +def run_complete_analysis(): + """Run the complete enhanced historical data generation and analysis""" + + print("Starting Enhanced Historical Data Generation") + print("="*60) + + # Generate enhanced historical data + historical_df = generate_enhanced_historical_data() + + # Save to CSV + output_file = 'enhanced_historical_data.csv' + historical_df.to_csv(output_file, index=False) + print(f"Saved to {output_file}") + + # Create essential EDA plots + print("\nCreating Essential EDA Plots...") + create_essential_eda(historical_df) + + # Event analysis summary + event_records = historical_df[historical_df['events_active'] != 'None'] + if len(event_records) > 0: + print(f"\nEvent Impact Summary:") + print(f" • {len(event_records):,} records affected by events ({len(event_records)/len(historical_df)*100:.1f}%)") + + # Count events + all_events = [] + for events in event_records['events_active']: + all_events.extend([e.split(':')[0].strip() for e in events.split(';')]) + + from collections import Counter + event_counts = Counter(all_events) + print("\nTop 10 Most Frequent Events:") + for event, count in event_counts.most_common(10): + print(f" • {event}: {count:,} occurrences") + + print(f"\nAnalysis Complete!") + print(f"Generated {len(historical_df):,} records for ARIMA/LSTM modeling") + print(f"Date range: {historical_df['date'].min().date()} to {historical_df['date'].max().date()}") + + return historical_df + +def run_category_based_generation(): + """Alternative approach: Generate data by category""" + + # Load data + df = pd.read_csv('coles_with_discounts.csv') + df.columns = df.columns.str.strip() + + # Load discounts dict + discounts_df = pd.read_csv('/Users/rajpatel/Desktop/fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + global discounts_dict + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Current date and historical dates + current_date = datetime(2025, 8, 19) + dates = pd.date_range(end=current_date, periods=104, freq='W') + + # Unique categories from dataset + unique_categories = df['category'].unique() + + # Process categories + all_category_data = [] + for cat in unique_categories: + cat_df = df[df['category'] == cat] + if not cat_df.empty: + synth_cat = generate_synthetic_for_category(cat_df, dates) + synth_cat.to_csv(f'synthetic_{cat.replace(" ", "_").replace("&", "and")}.csv', index=False) + all_category_data.append(synth_cat) + print(f"Generated for {cat}: {len(synth_cat)} rows") + + # Combine all categories + full_synth = pd.concat(all_category_data, ignore_index=True) + 
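+    # Sanity-check sketch (assumption: product_code uniquely identifies an item):
+    # every product should contribute exactly len(dates) weekly rows to the
+    # combined panel, so a mismatch usually means duplicated or dropped rows
+    # during the per-category generation above.
+    rows_per_product = full_synth.groupby('product_code').size()
+    n_uneven = int((rows_per_product != len(dates)).sum())
+    if n_uneven:
+        print(f"⚠ {n_uneven} products do not have {len(dates)} weekly rows")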
full_synth.to_csv('synthetic_historical_data_v6.csv', index=False) + + print(f"Complete dataset saved: {len(full_synth)} records") + return full_synth + +# %% +# DATA INSPECTION AND EDA OUTPUT CELL +# =================================== + +def inspect_and_analyze_data(): + """ + Complete data inspection and EDA output to see how the data is performing + This function will show you everything about your dataset + """ + + print("=" * 80) + print("GROCERY PRICE ANALYSIS - COMPLETE DATA INSPECTION") + print("=" * 80) + + try: + # Try to load existing enhanced data first + print("\n1. LOADING DATA...") + try: + df = pd.read_csv('enhanced_historical_data.csv') + df['date'] = pd.to_datetime(df['date']) + print("✓ Loaded existing enhanced historical data") + data_source = "Enhanced Historical Data" + except FileNotFoundError: + try: + df = pd.read_csv('synthetic_historical_data_v6.csv') + df['date'] = pd.to_datetime(df['date']) + print("✓ Loaded existing synthetic data") + data_source = "Synthetic Historical Data" + except FileNotFoundError: + print("⚠ No existing data found. Generating new data...") + df = run_complete_analysis() + data_source = "Newly Generated Data" + + print(f"Data Source: {data_source}") + print(f"Data Shape: {df.shape[0]:,} rows × {df.shape[1]} columns") + + # 2. DATA QUALITY ASSESSMENT + print("\n" + "=" * 80) + print("2. DATA QUALITY ASSESSMENT") + print("=" * 80) + + print(f"\nDATE RANGE:") + print(f" • Start Date: {df['date'].min().strftime('%Y-%m-%d')}") + print(f" • End Date: {df['date'].max().strftime('%Y-%m-%d')}") + print(f" • Total Weeks: {df['date'].nunique()}") + print(f" • Date Coverage: {(df['date'].max() - df['date'].min()).days} days") + + print(f"\nMISSING VALUES:") + missing = df.isnull().sum() + missing_pct = (missing / len(df) * 100).round(2) + for col in df.columns: + if missing[col] > 0: + print(f" • {col}: {missing[col]:,} ({missing_pct[col]}%)") + + print(f"\nDATA TYPES:") + for col, dtype in df.dtypes.items(): + unique_count = df[col].nunique() + print(f" • {col}: {dtype} ({unique_count:,} unique values)") + + print(f"\nCATEGORY BREAKDOWN:") + category_counts = df['category'].value_counts() + for cat, count in category_counts.items(): + pct = (count / len(df) * 100) + print(f" • {cat}: {count:,} records ({pct:.1f}%)") + + # 3. PRICE ANALYSIS + print("\n" + "=" * 80) + print("3. PRICE ANALYSIS") + print("=" * 80) + + print(f"\nOVERALL PRICE STATISTICS:") + price_stats = df['discounted_price'].describe() + print(f" • Mean Price: ${price_stats['mean']:.2f}") + print(f" • Median Price: ${price_stats['50%']:.2f}") + print(f" • Price Range: ${price_stats['min']:.2f} - ${price_stats['max']:.2f}") + print(f" • Standard Deviation: ${price_stats['std']:.2f}") + print(f" • 25th Percentile: ${price_stats['25%']:.2f}") + print(f" • 75th Percentile: ${price_stats['75%']:.2f}") + + print(f"\nPRICE BY CATEGORY:") + for category in df['category'].unique(): + cat_data = df[df['category'] == category]['discounted_price'] + print(f" • {category}:") + print(f" - Mean: ${cat_data.mean():.2f}") + print(f" - Median: ${cat_data.median():.2f}") + print(f" - Range: ${cat_data.min():.2f} - ${cat_data.max():.2f}") + + # 4. PROMOTION AND DISCOUNT ANALYSIS + print("\n" + "=" * 80) + print("4. 
PROMOTION AND DISCOUNT ANALYSIS") + print("=" * 80) + + overall_promo_rate = df['promo_flag'].mean() * 100 + print(f"\nOVERALL PROMOTION RATE: {overall_promo_rate:.1f}%") + + promoted_items = df[df['promo_flag'] == 1] + if len(promoted_items) > 0: + avg_discount = promoted_items['discount_pct'].mean() * 100 + print(f"AVERAGE DISCOUNT (when promoted): {avg_discount:.1f}%") + + print(f"\nPROMOTION RATES BY CATEGORY:") + for category in df['category'].unique(): + cat_promo_rate = df[df['category'] == category]['promo_flag'].mean() * 100 + cat_avg_discount = df[(df['category'] == category) & (df['promo_flag'] == 1)]['discount_pct'].mean() * 100 + print(f" • {category}: {cat_promo_rate:.1f}% promo rate, {cat_avg_discount:.1f}% avg discount") + + # 5. EVENT IMPACT ANALYSIS + print("\n" + "=" * 80) + print("5. EVENT IMPACT ANALYSIS") + print("=" * 80) + + if 'events_active' in df.columns: + event_records = df[df['events_active'] != 'None'] + event_impact_rate = len(event_records) / len(df) * 100 + print(f"\nEVENT IMPACT RATE: {event_impact_rate:.1f}% of records affected by events") + + if len(event_records) > 0: + # Event frequency analysis + all_events = [] + for events in event_records['events_active']: + if pd.notna(events) and events != 'None': + all_events.extend([e.split(':')[0].strip() for e in str(events).split(';')]) + + if all_events: + from collections import Counter + event_counts = Counter(all_events) + print(f"\nTOP 10 MOST FREQUENT EVENTS:") + for event, count in event_counts.most_common(10): + pct = (count / len(df) * 100) + print(f" • {event}: {count:,} occurrences ({pct:.2f}%)") + + # Price impact of events + normal_prices = df[df['events_active'] == 'None']['discounted_price'].mean() + event_prices = event_records['discounted_price'].mean() + price_impact = ((event_prices - normal_prices) / normal_prices * 100) + print(f"\nPRICE IMPACT OF EVENTS:") + print(f" • Normal periods avg price: ${normal_prices:.2f}") + print(f" • Event periods avg price: ${event_prices:.2f}") + print(f" • Price increase during events: {price_impact:.1f}%") + + # 6. TEMPORAL PATTERNS + print("\n" + "=" * 80) + print("6. TEMPORAL PATTERNS") + print("=" * 80) + + df['month'] = df['date'].dt.month + df['year'] = df['date'].dt.year + df['quarter'] = df['date'].dt.quarter + + print(f"\nMONTHLY PRICE PATTERNS:") + monthly_prices = df.groupby('month')['discounted_price'].mean() + month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', + 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + for month, price in monthly_prices.items(): + print(f" • {month_names[month-1]}: ${price:.2f}") + + print(f"\nQUARTERLY TRENDS:") + quarterly_prices = df.groupby('quarter')['discounted_price'].mean() + for quarter, price in quarterly_prices.items(): + print(f" • Q{quarter}: ${price:.2f}") + + # 7. CREATE ALL VISUALIZATIONS + print("\n" + "=" * 80) + print("7. CREATING VISUALIZATIONS") + print("=" * 80) + + create_essential_eda(df) + + # 8. ADDITIONAL INSIGHTS + print("\n" + "=" * 80) + print("8. 
KEY INSIGHTS AND RECOMMENDATIONS") + print("=" * 80) + + # Price volatility analysis + price_volatility = df.groupby('category')['discounted_price'].std().sort_values(ascending=False) + print(f"\nMOST VOLATILE CATEGORIES (by price std dev):") + for category, volatility in price_volatility.head().items(): + print(f" • {category}: ${volatility:.2f} std dev") + + # Best promotion opportunities + low_promo_categories = df.groupby('category')['promo_flag'].mean().sort_values().head() + print(f"\nCATEGORIES WITH LOWEST PROMOTION RATES (opportunities):") + for category, rate in low_promo_categories.items(): + print(f" • {category}: {rate*100:.1f}% promotion rate") + + # Seasonal opportunities + summer_winter_diff = df[df['month'].isin([12, 1, 2])]['discounted_price'].mean() - df[df['month'].isin([6, 7, 8])]['discounted_price'].mean() + print(f"\nSEASONAL PRICE DIFFERENCE:") + print(f" • Summer vs Winter avg price difference: ${summer_winter_diff:.2f}") + + print("\n" + "=" * 80) + print("9. DATA QUALITY SUMMARY") + print("=" * 80) + + # Data completeness score + completeness = (1 - df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100 + print(f"\nDATA COMPLETENESS: {completeness:.1f}%") + + # Data distribution health + price_outliers = len(df[df['discounted_price'] > df['discounted_price'].quantile(0.99)]) + print(f"PRICE OUTLIERS (>99th percentile): {price_outliers:,} records ({price_outliers/len(df)*100:.2f}%)") + + # Time series regularity + date_gaps = df['date'].drop_duplicates().sort_values().diff().dropna() + regular_intervals = (date_gaps == pd.Timedelta(days=7)).mean() * 100 + print(f"TIME SERIES REGULARITY: {regular_intervals:.1f}% regular weekly intervals") + + print(f"\n" + "=" * 80) + print("ANALYSIS COMPLETE!") + print("=" * 80) + print(f"Dataset is ready for ARIMA/LSTM modeling") + print(f"Key features available: prices, promotions, events, seasonality") + print(f"Recommended next steps:") + print(f" 1. Use price_multiplier and events_active as external regressors") + print(f" 2. Consider category-specific models due to different volatility patterns") + print(f" 3. Leverage seasonal decomposition for feature engineering") + + return df + + except Exception as e: + print(f" Error during analysis: {str(e)}") + import traceback + traceback.print_exc() + return None + +# %% +# RUN COMPLETE INSPECTION AND ANALYSIS +# ==================================== + +# Execute this cell to see all data performance metrics and visualizations +df_analyzed = inspect_and_analyze_data() + +# If you want to run time series decomposition on specific categories: +if df_analyzed is not None and len(df_analyzed) > 0: + print("\n" + "="*80) + print("BONUS: TIME SERIES DECOMPOSITION") + print("="*80) + + # Run decomposition for top 3 categories + top_categories = df_analyzed['category'].value_counts().head(3).index + + for category in top_categories: + print(f"\nAnalyzing {category}...") + decomp_result = analyze_time_series_decomposition(df_analyzed, category) + if decomp_result is not None: + print(f"✓ Decomposition completed for {category}") + else: + print(f"⚠ Could not decompose {category} - insufficient data") + +print("\n🎉 Complete analysis finished! 
All outputs are displayed above.") + +# %% +# FEATURE ENGINEERING MODULE +# ========================== + +def create_lag_features(df, price_col='discounted_price', product_id_col='product_code'): + """Create lag features for time series modeling""" + + print("Creating lag features...") + df = df.sort_values([product_id_col, 'date']).copy() + + # Create lag features per product + lag_features = ['price_lag_1', 'price_lag_7', 'price_lag_52'] + lag_periods = [1, 7, 52] # 1 week, 7 weeks, 52 weeks (yearly) + + for feature, lag in zip(lag_features, lag_periods): + df[feature] = df.groupby(product_id_col)[price_col].shift(lag) + + # Calculate price changes + df['price_change_1w'] = df[price_col] - df['price_lag_1'] + df['price_change_7w'] = df[price_col] - df['price_lag_7'] + df['price_change_52w'] = df[price_col] - df['price_lag_52'] + + # Calculate percentage changes + df['price_pct_change_1w'] = (df['price_change_1w'] / df['price_lag_1']).fillna(0) + df['price_pct_change_7w'] = (df['price_change_7w'] / df['price_lag_7']).fillna(0) + df['price_pct_change_52w'] = (df['price_change_52w'] / df['price_lag_52']).fillna(0) + + print(f"✓ Created {len(lag_features)} lag features and 6 change features") + return df + +def create_moving_averages(df, price_col='discounted_price', product_id_col='product_code'): + """Create moving average features""" + + print("Creating moving average features...") + df = df.sort_values([product_id_col, 'date']).copy() + + # Define moving average windows + windows = [7, 30, 90] # 7, 30, 90 days (converted to weeks: ~1, 4, 13 weeks) + week_windows = [1, 4, 13] + + for window, week_window in zip(windows, week_windows): + col_name = f'ma_{window}d' + df[col_name] = df.groupby(product_id_col)[price_col].rolling( + window=week_window, min_periods=1 + ).mean().reset_index(0, drop=True) + + # Calculate deviation from moving average + df[f'price_dev_ma_{window}d'] = df[price_col] - df[col_name] + df[f'price_dev_pct_ma_{window}d'] = (df[f'price_dev_ma_{window}d'] / df[col_name]).fillna(0) + + print(f"✓ Created {len(windows)} moving averages and {len(windows)*2} deviation features") + return df + +def create_event_indicators(df, events_col='events_active'): + """Create binary indicators from events_active column""" + + print("Creating event indicator features...") + + # Initialize event indicator columns + event_indicators = { + 'has_seasonal_event': 0, + 'has_holiday_event': 0, + 'has_weather_event': 0, + 'has_supply_chain_event': 0, + 'has_disease_event': 0, + 'has_market_shock': 0, + 'has_competitor_pressure': 0, + 'event_count': 0 + } + + # Add columns + for col in event_indicators.keys(): + df[col] = 0 + + # Process each row + for idx, row in df.iterrows(): + events = str(row[events_col]).lower() + event_count = 0 + + if events != 'none' and events != 'nan': + event_list = [e.strip() for e in events.split(';')] + event_count = len(event_list) + + for event in event_list: + if any(season in event for season in ['summer', 'winter', 'spring', 'autumn']): + df.at[idx, 'has_seasonal_event'] = 1 + elif any(holiday in event for holiday in ['christmas', 'easter', 'new year', 'valentine', 'mother', 'father']): + df.at[idx, 'has_holiday_event'] = 1 + elif 'weather' in event: + df.at[idx, 'has_weather_event'] = 1 + elif 'supply chain' in event: + df.at[idx, 'has_supply_chain_event'] = 1 + elif 'disease' in event: + df.at[idx, 'has_disease_event'] = 1 + elif 'market shock' in event: + df.at[idx, 'has_market_shock'] = 1 + elif 'competitor' in event: + df.at[idx, 'has_competitor_pressure'] 
= 1 + + df.at[idx, 'event_count'] = event_count + + print(f"✓ Created {len(event_indicators)} event indicator features") + return df + +def create_calendar_features(df, date_col='date'): + """Create calendar-based features""" + + print("Creating calendar features...") + + # Basic calendar features + df['year'] = df[date_col].dt.year + df['quarter'] = df[date_col].dt.quarter + df['month'] = df[date_col].dt.month + df['week_of_year'] = df[date_col].dt.isocalendar().week + df['day_of_year'] = df[date_col].dt.dayofyear + df['is_month_start'] = df[date_col].dt.is_month_start.astype(int) + df['is_month_end'] = df[date_col].dt.is_month_end.astype(int) + df['is_quarter_start'] = df[date_col].dt.is_quarter_start.astype(int) + df['is_quarter_end'] = df[date_col].dt.is_quarter_end.astype(int) + + # Season indicators (Southern Hemisphere) + def get_season_indicator(month): + if month in [12, 1, 2]: + return 'summer' + elif month in [3, 4, 5]: + return 'autumn' + elif month in [6, 7, 8]: + return 'winter' + else: + return 'spring' + + df['season'] = df['month'].apply(get_season_indicator) + + # Create season dummies + for season in ['summer', 'autumn', 'winter', 'spring']: + df[f'is_{season}'] = (df['season'] == season).astype(int) + + # Holiday indicators + def is_holiday_period(date): + # Check if date is within 2 weeks of major holidays + month, day = date.month, date.day + + # Christmas/New Year period + if (month == 12 and day >= 15) or (month == 1 and day <= 15): + return 1 + # Easter period (approximate - around April) + elif month == 4 and 1 <= day <= 30: + return 1 + # School holiday periods (approximate) + elif (month == 7 and 1 <= day <= 31) or (month == 1 and 16 <= day <= 31): + return 1 + return 0 + + df['is_holiday_period'] = df[date_col].apply(is_holiday_period) + + # School holiday flags + def is_school_holiday(date): + month, day = date.month, date.day + # Summer holidays: Dec 15 - Jan 31 + if (month == 12 and day >= 15) or (month == 1): + return 1 + # Winter holidays: July + elif month == 7: + return 1 + # Spring holidays: late September/early October + elif month == 9 and day >= 20: + return 1 + elif month == 10 and day <= 10: + return 1 + # Autumn holidays: April + elif month == 4: + return 1 + return 0 + + df['school_holiday_flag'] = df[date_col].apply(is_school_holiday) + + # Days to next major holiday + def days_to_holiday(date): + month, day = date.month, date.day + current_date = date.replace(hour=0, minute=0, second=0, microsecond=0) + + # Define major holidays for current year + year = date.year + holidays = [ + datetime(year, 1, 1), # New Year + datetime(year, 4, 15), # Easter (approximate) + datetime(year, 7, 1), # School holidays + datetime(year, 12, 25), # Christmas + ] + + # Find next holiday + future_holidays = [h for h in holidays if h >= current_date] + if not future_holidays: + # If no holidays left this year, add next year's New Year + future_holidays = [datetime(year + 1, 1, 1)] + + next_holiday = min(future_holidays) + return (next_holiday - current_date).days + + df['days_to_holiday'] = df[date_col].apply(days_to_holiday) + + # Cyclical encoding for month and week + df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12) + df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12) + df['week_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52) + df['week_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52) + + print("✓ Created 25+ calendar-based features") + return df + +def create_volatility_measures(df, price_col='discounted_price', product_id_col='product_code'): + 
"""Calculate price volatility measures per category and product""" + + print("Creating volatility measures...") + + # Sort data + df = df.sort_values([product_id_col, 'date']).copy() + + # Rolling volatility (standard deviation) over different windows + windows = [4, 13, 26] # 1 month, 3 months, 6 months (in weeks) + + for window in windows: + # Rolling standard deviation + df[f'volatility_{window}w'] = df.groupby(product_id_col)[price_col].rolling( + window=window, min_periods=2 + ).std().reset_index(0, drop=True) + + # Coefficient of variation (volatility relative to mean) + rolling_mean = df.groupby(product_id_col)[price_col].rolling( + window=window, min_periods=2 + ).mean().reset_index(0, drop=True) + + df[f'cv_{window}w'] = (df[f'volatility_{window}w'] / rolling_mean).fillna(0) + + # Category-level volatility measures + category_volatility = df.groupby(['category', 'date'])[price_col].agg(['mean', 'std']).reset_index() + category_volatility['category_cv'] = category_volatility['std'] / category_volatility['mean'] + category_volatility = category_volatility[['category', 'date', 'category_cv']].fillna(0) + + # Merge back to main dataframe + df = df.merge(category_volatility, on=['category', 'date'], how='left') + + # Price dispersion within category-date + df['price_rank_in_category'] = df.groupby(['category', 'date'])[price_col].rank(pct=True) + + print(f"✓ Created {len(windows)*2 + 2} volatility and ranking features") + return df + +def engineer_all_features(df): + """Main function to engineer all features""" + + print("=" * 60) + print("FEATURE ENGINEERING PIPELINE") + print("=" * 60) + + print(f"Starting with {df.shape[0]:,} records and {df.shape[1]} features") + + # Apply all feature engineering steps + df = create_lag_features(df) + df = create_moving_averages(df) + df = create_event_indicators(df) + df = create_calendar_features(df) + df = create_volatility_measures(df) + + print("=" * 60) + print(f"Feature engineering complete!") + print(f"Final shape: {df.shape[0]:,} records and {df.shape[1]} features") + print(f"Added {df.shape[1] - 18} new features") # Original had 18 columns + + # Show feature summary + new_features = [col for col in df.columns if col not in [ + 'date', 'product_code', 'category', 'essential_flag', 'item_name', + 'subcat', 'weights', 'unit_price', 'brand_name', 'grams', + 'price_per_100g', 'normal_price', 'promo_flag', 'discount_pct', + 'discounted_price', 'price_multiplier', 'events_active', 'month' + ]] + + print(f"\nNew features created:") + feature_categories = { + 'Lag Features': [f for f in new_features if 'lag' in f or 'change' in f], + 'Moving Averages': [f for f in new_features if 'ma_' in f or 'dev' in f], + 'Event Indicators': [f for f in new_features if 'has_' in f or 'event_count' in f], + 'Calendar Features': [f for f in new_features if any(x in f for x in ['year', 'quarter', 'week', 'day', 'season', 'holiday', 'sin', 'cos'])], + 'Volatility Measures': [f for f in new_features if 'volatility' in f or 'cv_' in f or 'rank' in f] + } + + for category, features in feature_categories.items(): + print(f" • {category}: {len(features)} features") + + # Check for missing values in new features + missing_summary = df[new_features].isnull().sum() + missing_features = missing_summary[missing_summary > 0] + + if len(missing_features) > 0: + print(f"\nMissing values in new features:") + for feature, count in missing_features.items(): + print(f" • {feature}: {count:,} ({count/len(df)*100:.1f}%)") + else: + print(f"\n✓ No missing values in engineered 
features") + + return df + +# %% +# FEATURE ENGINEERING EXECUTION CELL +# ================================== + +def run_feature_engineering(): + """Execute feature engineering on the enhanced dataset""" + + try: + # Load the enhanced dataset + print("Loading enhanced historical data...") + df = pd.read_csv('enhanced_historical_data.csv') + df['date'] = pd.to_datetime(df['date']) + + print(f"Loaded {len(df):,} records from enhanced_historical_data.csv") + + # Run feature engineering + df_engineered = engineer_all_features(df) + + # Save the engineered dataset + output_file = 'grocery_data_with_features.csv' + df_engineered.to_csv(output_file, index=False) + + print(f"\n✓ Feature-engineered dataset saved to: {output_file}") + + # Quick validation + print(f"\nFeature Engineering Validation:") + print(f" • Original features: 18") + print(f" • New features: {df_engineered.shape[1] - 18}") + print(f" • Total features: {df_engineered.shape[1]}") + print(f" • Data completeness: {(1 - df_engineered.isnull().sum().sum() / (len(df_engineered) * len(df_engineered.columns))) * 100:.1f}%") + + return df_engineered + + except FileNotFoundError: + print("❌ enhanced_historical_data.csv not found.") + print("Please run the data generation first using: df = run_complete_analysis()") + return None + except Exception as e: + print(f"❌ Error during feature engineering: {str(e)}") + import traceback + traceback.print_exc() + return None + +print("Feature engineering module loaded!") +print("Execute: df_with_features = run_feature_engineering()") + +df_with_features = run_feature_engineering() + +# %% +# ADVANCED MODEL ARCHITECTURE MODULE +# ================================== + +import numpy as np +import pandas as pd +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error +from sklearn.model_selection import TimeSeriesSplit +import warnings +warnings.filterwarnings('ignore') + +# Time series modeling imports +try: + from statsmodels.tsa.arima.model import ARIMA + from statsmodels.tsa.statespace.sarimax import SARIMAX + from statsmodels.tsa.vector_ar.var_model import VAR + from statsmodels.stats.diagnostic import acorr_ljungbox + from statsmodels.tsa.stattools import adfuller + HAS_STATSMODELS = True +except ImportError: + print("⚠ statsmodels not available. Install with: pip install statsmodels") + HAS_STATSMODELS = False + +# Deep learning imports +try: + import tensorflow as tf + from tensorflow.keras.models import Sequential + from tensorflow.keras.layers import LSTM, Dense, Dropout, Input + from tensorflow.keras.optimizers import Adam + from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau + HAS_TENSORFLOW = True +except ImportError: + print("⚠ TensorFlow not available. 
Install with: pip install tensorflow") + HAS_TENSORFLOW = False + +# %% +# DATA PREPARATION FOR MODELING +# ============================= + +class ModelDataPreparator: + """Prepare data for different model types""" + + def __init__(self, df): + self.df = df.copy() + self.scalers = {} + + def prepare_arimax_data(self, category, min_weeks=52): + """Prepare data for ARIMAX modeling""" + + # Filter category and products with sufficient history + cat_data = self.df[self.df['category'] == category].copy() + + # FIX 3: Filter Training Data - Only use complete records + cat_data = cat_data[cat_data['price_lag_52'].notna()] # Ensure 52-week history + + if len(cat_data) == 0: + print(f" No data available for {category} with 52-week history") + return None + + # Aggregate to category level (weekly averages) + weekly_data = cat_data.groupby('date').agg({ + 'discounted_price': 'mean', + 'promo_flag': 'mean', + 'has_holiday_event': 'max', + 'has_seasonal_event': 'max', + 'has_weather_event': 'max', + 'has_supply_chain_event': 'max', + 'school_holiday_flag': 'max', + 'month_sin': 'first', + 'month_cos': 'first', + 'event_count': 'mean' + }).reset_index().sort_values('date') + + return weekly_data + + def prepare_lstm_data(self, category, sequence_length=12, test_size=0.2): + """Prepare data for LSTM modeling with sequences""" + + cat_data = self.df[self.df['category'] == category].copy() + + # FIX 3: Filter Training Data - Only use complete records + cat_data = cat_data[cat_data['price_lag_52'].notna()] + + if len(cat_data) == 0: + return None, None, None, None + + # Select features for LSTM + feature_cols = [ + 'discounted_price', 'price_lag_1', 'price_lag_7', 'ma_7d', 'ma_30d', + 'promo_flag', 'has_holiday_event', 'has_seasonal_event', + 'month_sin', 'month_cos', 'volatility_4w' + ] + + # Aggregate to weekly category level + weekly_data = cat_data.groupby('date')[feature_cols].mean().reset_index() + weekly_data = weekly_data.sort_values('date') + + # Scale features + scaler = MinMaxScaler() + scaled_features = scaler.fit_transform(weekly_data[feature_cols]) + self.scalers[f'{category}_lstm'] = scaler + + # Create sequences + X, y = [], [] + for i in range(sequence_length, len(scaled_features)): + X.append(scaled_features[i-sequence_length:i]) # Past sequence_length weeks + y.append(scaled_features[i, 0]) # Current price (first column) + + X, y = np.array(X), np.array(y) + + # Train/test split (chronological) + split_idx = int(len(X) * (1 - test_size)) + X_train, X_test = X[:split_idx], X[split_idx:] + y_train, y_test = y[:split_idx], y[split_idx:] + + return X_train, X_test, y_train, y_test + + def prepare_var_data(self, related_categories, min_weeks=52): + """Prepare data for Vector Autoregression (related categories)""" + + # Filter data for related categories + related_data = self.df[self.df['category'].isin(related_categories)].copy() + related_data = related_data[related_data['price_lag_52'].notna()] + + # Create wide format (categories as columns) + pivot_data = related_data.groupby(['date', 'category'])['discounted_price'].mean().unstack() + pivot_data = pivot_data.dropna() # Remove rows with missing categories + + if len(pivot_data) < min_weeks: + print(f" Insufficient data for VAR model: {len(pivot_data)} weeks") + return None + + return pivot_data +# %% +# ARIMAX MODEL IMPLEMENTATION +# =========================== + +class ARIMAXModel: + """ARIMAX model with external regressors""" + + def __init__(self, order=(1,1,1), seasonal_order=(1,1,1,52)): + self.order = order + self.seasonal_order = 
seasonal_order + self.model = None + self.fitted_model = None + + def fit(self, data, target_col='discounted_price', exog_cols=None): + """Fit ARIMAX model""" + + if not HAS_STATSMODELS: + print(" statsmodels required for ARIMAX") + return False + + try: + y = data[target_col] + exog = data[exog_cols] if exog_cols else None + + # FIX 2: Address Stationarity - Apply differencing if needed + adf_result = adfuller(y.dropna()) + print(f"Stationarity test p-value: {adf_result[1]:.4f}") + + if adf_result[1] > 0.05: + print("⚠ Series not stationary, applying first differencing") + y_diff = y.diff().dropna() + + # Adjust exogenous variables to match differenced series + if exog is not None: + exog = exog.iloc[1:] # Remove first row to match differenced series + + # Update order to (1,0,1) since we manually differenced + model_order = (self.order[0], 0, self.order[2]) + y_model = y_diff + else: + print("✓ Series is stationary") + model_order = self.order + y_model = y + + # Fit SARIMAX model + self.model = SARIMAX( + y_model, + exog=exog, + order=model_order, + seasonal_order=self.seasonal_order, + enforce_stationarity=False, + enforce_invertibility=False + ) + + self.fitted_model = self.model.fit(disp=False, maxiter=200) + + print(f"✓ ARIMAX model fitted successfully") + return True + + except Exception as e: + print(f" Error fitting ARIMAX: {str(e)}") + return False + + def forecast(self, steps, exog=None): + """Generate forecasts""" + if self.fitted_model is None: + print(" Model not fitted") + return None + + try: + forecast = self.fitted_model.forecast(steps=steps, exog=exog) + conf_int = self.fitted_model.get_forecast(steps=steps, exog=exog).conf_int() + + return { + 'forecast': forecast, + 'conf_int': conf_int, + 'model_summary': self.fitted_model.summary() + } + except Exception as e: + print(f" Error forecasting: {str(e)}") + return None + + def get_diagnostics(self): + """Get model diagnostics""" + if self.fitted_model is None: + return None + + residuals = self.fitted_model.resid + + # Ljung-Box test for autocorrelation in residuals + lb_test = acorr_ljungbox(residuals, lags=10, return_df=True) + + return { + 'aic': self.fitted_model.aic, + 'bic': self.fitted_model.bic, + 'ljung_box': lb_test, + 'residuals': residuals + } + +# %% +# LSTM MODEL IMPLEMENTATION +# ======================== + +class LSTMModel: + """LSTM model for non-linear time series patterns""" + + def __init__(self, lstm_units=50, dropout=0.2, dense_units=25): + self.lstm_units = lstm_units + self.dropout = dropout + self.dense_units = dense_units + self.model = None + self.history = None + + def build_model(self, input_shape): + """Build LSTM architecture""" + + if not HAS_TENSORFLOW: + print(" TensorFlow required for LSTM") + return False + + self.model = Sequential([ + Input(shape=input_shape), + LSTM(self.lstm_units, return_sequences=True, dropout=self.dropout), + LSTM(self.lstm_units//2, dropout=self.dropout), + Dense(self.dense_units, activation='relu'), + Dropout(self.dropout), + Dense(1) + ]) + + self.model.compile( + optimizer=Adam(learning_rate=0.001), + loss='mse', + metrics=['mae'] + ) + + return True + + def fit(self, X_train, y_train, X_val=None, y_val=None, epochs=100, batch_size=32): + """Train LSTM model""" + + if self.model is None: + print(" Model not built") + return False + + # Callbacks + callbacks = [ + EarlyStopping(patience=15, restore_best_weights=True), + ReduceLROnPlateau(patience=8, factor=0.5, min_lr=1e-7) + ] + + # Validation data + validation_data = (X_val, y_val) if X_val is not None else 
None + + try: + self.history = self.model.fit( + X_train, y_train, + validation_data=validation_data, + epochs=epochs, + batch_size=batch_size, + callbacks=callbacks, + verbose=0 + ) + + print(f"✓ LSTM model trained for {len(self.history.history['loss'])} epochs") + return True + + except Exception as e: + print(f" Error training LSTM: {str(e)}") + return False + + def predict(self, X): + """Generate predictions""" + if self.model is None: + print(" Model not trained") + return None + + return self.model.predict(X, verbose=0) + + def get_training_history(self): + """Get training history""" + if self.history is None: + return None + + return { + 'loss': self.history.history['loss'], + 'val_loss': self.history.history.get('val_loss', []), + 'mae': self.history.history['mae'], + 'val_mae': self.history.history.get('val_mae', []) + } + +# %% +# ENSEMBLE MODEL IMPLEMENTATION +# ============================= + +class EnsembleModel: + """Ensemble combining ARIMA and LSTM predictions""" + + def __init__(self, arimax_weight=0.6, lstm_weight=0.4): + self.arimax_weight = arimax_weight + self.lstm_weight = lstm_weight + self.arimax_model = None + self.lstm_model = None + + def add_models(self, arimax_model, lstm_model): + """Add component models""" + self.arimax_model = arimax_model + self.lstm_model = lstm_model + + def predict(self, arimax_data, lstm_data, steps=1): + """Generate ensemble predictions""" + + if self.arimax_model is None or self.lstm_model is None: + print(" Component models not provided") + return None + + try: + # ARIMAX predictions + arimax_forecast = self.arimax_model.forecast(steps=steps, exog=arimax_data) + arimax_pred = arimax_forecast['forecast'] if isinstance(arimax_forecast, dict) else arimax_forecast + + # LSTM predictions + lstm_pred = self.lstm_model.predict(lstm_data) + + # Combine predictions + ensemble_pred = (self.arimax_weight * arimax_pred + + self.lstm_weight * lstm_pred.flatten()) + + return { + 'ensemble': ensemble_pred, + 'arimax': arimax_pred, + 'lstm': lstm_pred.flatten(), + 'weights': {'arimax': self.arimax_weight, 'lstm': self.lstm_weight} + } + + except Exception as e: + print(f" Error in ensemble prediction: {str(e)}") + return None + +# %% +# VAR MODEL IMPLEMENTATION +# ======================= + +class VARModel: + """Vector Autoregression for related product categories""" + + def __init__(self, maxlags=12): + self.maxlags = maxlags + self.model = None + self.fitted_model = None + + def fit(self, data): + """Fit VAR model""" + + if not HAS_STATSMODELS: + print(" statsmodels required for VAR") + return False + + try: + # Select optimal lag order + var_model = VAR(data) + lag_order = var_model.select_order(maxlags=self.maxlags) + optimal_lags = lag_order.aic # Use AIC for selection + + print(f"Optimal lag order: {optimal_lags}") + + # Fit VAR model + self.fitted_model = var_model.fit(optimal_lags) + + print(f"✓ VAR model fitted with {optimal_lags} lags") + return True + + except Exception as e: + print(f" Error fitting VAR: {str(e)}") + return False + + def forecast(self, steps): + """Generate VAR forecasts""" + if self.fitted_model is None: + print(" Model not fitted") + return None + + try: + forecast = self.fitted_model.forecast( + self.fitted_model.y, + steps=steps + ) + + return { + 'forecast': forecast, + 'model_summary': self.fitted_model.summary() + } + + except Exception as e: + print(f" Error forecasting: {str(e)}") + return None + +# %% +# MODEL EVALUATION FRAMEWORK +# ========================= + +class ModelEvaluator: + """Comprehensive model 
evaluation""" + + @staticmethod + def calculate_metrics(y_true, y_pred): + """Calculate evaluation metrics""" + + # Remove any NaN values + mask = ~(np.isnan(y_true) | np.isnan(y_pred)) + y_true_clean = y_true[mask] + y_pred_clean = y_pred[mask] + + if len(y_true_clean) == 0: + return None + + return { + 'MAE': mean_absolute_error(y_true_clean, y_pred_clean), + 'RMSE': np.sqrt(mean_squared_error(y_true_clean, y_pred_clean)), + 'MAPE': mean_absolute_percentage_error(y_true_clean, y_pred_clean) * 100, + 'R²': 1 - (np.sum((y_true_clean - y_pred_clean)**2) / + np.sum((y_true_clean - np.mean(y_true_clean))**2)) + } + + @staticmethod + def compare_models(results_dict): + """Compare multiple models""" + + comparison = pd.DataFrame({ + model: metrics for model, metrics in results_dict.items() + }).T + + # Rank models (lower is better for MAE, RMSE, MAPE; higher for R²) + comparison['MAE_rank'] = comparison['MAE'].rank() + comparison['RMSE_rank'] = comparison['RMSE'].rank() + comparison['MAPE_rank'] = comparison['MAPE'].rank() + comparison['R²_rank'] = comparison['R²'].rank(ascending=False) + + comparison['avg_rank'] = (comparison['MAE_rank'] + comparison['RMSE_rank'] + + comparison['MAPE_rank'] + comparison['R²_rank']) / 4 + + return comparison.sort_values('avg_rank') + +print("Advanced modeling architecture loaded!") +print("Dependencies available:") +print(f" • statsmodels (ARIMAX/VAR): {HAS_STATSMODELS}") +print(f" • tensorflow (LSTM): {HAS_TENSORFLOW}") +print("\nNext: Load your feature-engineered data and run model pipeline") + +# %% +# COMPLETE MODEL PIPELINE EXECUTION +# ================================= + +def run_advanced_model_pipeline(target_categories=None, forecast_horizon=12): + """ + Execute complete advanced modeling pipeline + + Parameters: + - target_categories: List of categories to model (default: top 3 stable categories) + - forecast_horizon: Number of weeks to forecast ahead + """ + + print("=" * 80) + print("ADVANCED MODEL PIPELINE EXECUTION") + print("=" * 80) + + try: + # Load feature-engineered data + print("Loading feature-engineered dataset...") + df = pd.read_csv('grocery_data_with_features.csv') + df['date'] = pd.to_datetime(df['date']) + + print(f"Loaded {len(df):,} records with {df.shape[1]} features") + + # Select target categories (stable ones for initial modeling) + if target_categories is None: + # Choose stable categories with good data coverage + category_stability = df.groupby('category').agg({ + 'discounted_price': ['count', 'std'], + 'price_multiplier': 'mean' + }).round(3) + + category_stability.columns = ['record_count', 'price_std', 'avg_multiplier'] + category_stability['stability_score'] = ( + category_stability['record_count'] / category_stability['price_std'] + ) + + # Select top 3 most stable categories + target_categories = category_stability.nlargest(3, 'stability_score').index.tolist() + + print(f"Target categories for modeling: {target_categories}") + + # Initialize data preparator + preparator = ModelDataPreparator(df) + + # Store all results + all_results = {} + + # Process each category + for category in target_categories: + print(f"\n" + "="*60) + print(f"MODELING CATEGORY: {category}") + print("="*60) + + category_results = model_single_category( + preparator, category, forecast_horizon + ) + + if category_results: + all_results[category] = category_results + print(f"✓ {category} modeling completed successfully") + else: + print(f" {category} modeling failed") + + # Cross-category VAR modeling + print(f"\n" + "="*60) + print("VECTOR 
AUTOREGRESSION (VAR) MODELING") + print("="*60) + + var_results = run_var_modeling(preparator, target_categories) + if var_results: + all_results['VAR_cross_category'] = var_results + + # Generate comprehensive report + print(f"\n" + "="*60) + print("MODEL PERFORMANCE SUMMARY") + print("="*60) + + generate_model_report(all_results) + + print(f"\n" + "="*80) + print("PIPELINE EXECUTION COMPLETE") + print("="*80) + + return all_results + + except Exception as e: + print(f" Pipeline execution failed: {str(e)}") + import traceback + traceback.print_exc() + return None + +def model_single_category(preparator, category, forecast_horizon): + """Model a single category with all approaches""" + + results = {} + + # 1. ARIMAX Modeling + print(f"\n1. ARIMAX MODELING FOR {category}") + print("-" * 40) + + arimax_data = preparator.prepare_arimax_data(category) + if arimax_data is not None and len(arimax_data) >= 52: + + # Define external regressors + exog_cols = [ + 'promo_flag', 'has_holiday_event', 'has_seasonal_event', + 'school_holiday_flag', 'month_sin', 'month_cos' + ] + + # Train/test split (80/20) + split_idx = int(len(arimax_data) * 0.8) + train_data = arimax_data.iloc[:split_idx] + test_data = arimax_data.iloc[split_idx:] + + # Fit ARIMAX model + arimax_model = ARIMAXModel(order=(1,1,1), seasonal_order=(1,1,1,52)) + + if arimax_model.fit(train_data, 'discounted_price', exog_cols): + + # Generate forecasts + test_exog = test_data[exog_cols] if len(test_data) > 0 else None + forecast_result = arimax_model.forecast(len(test_data), test_exog) + + if forecast_result: + # Evaluate performance + if len(test_data) > 0: + y_true = test_data['discounted_price'].values + y_pred = forecast_result['forecast'].values + + arimax_metrics = ModelEvaluator.calculate_metrics(y_true, y_pred) + + results['ARIMAX'] = { + 'model': arimax_model, + 'metrics': arimax_metrics, + 'forecast': forecast_result, + 'diagnostics': arimax_model.get_diagnostics() + } + + print(f"ARIMAX Metrics: MAE={arimax_metrics['MAE']:.3f}, " + f"RMSE={arimax_metrics['RMSE']:.3f}, " + f"MAPE={arimax_metrics['MAPE']:.1f}%") + else: + print(f" Insufficient data for ARIMAX modeling") + + # 2. LSTM Modeling + print(f"\n2. 
LSTM MODELING FOR {category}") + print("-" * 40) + + X_train, X_test, y_train, y_test = preparator.prepare_lstm_data(category) + + if X_train is not None and len(X_train) > 20: + + # Build and train LSTM + lstm_model = LSTMModel(lstm_units=50, dropout=0.2) + + if lstm_model.build_model(X_train.shape[1:]): + + # Split training data for validation + val_split = int(len(X_train) * 0.8) + X_train_split = X_train[:val_split] + X_val_split = X_train[val_split:] + y_train_split = y_train[:val_split] + y_val_split = y_train[val_split:] + + if lstm_model.fit(X_train_split, y_train_split, X_val_split, y_val_split, epochs=50): + + # Generate predictions + if len(X_test) > 0: + y_pred_scaled = lstm_model.predict(X_test) + + # FIX 1: Fix Data Scaling - Inverse transform predictions + scaler = preparator.scalers[f'{category}_lstm'] + + # Create dummy array for inverse transform (scaler expects all features) + dummy_array = np.zeros((len(y_pred_scaled), scaler.n_features_in_)) + dummy_array[:, 0] = y_pred_scaled.flatten() # Price is first feature + + # Inverse transform to get actual price scale + y_pred_actual = scaler.inverse_transform(dummy_array)[:, 0] + + # Also inverse transform actual values for fair comparison + dummy_array_actual = np.zeros((len(y_test), scaler.n_features_in_)) + dummy_array_actual[:, 0] = y_test + y_test_actual = scaler.inverse_transform(dummy_array_actual)[:, 0] + + lstm_metrics = ModelEvaluator.calculate_metrics(y_test_actual, y_pred_actual) + + results['LSTM'] = { + 'model': lstm_model, + 'metrics': lstm_metrics, + 'training_history': lstm_model.get_training_history(), + 'scaler': scaler # Store scaler for future use + } + + print(f"LSTM Metrics: MAE={lstm_metrics['MAE']:.3f}, " + f"RMSE={lstm_metrics['RMSE']:.3f}, " + f"MAPE={lstm_metrics['MAPE']:.1f}%") + else: + print(f" Insufficient data for LSTM modeling") + + # 3. Ensemble Modeling + if 'ARIMAX' in results and 'LSTM' in results: + print(f"\n3. 
ENSEMBLE MODELING FOR {category}") + print("-" * 40) + + # Create ensemble + ensemble = EnsembleModel(arimax_weight=0.6, lstm_weight=0.4) + ensemble.add_models(results['ARIMAX']['model'], results['LSTM']['model']) + + # Note: Ensemble prediction would require aligned data + # This is a simplified implementation + arimax_pred = results['ARIMAX']['forecast']['forecast'].values + lstm_pred = results['LSTM']['metrics'] # Placeholder + + print("✓ Ensemble model created (weights: 60% ARIMAX, 40% LSTM)") + results['Ensemble'] = {'model': ensemble, 'weights': {'ARIMAX': 0.6, 'LSTM': 0.4}} + + return results + +def run_var_modeling(preparator, categories): + """Run Vector Autoregression on related categories""" + + # Group related categories + related_groups = [ + ['Meat & Seafood', 'Frozen'], # Protein products + ['Dairy, Eggs & Fridge', 'Bakery'], # Fresh products + ['Pantry', 'Household'] # Shelf-stable products + ] + + var_results = {} + + for group in related_groups: + available_categories = [cat for cat in group if cat in categories] + + if len(available_categories) >= 2: + print(f"VAR modeling for: {available_categories}") + + var_data = preparator.prepare_var_data(available_categories) + + if var_data is not None and len(var_data) >= 52: + + # Train/test split + split_idx = int(len(var_data) * 0.8) + train_data = var_data.iloc[:split_idx] + test_data = var_data.iloc[split_idx:] + + # Fit VAR model + var_model = VARModel(maxlags=8) + + if var_model.fit(train_data): + + # Generate forecasts + forecast_result = var_model.forecast(len(test_data)) + + if forecast_result: + var_results[f"VAR_{'_'.join(available_categories)}"] = { + 'model': var_model, + 'forecast': forecast_result, + 'categories': available_categories + } + + print(f"✓ VAR model fitted for {available_categories}") + + return var_results if var_results else None + +def generate_model_report(all_results): + """Generate comprehensive model performance report""" + + if not all_results: + print("No results to report") + return + + print("MODEL PERFORMANCE COMPARISON:") + print("=" * 50) + + # Collect metrics for comparison + metrics_comparison = {} + + for category, models in all_results.items(): + if category.startswith('VAR'): + continue # Skip VAR for metrics comparison + + print(f"\nCategory: {category}") + print("-" * 30) + + for model_name, model_data in models.items(): + if 'metrics' in model_data and model_data['metrics']: + metrics = model_data['metrics'] + metrics_comparison[f"{category}_{model_name}"] = metrics + + print(f"{model_name:>8}: MAE={metrics['MAE']:.3f}, " + f"RMSE={metrics['RMSE']:.3f}, " + f"MAPE={metrics['MAPE']:.1f}%, " + f"R²={metrics['R²']:.3f}") + + # Overall comparison + if metrics_comparison: + print(f"\nOVERALL BEST PERFORMING MODELS:") + print("-" * 40) + + comparison_df = ModelEvaluator.compare_models(metrics_comparison) + + print("Top 5 models by average rank:") + for idx, (model, row) in enumerate(comparison_df.head().iterrows()): + print(f"{idx+1}. 
{model}: avg_rank={row['avg_rank']:.2f}") + + # Save results summary + results_summary = { + 'categories_modeled': len([k for k in all_results.keys() if not k.startswith('VAR')]), + 'models_trained': sum(len(v) for k, v in all_results.items() if not k.startswith('VAR')), + 'metrics_comparison': metrics_comparison + } + + print(f"\nSUMMARY:") + print(f" • Categories modeled: {results_summary['categories_modeled']}") + print(f" • Total models trained: {results_summary['models_trained']}") + print(f" • VAR models: {len([k for k in all_results.keys() if k.startswith('VAR')])}") + +# %% +# EXECUTION CELL - RUN COMPLETE PIPELINE +# ====================================== + +print("Advanced modeling pipeline ready!") +print("\nTo execute the complete pipeline:") +print("results = run_advanced_model_pipeline()") +print("\nOr specify custom categories:") +print("results = run_advanced_model_pipeline(['Pantry', 'Bakery', 'Dairy, Eggs & Fridge'])") +print("\nThis will run:") +print(" • ARIMAX models with external regressors") +print(" • LSTM networks for non-linear patterns") +print(" • Ensemble combinations") +print(" • VAR models for cross-category relationships") +print(" • Comprehensive evaluation and comparison") + +results = run_advanced_model_pipeline() + +# Notebook-Based Forecasting Pipeline with Direct Output +# ===================================================== + +import numpy as np +import pandas as pd +import warnings +from datetime import datetime, timedelta +from sklearn.preprocessing import MinMaxScaler +from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error +import matplotlib.pyplot as plt +import seaborn as sns +from typing import Dict, List, Tuple, Optional +warnings.filterwarnings('ignore') + +# Deep learning imports +try: + import tensorflow as tf + from tensorflow.keras.models import Sequential + from tensorflow.keras.layers import LSTM, Dense, Dropout, Input + from tensorflow.keras.optimizers import Adam + from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau + HAS_TENSORFLOW = True +except ImportError: + print("⚠ TensorFlow not available. Install with: pip install tensorflow") + HAS_TENSORFLOW = False + +# Set plotting style +plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'default') +sns.set_palette("husl") + +# ======================================================== +# 1. 
WALK-FORWARD VALIDATION FRAMEWORK (NOTEBOOK VERSION) +# ======================================================== + +class NotebookWalkForwardValidator: + """Walk-forward validator with direct notebook output""" + + def __init__(self, min_train_size=52, test_size=4, step_size=4, sequence_length=12): + self.min_train_size = min_train_size + self.test_size = test_size + self.step_size = step_size + self.sequence_length = sequence_length + + def create_validation_splits(self, data, date_col='date'): + """Create walk-forward validation time splits""" + data = data.sort_values(date_col).reset_index(drop=True) + splits = [] + + total_weeks = len(data) + current_train_end = self.min_train_size + + while current_train_end + self.test_size <= total_weeks: + train_indices = list(range(0, current_train_end)) + test_indices = list(range(current_train_end, current_train_end + self.test_size)) + + splits.append({ + 'train_idx': train_indices, + 'test_idx': test_indices, + 'train_end_date': data.iloc[current_train_end - 1][date_col], + 'test_start_date': data.iloc[current_train_end][date_col], + 'test_end_date': data.iloc[current_train_end + self.test_size - 1][date_col] + }) + + current_train_end += self.step_size + + return splits + + def validate_lstm_model(self, data, category): + """Run walk-forward validation for LSTM models with notebook output""" + + print(f"WALK-FORWARD VALIDATION: {category}") + print("=" * 60) + + # Filter and prepare data + cat_data = data[data['category'] == category].copy() + cat_data = cat_data[cat_data['price_lag_52'].notna()] + + if len(cat_data) < self.min_train_size + self.test_size: + print(f"Insufficient data for {category}: {len(cat_data)} records") + print(f"Need at least {self.min_train_size + self.test_size} records for validation") + return None + + # Feature selection + feature_cols = [ + 'discounted_price', 'price_lag_1', 'price_lag_7', 'ma_7d', 'ma_30d', + 'promo_flag', 'has_holiday_event', 'has_seasonal_event', + 'month_sin', 'month_cos', 'volatility_4w' + ] + + # Aggregate to weekly level + weekly_data = cat_data.groupby('date')[feature_cols].mean().reset_index() + weekly_data = weekly_data.sort_values('date') + + print(f"Data Summary:") + print(f" • Total weeks: {len(weekly_data)}") + print(f" • Date range: {weekly_data['date'].min().strftime('%Y-%m-%d')} to {weekly_data['date'].max().strftime('%Y-%m-%d')}") + + # Create validation splits + splits = self.create_validation_splits(weekly_data) + + if len(splits) == 0: + print(f"Cannot create validation splits for {category}") + return None + + print(f" • Validation splits: {len(splits)}") + + # Run validation + validation_results = [] + all_predictions = [] + all_actuals = [] + all_dates = [] + + successful_splits = 0 + + for i, split in enumerate(splits): + print(f"\nSplit {i+1}/{len(splits)}: {split['test_start_date'].strftime('%Y-%m-%d')} to {split['test_end_date'].strftime('%Y-%m-%d')}") + + # Prepare data + train_data = weekly_data.iloc[split['train_idx']] + test_data = weekly_data.iloc[split['test_idx']] + + # Check minimum training data requirement + if len(train_data) < self.sequence_length + 10: + print(f" Insufficient training data: {len(train_data)} weeks") + continue + + # Scale data + scaler = MinMaxScaler() + train_scaled = scaler.fit_transform(train_data[feature_cols]) + test_scaled = scaler.transform(test_data[feature_cols]) + + # Create sequences + X_train, y_train = self._create_sequences(train_scaled, self.sequence_length) + X_test, y_test = self._create_sequences(test_scaled, 
self.sequence_length) + + if len(X_train) < 10: + print(f" Insufficient training sequences: {len(X_train)}") + continue + + if len(X_test) == 0: + print(f" No test sequences available") + continue + + # Build and train LSTM + model = self._build_lstm_model(X_train.shape[1:]) + + # Train model + try: + history = model.fit( + X_train, y_train, + epochs=30, + batch_size=min(16, len(X_train)//2), + validation_split=0.2 if len(X_train) > 5 else 0, + callbacks=[ + EarlyStopping(patience=5, restore_best_weights=True), + ReduceLROnPlateau(patience=3, factor=0.5) + ], + verbose=0 + ) + except Exception as e: + print(f" Training failed: {str(e)}") + continue + + # Generate predictions + try: + y_pred_scaled = model.predict(X_test, verbose=0) + + # Inverse transform + dummy_array = np.zeros((len(y_pred_scaled), len(feature_cols))) + dummy_array[:, 0] = y_pred_scaled.flatten() + y_pred_actual = scaler.inverse_transform(dummy_array)[:, 0] + + dummy_array_actual = np.zeros((len(y_test), len(feature_cols))) + dummy_array_actual[:, 0] = y_test + y_test_actual = scaler.inverse_transform(dummy_array_actual)[:, 0] + + # Calculate metrics + metrics = self._calculate_metrics(y_test_actual, y_pred_actual) + market_conditions = self._identify_market_conditions(test_data) + + # Store results + validation_results.append({ + 'split_id': i, + 'test_period': f"{split['test_start_date'].strftime('%Y-%m-%d')} to {split['test_end_date'].strftime('%Y-%m-%d')}", + 'metrics': metrics, + 'market_conditions': market_conditions + }) + + # Store for plotting + test_dates = test_data['date'].iloc[self.sequence_length:].values + if len(test_dates) == len(y_pred_actual): + all_predictions.extend(y_pred_actual) + all_actuals.extend(y_test_actual) + all_dates.extend(test_dates) + + successful_splits += 1 + print(f" MAE: {metrics['MAE']:.3f} | RMSE: {metrics['RMSE']:.3f} | MAPE: {metrics['MAPE']:.1f}%") + print(f" Conditions: {', '.join(market_conditions)}") + + except Exception as e: + print(f" Prediction failed: {str(e)}") + continue + + if successful_splits == 0: + print(f"\nNo successful validation splits for {category}") + return None + + print(f"\nSuccessful validation splits: {successful_splits}/{len(splits)}") + + # Display summary results + self._display_validation_summary(validation_results, category) + + # Create validation plots if we have data + if len(all_predictions) > 0 and len(all_actuals) > 0: + self._create_validation_plots(all_dates, all_actuals, all_predictions, category) + else: + print(f" Insufficient data for validation plots") + + return validation_results + + def _create_sequences(self, data, sequence_length): + """Create sequences for LSTM training""" + X, y = [], [] + for i in range(sequence_length, len(data)): + X.append(data[i-sequence_length:i]) + y.append(data[i, 0]) + return np.array(X), np.array(y) + + def _build_lstm_model(self, input_shape): + """Build LSTM model""" + model = Sequential([ + Input(shape=input_shape), + LSTM(50, return_sequences=True, dropout=0.2), + LSTM(25, dropout=0.2), + Dense(16, activation='relu'), + Dropout(0.2), + Dense(1) + ]) + + model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae']) + return model + + def _calculate_metrics(self, y_true, y_pred): + """Calculate standard forecasting metrics""" + return { + 'MAE': mean_absolute_error(y_true, y_pred), + 'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)), + 'MAPE': mean_absolute_percentage_error(y_true, y_pred) * 100, + 'R²': 1 - (np.sum((y_true - y_pred)**2) / np.sum((y_true - np.mean(y_true))**2)) 
+ } + + def _identify_market_conditions(self, test_data): + """Identify market conditions during test period""" + conditions = [] + + if 'promo_flag' in test_data.columns: + promo_rate = test_data['promo_flag'].mean() + if promo_rate > 0.3: + conditions.append('high_promo') + elif promo_rate > 0.1: + conditions.append('medium_promo') + else: + conditions.append('low_promo') + + if 'has_holiday_event' in test_data.columns and test_data['has_holiday_event'].any(): + conditions.append('holiday') + + if 'season' in test_data.columns: + season = test_data['season'].mode()[0] if len(test_data['season'].mode()) > 0 else 'unknown' + conditions.append(season) + + return conditions + + def _display_validation_summary(self, results, category): + """Display validation summary statistics""" + if not results: + return + + print(f"\n VALIDATION SUMMARY: {category}") + print("=" * 50) + + mae_scores = [r['metrics']['MAE'] for r in results] + mape_scores = [r['metrics']['MAPE'] for r in results] + rmse_scores = [r['metrics']['RMSE'] for r in results] + + print(f" Performance Metrics:") + print(f" • Number of validation periods: {len(results)}") + print(f" • Average MAE: {np.mean(mae_scores):.3f} ± {np.std(mae_scores):.3f}") + print(f" • Average MAPE: {np.mean(mape_scores):.1f}% ± {np.std(mape_scores):.1f}%") + print(f" • Average RMSE: {np.mean(rmse_scores):.3f} ± {np.std(rmse_scores):.3f}") + print(f" • Best MAE: {min(mae_scores):.3f}") + print(f" • Worst MAE: {max(mae_scores):.3f}") + + # Performance by market conditions + condition_performance = {} + for result in results: + for condition in result['market_conditions']: + if condition not in condition_performance: + condition_performance[condition] = [] + condition_performance[condition].append(result['metrics']['MAE']) + + print(f"\n Performance by Market Conditions:") + for condition, maes in condition_performance.items(): + print(f" • {condition}: {np.mean(maes):.3f} MAE ({len(maes)} periods)") + + def _create_validation_plots(self, dates, actuals, predictions, category): + """Create validation visualization plots""" + + # Check if we have enough data for plotting + if len(dates) == 0 or len(actuals) == 0 or len(predictions) == 0: + print(f" ⚠ Insufficient data for plotting {category} validation results") + return + + # Ensure all arrays have the same length + min_length = min(len(dates), len(actuals), len(predictions)) + dates = dates[:min_length] + actuals = actuals[:min_length] + predictions = predictions[:min_length] + + # Convert to numpy arrays for safety + actuals = np.array(actuals) + predictions = np.array(predictions) + + fig, axes = plt.subplots(2, 2, figsize=(15, 10)) + fig.suptitle(f'Walk-Forward Validation Results: {category}', fontsize=16, fontweight='bold') + + # Convert dates to pandas datetime if needed + if len(dates) > 0: + dates = pd.to_datetime(dates) + + # Plot 1: Actual vs Predicted over time + axes[0, 0].plot(dates, actuals, label='Actual', alpha=0.7, linewidth=2) + axes[0, 0].plot(dates, predictions, label='Predicted', alpha=0.7, linewidth=2) + axes[0, 0].set_title('Actual vs Predicted Prices Over Time') + axes[0, 0].set_xlabel('Date') + axes[0, 0].set_ylabel('Price ($)') + axes[0, 0].legend() + axes[0, 0].grid(True, alpha=0.3) + + # Plot 2: Prediction errors + errors = predictions - actuals + axes[0, 1].plot(dates, errors, color='red', alpha=0.7) + axes[0, 1].axhline(y=0, color='black', linestyle='--', alpha=0.5) + axes[0, 1].set_title('Prediction Errors Over Time') + axes[0, 1].set_xlabel('Date') + axes[0, 
1].set_ylabel('Error ($)') + axes[0, 1].grid(True, alpha=0.3) + + # Plot 3: Scatter plot + axes[1, 0].scatter(actuals, predictions, alpha=0.6) + + # Safe min/max calculation + if len(actuals) > 0 and len(predictions) > 0: + min_val = min(np.min(actuals), np.min(predictions)) + max_val = max(np.max(actuals), np.max(predictions)) + axes[1, 0].plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8) + + axes[1, 0].set_title('Actual vs Predicted (Scatter)') + axes[1, 0].set_xlabel('Actual Price ($)') + axes[1, 0].set_ylabel('Predicted Price ($)') + axes[1, 0].grid(True, alpha=0.3) + + # Plot 4: Error distribution + if len(errors) > 0: + axes[1, 1].hist(errors, bins=min(20, len(errors)), alpha=0.7, edgecolor='black') + axes[1, 1].axvline(x=np.mean(errors), color='red', linestyle='--', + label=f'Mean: {np.mean(errors):.3f}') + axes[1, 1].legend() + + axes[1, 1].set_title('Distribution of Prediction Errors') + axes[1, 1].set_xlabel('Error ($)') + axes[1, 1].set_ylabel('Frequency') + axes[1, 1].grid(True, alpha=0.3) + + plt.tight_layout() + plt.show() + +# ======================================================== +# 2. PRODUCTION FORECASTING PIPELINE (NOTEBOOK VERSION) +# ======================================================== + +class NotebookForecastingPipeline: + """Production forecasting pipeline with notebook output""" + + def __init__(self, forecast_horizon=12): + self.forecast_horizon = forecast_horizon + self.models = {} + self.scalers = {} + + def train_and_forecast(self, data, category): + """Train model and generate forecasts with notebook output""" + + print(f" PRODUCTION FORECASTING: {category}") + print("=" * 60) + + # Prepare data + cat_data = data[data['category'] == category].copy() + cat_data = cat_data[cat_data['price_lag_52'].notna()] + + if len(cat_data) < 104: + print(f" Insufficient data for {category}: {len(cat_data)} records") + return None + + # Feature selection + feature_cols = [ + 'discounted_price', 'price_lag_1', 'price_lag_7', 'ma_7d', 'ma_30d', + 'promo_flag', 'has_holiday_event', 'has_seasonal_event', + 'month_sin', 'month_cos', 'volatility_4w' + ] + + # Aggregate to weekly level + weekly_data = cat_data.groupby('date')[feature_cols].mean().reset_index() + weekly_data = weekly_data.sort_values('date') + + print(f" Training Data Summary:") + print(f" • Total weeks: {len(weekly_data)}") + print(f" • Date range: {weekly_data['date'].min().strftime('%Y-%m-%d')} to {weekly_data['date'].max().strftime('%Y-%m-%d')}") + print(f" • Average price: ${weekly_data['discounted_price'].mean():.2f}") + + # Scale data + scaler = MinMaxScaler() + scaled_data = scaler.fit_transform(weekly_data[feature_cols]) + + # Create sequences + sequence_length = 12 + X, y = [], [] + for i in range(sequence_length, len(scaled_data)): + X.append(scaled_data[i-sequence_length:i]) + y.append(scaled_data[i, 0]) + + X, y = np.array(X), np.array(y) + + # Train/validation split + split_idx = int(len(X) * 0.8) + X_train, X_val = X[:split_idx], X[split_idx:] + y_train, y_val = y[:split_idx], y[split_idx:] + + print(f" • Training sequences: {len(X_train)}") + print(f" • Validation sequences: {len(X_val)}") + + # Build model + model = Sequential([ + Input(shape=(sequence_length, len(feature_cols))), + LSTM(64, return_sequences=True, dropout=0.2), + LSTM(32, dropout=0.2), + Dense(16, activation='relu'), + Dropout(0.2), + Dense(1) + ]) + + model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae']) + + print(f"\n🔧 Training Model...") + + # Train model + history = model.fit( + 
X_train, y_train, + validation_data=(X_val, y_val), + epochs=50, + batch_size=16, + callbacks=[ + EarlyStopping(patience=10, restore_best_weights=True), + ReduceLROnPlateau(patience=5, factor=0.5) + ], + verbose=0 + ) + + # Evaluate model + val_loss = min(history.history['val_loss']) + val_mae = min(history.history['val_mae']) + + print(f" Training Complete:") + print(f" • Epochs trained: {len(history.history['loss'])}") + print(f" • Final validation loss: {val_loss:.6f}") + print(f" • Final validation MAE: {val_mae:.6f}") + + # Generate forecasts + print(f"\n Generating {self.forecast_horizon}-Week Forecast...") + + forecasts = [] + confidence_intervals = [] + current_sequence = scaled_data[-sequence_length:] + + for week in range(self.forecast_horizon): + # Predict next week + X_pred = current_sequence.reshape(1, sequence_length, len(feature_cols)) + y_pred_scaled = model.predict(X_pred, verbose=0)[0, 0] + + # Inverse transform prediction + dummy_array = np.zeros((1, len(feature_cols))) + dummy_array[0, 0] = y_pred_scaled + y_pred_actual = scaler.inverse_transform(dummy_array)[0, 0] + + forecasts.append(y_pred_actual) + + # Generate confidence interval (Monte Carlo dropout) + mc_predictions = [] + for _ in range(100): + mc_pred = model.predict(X_pred, verbose=0)[0, 0] + dummy_mc = np.zeros((1, len(feature_cols))) + dummy_mc[0, 0] = mc_pred + mc_actual = scaler.inverse_transform(dummy_mc)[0, 0] + mc_predictions.append(mc_actual) + + ci_lower = np.percentile(mc_predictions, 5) + ci_upper = np.percentile(mc_predictions, 95) + confidence_intervals.append((ci_lower, ci_upper)) + + # Update sequence for next prediction + next_week_features = current_sequence[-1].copy() + next_week_features[0] = y_pred_scaled + current_sequence = np.vstack([current_sequence[1:], next_week_features]) + + # Create forecast dates + last_date = weekly_data['date'].max() + forecast_dates = pd.date_range(start=last_date + timedelta(weeks=1), + periods=self.forecast_horizon, freq='W') + + # Display forecast results + self._display_forecast_results(weekly_data, forecasts, forecast_dates, confidence_intervals, category) + + # Create forecast visualization + self._create_forecast_plots(weekly_data, forecasts, forecast_dates, confidence_intervals, category) + + # Store models for reuse + self.models[category] = model + self.scalers[category] = scaler + + return { + 'category': category, + 'forecasts': forecasts, + 'forecast_dates': forecast_dates.tolist(), + 'confidence_intervals': confidence_intervals, + 'last_actual_price': weekly_data['discounted_price'].iloc[-1], + 'training_metrics': {'val_loss': val_loss, 'val_mae': val_mae} + } + + def _display_forecast_results(self, weekly_data, forecasts, forecast_dates, confidence_intervals, category): + """Display forecast results in notebook""" + + print(f"\n FORECAST RESULTS: {category}") + print("=" * 50) + + last_price = weekly_data['discounted_price'].iloc[-1] + + print(f" Price Forecasts:") + print(f" • Current price: ${last_price:.2f}") + print(f" • 4-week forecast: ${forecasts[3]:.2f}") + print(f" • 8-week forecast: ${forecasts[7]:.2f}") + print(f" • 12-week forecast: ${forecasts[11]:.2f}") + + print(f"\n Confidence Intervals (90%):") + print(f" • 4-week: ${confidence_intervals[3][0]:.2f} - ${confidence_intervals[3][1]:.2f}") + print(f" • 8-week: ${confidence_intervals[7][0]:.2f} - ${confidence_intervals[7][1]:.2f}") + print(f" • 12-week: ${confidence_intervals[11][0]:.2f} - ${confidence_intervals[11][1]:.2f}") + + # Price change analysis + change_4w = ((forecasts[3] - 
last_price) / last_price) * 100 + change_12w = ((forecasts[11] - last_price) / last_price) * 100 + + print(f"\n Expected Price Changes:") + print(f" • 4-week change: {change_4w:+.1f}%") + print(f" • 12-week change: {change_12w:+.1f}%") + + # Trend analysis + if change_12w > 5: + trend = " Rising trend" + elif change_12w < -5: + trend = " Declining trend" + else: + trend = " Stable trend" + + print(f" • Overall trend: {trend}") + + def _create_forecast_plots(self, weekly_data, forecasts, forecast_dates, confidence_intervals, category): + """Create forecast visualization plots""" + + fig, axes = plt.subplots(2, 2, figsize=(16, 12)) + fig.suptitle(f'Production Forecasting Results: {category}', fontsize=16, fontweight='bold') + + # Prepare data for plotting + historical_dates = weekly_data['date'].tail(52) # Last year + historical_prices = weekly_data['discounted_price'].tail(52) + + # Plot 1: Main forecast chart + axes[0, 0].plot(historical_dates, historical_prices, label='Historical', linewidth=2, alpha=0.8) + axes[0, 0].plot(forecast_dates, forecasts, label='Forecast', linewidth=2, color='red') + + # Add confidence intervals + ci_lower, ci_upper = zip(*confidence_intervals) + axes[0, 0].fill_between(forecast_dates, ci_lower, ci_upper, alpha=0.3, color='red', label='90% CI') + + axes[0, 0].set_title('Price Forecast with Confidence Intervals') + axes[0, 0].set_xlabel('Date') + axes[0, 0].set_ylabel('Price ($)') + axes[0, 0].legend() + axes[0, 0].grid(True, alpha=0.3) + + # Plot 2: Forecast horizon detail + all_dates = list(historical_dates.tail(12)) + list(forecast_dates) + all_prices = list(historical_prices.tail(12)) + forecasts + + axes[0, 1].plot(all_dates[:12], all_prices[:12], 'b-', linewidth=2, label='Recent History') + axes[0, 1].plot(all_dates[11:], all_prices[11:], 'r-', linewidth=2, label='Forecast') + axes[0, 1].scatter([all_dates[11]], [all_prices[11]], color='black', s=50, zorder=5) + + axes[0, 1].set_title('Recent History vs Forecast') + axes[0, 1].set_xlabel('Date') + axes[0, 1].set_ylabel('Price ($)') + axes[0, 1].legend() + axes[0, 1].grid(True, alpha=0.3) + + # Plot 3: Forecast uncertainty + weeks = np.arange(1, len(forecasts) + 1) + uncertainty = np.array(ci_upper) - np.array(ci_lower) + + axes[1, 0].plot(weeks, uncertainty, marker='o', linewidth=2) + axes[1, 0].set_title('Forecast Uncertainty Over Time') + axes[1, 0].set_xlabel('Weeks Ahead') + axes[1, 0].set_ylabel('Uncertainty Range ($)') + axes[1, 0].grid(True, alpha=0.3) + + # Plot 4: Price change distribution + price_changes = [] + current_price = historical_prices.iloc[-1] + + for forecast in forecasts: + change = ((forecast - current_price) / current_price) * 100 + price_changes.append(change) + + axes[1, 1].bar(weeks, price_changes, alpha=0.7) + axes[1, 1].axhline(y=0, color='black', linestyle='--', alpha=0.5) + axes[1, 1].set_title('Expected Price Changes (%)') + axes[1, 1].set_xlabel('Weeks Ahead') + axes[1, 1].set_ylabel('Price Change (%)') + axes[1, 1].grid(True, alpha=0.3) + + plt.tight_layout() + plt.show() + +# ======================================================== +# 3. 
MAIN EXECUTION FUNCTIONS +# ======================================================== + +def run_notebook_validation(data, categories=None): + """Run validation framework with notebook output""" + + print(" WALK-FORWARD VALIDATION FRAMEWORK") + print("=" * 80) + + if categories is None: + categories = ['Pantry', 'Dairy, Eggs & Fridge', 'Health & Beauty'] + + validator = NotebookWalkForwardValidator() + all_results = {} + + for category in categories: + print(f"\n") + results = validator.validate_lstm_model(data, category) + if results: + all_results[category] = results + + # Overall comparison + if all_results: + print(f"\n OVERALL VALIDATION COMPARISON") + print("=" * 60) + + comparison_data = [] + for category, results in all_results.items(): + mae_scores = [r['metrics']['MAE'] for r in results] + mape_scores = [r['metrics']['MAPE'] for r in results] + + comparison_data.append({ + 'Category': category, + 'Avg MAE': np.mean(mae_scores), + 'Std MAE': np.std(mae_scores), + 'Avg MAPE': np.mean(mape_scores), + 'Best MAE': min(mae_scores), + 'Periods': len(results) + }) + + comparison_df = pd.DataFrame(comparison_data) + comparison_df = comparison_df.sort_values('Avg MAE') + + print("\n Performance Ranking:") + for i, row in comparison_df.iterrows(): + print(f" {comparison_df.index.get_loc(i)+1}. {row['Category']:<20} | " + f"MAE: {row['Avg MAE']:.3f} ± {row['Std MAE']:.3f} | " + f"MAPE: {row['Avg MAPE']:.1f}%") + + return all_results + +def run_notebook_forecasting(data, categories=None): + """Run production forecasting with notebook output""" + + print("PRODUCTION FORECASTING PIPELINE") + print("=" * 80) + + if categories is None: + categories = ['Pantry', 'Dairy, Eggs & Fridge', 'Health & Beauty'] + + pipeline = NotebookForecastingPipeline(forecast_horizon=12) + all_forecasts = {} + + for category in categories: + print(f"\n") + forecast_result = pipeline.train_and_forecast(data, category) + if forecast_result: + all_forecasts[category] = forecast_result + + # Summary comparison + if all_forecasts: + print(f"\nFORECAST SUMMARY COMPARISON") + print("=" * 60) + + summary_data = [] + for category, result in all_forecasts.items(): + current_price = result['last_actual_price'] + forecast_4w = result['forecasts'][3] + forecast_12w = result['forecasts'][11] + + change_4w = ((forecast_4w - current_price) / current_price) * 100 + change_12w = ((forecast_12w - current_price) / current_price) * 100 + + summary_data.append({ + 'Category': category, + 'Current ($)': current_price, + '4W Forecast ($)': forecast_4w, + '12W Forecast ($)': forecast_12w, + '4W Change (%)': change_4w, + '12W Change (%)': change_12w + }) + + summary_df = pd.DataFrame(summary_data) + + print("\nPrice Forecast Summary:") + for _, row in summary_df.iterrows(): + print(f" {row['Category']:<20} | " + f"Current: ${row['Current ($)']:.2f} | " + f"4W: ${row['4W Forecast ($)']:.2f} ({row['4W Change (%)']:+.1f}%) | " + f"12W: ${row['12W Forecast ($)']:.2f} ({row['12W Change (%)']:+.1f}%)") + + return all_forecasts + +def run_complete_notebook_pipeline(data_path='grocery_data_with_features.csv'): + """Execute complete pipeline with notebook output""" + + print("COMPLETE FORECASTING PIPELINE EXECUTION") + print("=" * 80) + + # Load data + print("Loading data...") + try: + data = pd.read_csv(data_path) + data['date'] = pd.to_datetime(data['date']) + print(f"Loaded {len(data):,} records from {data_path}") + except FileNotFoundError: + print(f"Data file not found: {data_path}") + print("Please ensure 'grocery_data_with_features.csv' exists in 
your current directory") + return None + + # Define categories + categories = ['Pantry', 'Dairy, Eggs & Fridge', 'Health & Beauty'] + print(f"Target categories: {categories}") + + # Phase 1: Validation + print(f"\nPHASE 1: WALK-FORWARD VALIDATION") + print("=" * 60) + + validation_results = run_notebook_validation(data, categories) + + # Phase 2: Production Forecasting + print(f"\nPHASE 2: PRODUCTION FORECASTING") + print("=" * 60) + + forecast_results = run_notebook_forecasting(data, categories) + + # Phase 3: Final Analysis + print(f"\nPHASE 3: ANALYSIS AND RECOMMENDATIONS") + print("=" * 60) + + if validation_results and forecast_results: + + print("\nModel Performance Analysis:") + print("-" * 30) + + for category in categories: + if category in validation_results: + val_results = validation_results[category] + mae_scores = [r['metrics']['MAE'] for r in val_results] + avg_mae = np.mean(mae_scores) + + if category in forecast_results: + print(f"{category}:") + print(f" Validation MAE: {avg_mae:.3f}") + print(f" Validation periods: {len(val_results)}") + + # Forecast confidence assessment + forecast_result = forecast_results[category] + ci_width_4w = forecast_result['confidence_intervals'][3][1] - forecast_result['confidence_intervals'][3][0] + ci_width_12w = forecast_result['confidence_intervals'][11][1] - forecast_result['confidence_intervals'][11][0] + + print(f" 4W CI width: ${ci_width_4w:.2f}") + print(f" 12W CI width: ${ci_width_12w:.2f}") + + print("\nKey Insights:") + print("-" * 15) + + # Find best performing category + best_category = None + best_mae = float('inf') + + for category in categories: + if category in validation_results: + val_results = validation_results[category] + mae_scores = [r['metrics']['MAE'] for r in val_results] + avg_mae = np.mean(mae_scores) + + if avg_mae < best_mae: + best_mae = avg_mae + best_category = category + + if best_category: + print(f" Best performing model: {best_category} (MAE: {best_mae:.3f})") + + # Forecast trend analysis + rising_categories = [] + stable_categories = [] + declining_categories = [] + + for category, result in forecast_results.items(): + current_price = result['last_actual_price'] + forecast_12w = result['forecasts'][11] + change_12w = ((forecast_12w - current_price) / current_price) * 100 + + if change_12w > 3: + rising_categories.append(f"{category} (+{change_12w:.1f}%)") + elif change_12w < -3: + declining_categories.append(f"{category} ({change_12w:.1f}%)") + else: + stable_categories.append(f"{category} ({change_12w:+.1f}%)") + + if rising_categories: + print(f" Rising prices (12W): {', '.join(rising_categories)}") + if declining_categories: + print(f" Declining prices (12W): {', '.join(declining_categories)}") + if stable_categories: + print(f" Stable prices (12W): {', '.join(stable_categories)}") + + print("\nRecommendations:") + print("-" * 16) + + recommendations = [] + + # Performance-based recommendations + for category in categories: + if category in validation_results: + val_results = validation_results[category] + mae_scores = [r['metrics']['MAE'] for r in val_results] + avg_mae = np.mean(mae_scores) + std_mae = np.std(mae_scores) + + if avg_mae > 1.0: + recommendations.append(f"Consider ensemble methods for {category} (high error rate)") + + if std_mae > avg_mae * 0.5: + recommendations.append(f"Investigate {category} prediction variability") + + # Forecast-based recommendations + for category, result in forecast_results.items(): + ci_width_12w = result['confidence_intervals'][11][1] - 
result['confidence_intervals'][11][0] + avg_price = result['last_actual_price'] + + if ci_width_12w / avg_price > 0.3: # CI width > 30% of price + recommendations.append(f"High uncertainty in {category} 12-week forecasts") + + if not recommendations: + recommendations.append("All models show good performance and reasonable uncertainty") + + for i, rec in enumerate(recommendations, 1): + print(f" {i}. {rec}") + + print(f"\nPIPELINE EXECUTION COMPLETE") + print("=" * 80) + print("All results displayed above. Models are ready for production use.") + + return { + 'validation_results': validation_results, + 'forecast_results': forecast_results, + 'execution_timestamp': datetime.now().isoformat() + } + +# ======================================================== +# 4. MONITORING AND ALERTS (NOTEBOOK VERSION) +# ======================================================== + +class NotebookForecastMonitor: + """Monitor forecast performance with notebook alerts""" + + def __init__(self): + self.alert_thresholds = { + 'mae_threshold': 2.0, + 'mape_threshold': 15.0, + 'bias_threshold': 1.0 + } + + def check_model_performance(self, validation_results): + """Check model performance and display alerts""" + + print("MODEL PERFORMANCE MONITORING") + print("=" * 40) + + alerts = [] + + for category, results in validation_results.items(): + if not results: + continue + + mae_scores = [r['metrics']['MAE'] for r in results] + mape_scores = [r['metrics']['MAPE'] for r in results] + + avg_mae = np.mean(mae_scores) + avg_mape = np.mean(mape_scores) + + # Check thresholds + if avg_mae > self.alert_thresholds['mae_threshold']: + severity = 'HIGH' if avg_mae > self.alert_thresholds['mae_threshold'] * 2 else 'MEDIUM' + alerts.append({ + 'category': category, + 'type': 'High MAE', + 'value': avg_mae, + 'severity': severity + }) + + if avg_mape > self.alert_thresholds['mape_threshold']: + severity = 'HIGH' if avg_mape > self.alert_thresholds['mape_threshold'] * 2 else 'MEDIUM' + alerts.append({ + 'category': category, + 'type': 'High MAPE', + 'value': avg_mape, + 'severity': severity + }) + + if alerts: + print("PERFORMANCE ALERTS:") + for alert in alerts: + print(f" [{alert['severity']}] {alert['category']}: {alert['type']} = {alert['value']:.3f}") + else: + print("No performance alerts. All models within acceptable thresholds.") + + return alerts + + def check_data_quality(self, data): + """Check data quality and display alerts""" + + print("\nDATA QUALITY MONITORING") + print("=" * 30) + + alerts = [] + + # Missing data check + missing_pct = data.isnull().sum().sum() / (len(data) * len(data.columns)) * 100 + if missing_pct > 5: + alerts.append(f"High missing data rate: {missing_pct:.1f}%") + + # Price outlier check + if 'discounted_price' in data.columns: + q99 = data['discounted_price'].quantile(0.99) + q01 = data['discounted_price'].quantile(0.01) + outlier_pct = ((data['discounted_price'] > q99) | + (data['discounted_price'] < q01)).mean() * 100 + + if outlier_pct > 3: + alerts.append(f"High price outlier rate: {outlier_pct:.1f}%") + + # Recent data check + if 'date' in data.columns: + latest_date = data['date'].max() + days_old = (datetime.now() - latest_date).days + + if days_old > 14: + alerts.append(f"Data is {days_old} days old") + + if alerts: + print("DATA QUALITY ALERTS:") + for alert in alerts: + print(f" WARNING: {alert}") + else: + print("Data quality checks passed.") + + return alerts + +# ======================================================== +# 5. 
EXECUTION EXAMPLE +# ======================================================== + +def execute_pipeline_example(): + """Example of how to run the complete pipeline""" + + print("FORECASTING PIPELINE EXECUTION EXAMPLE") + print("=" * 50) + print("\nTo run the complete pipeline, use:") + print("results = run_complete_notebook_pipeline()") + print("\nOr run components separately:") + print("validation_results = run_notebook_validation(data)") + print("forecast_results = run_notebook_forecasting(data)") + + print("\nThis will display:") + print(" - Walk-forward validation results with plots") + print(" - Production forecasts with confidence intervals") + print(" - Performance comparisons and recommendations") + print(" - All outputs directly in the notebook") + +# ======================================================== +# 6. READY TO EXECUTE +# ======================================================== + +print("NOTEBOOK FORECASTING PIPELINE LOADED") +print("=" * 40) +print("Ready to execute! Run:") +print("results = run_complete_notebook_pipeline()") +print("\nOr if you want to test individual components:") +print("validation_results = run_notebook_validation(df_with_features)") +print("forecast_results = run_notebook_forecasting(df_with_features)") + +# Execute the complete pipeline +results = run_complete_notebook_pipeline() +
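+
+# %%
+# OPTIONAL SKETCH 1: NON-DEGENERATE MONTE CARLO DROPOUT INTERVALS
+# ===============================================================
+# Keras runs `model.predict()` in inference mode, so Dropout layers (and the
+# `dropout` argument on LSTM layers) are inactive; repeated predict() calls on
+# the same input return identical values, and a Monte Carlo interval built that
+# way collapses to a point. The sketch below shows one way to draw stochastic
+# forward passes by calling the model directly with `training=True`. The helper
+# name `mc_dropout_interval` is illustrative, not part of the pipeline above,
+# and it assumes the target price is feature 0 of the fitted MinMaxScaler.
+
+def mc_dropout_interval(model, X_pred, scaler, n_features,
+                        n_samples=100, lower=5, upper=95):
+    """Return (mean, ci_lower, ci_upper) on the original price scale."""
+    samples = []
+    for _ in range(n_samples):
+        # training=True keeps dropout active, so each pass is a different draw
+        y_scaled = float(model(X_pred, training=True).numpy()[0, 0])
+        dummy = np.zeros((1, n_features))
+        dummy[0, 0] = y_scaled
+        samples.append(scaler.inverse_transform(dummy)[0, 0])
+    samples = np.array(samples)
+    return samples.mean(), np.percentile(samples, lower), np.percentile(samples, upper)
+
+# Example usage (assumes `pipeline` is a NotebookForecastingPipeline that has
+# already run train_and_forecast for 'Pantry', and that X_pred / feature_cols
+# are prepared exactly as inside train_and_forecast):
+# point, lo, hi = mc_dropout_interval(pipeline.models['Pantry'], X_pred,
+#                                     pipeline.scalers['Pantry'],
+#                                     n_features=len(feature_cols))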
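+
+# %%
+# OPTIONAL SKETCH 2: PERSIST FORECASTS FOR DOWNSTREAM USE
+# =======================================================
+# The pipeline above prints its results but does not write them to disk. A
+# minimal sketch, assuming `results` is the dict returned by
+# run_complete_notebook_pipeline() (keys as documented in that function); the
+# output filename is illustrative.
+
+def save_forecasts_to_csv(results, path='category_price_forecasts.csv'):
+    """Flatten per-category forecasts into one tidy CSV."""
+    if not results or not results.get('forecast_results'):
+        print("No forecast results to save")
+        return None
+    rows = []
+    for category, res in results['forecast_results'].items():
+        for week, (date, price, (lo, hi)) in enumerate(
+                zip(res['forecast_dates'], res['forecasts'],
+                    res['confidence_intervals']), start=1):
+            rows.append({
+                'category': category,
+                'weeks_ahead': week,
+                'forecast_date': pd.Timestamp(date).date(),
+                'forecast_price': round(float(price), 2),
+                'ci_lower': round(float(lo), 2),
+                'ci_upper': round(float(hi), 2),
+                'last_actual_price': round(float(res['last_actual_price']), 2),
+            })
+    forecast_df = pd.DataFrame(rows)
+    forecast_df.to_csv(path, index=False)
+    print(f"Saved {len(forecast_df)} forecast rows to {path}")
+    return forecast_df
+
+# Example usage:
+# forecast_df = save_forecasts_to_csv(results)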