From 12fb4e4ded3cef248b3d66a21911b65f28e094cb Mon Sep 17 00:00:00 2001 From: raj-4041 Date: Tue, 23 Sep 2025 15:28:33 +1000 Subject: [PATCH] time series deal prediction Signed-off-by: raj-4041 --- .../Raj's_Time_series_deal_pred.py | 4600 +++++++++++++++++ 1 file changed, 4600 insertions(+) create mode 100644 ML/Price-Prediction/Raj's_Time_series_deal_pred.py diff --git a/ML/Price-Prediction/Raj's_Time_series_deal_pred.py b/ML/Price-Prediction/Raj's_Time_series_deal_pred.py new file mode 100644 index 00000000..e67fb4d2 --- /dev/null +++ b/ML/Price-Prediction/Raj's_Time_series_deal_pred.py @@ -0,0 +1,4600 @@ +# -*- coding: utf-8 -*- +"""Untitled.ipynb + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1eKrTU7EnUkhJFvzTfmnu2bsxvN5X2wtC +""" + +# %% +import pandas as pd +import numpy as np +import re +from datetime import datetime, timedelta +import matplotlib.pyplot as plt +import seaborn as sns +import plotly.express as px +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from statsmodels.tsa.seasonal import seasonal_decompose +import warnings +warnings.filterwarnings('ignore') + +# %% +# CONFIGURATION AND DATA STRUCTURES +# ================================= + +# Function to parse weights to grams +def parse_to_grams(s): + """Convert weight strings to grams for standardization""" + if pd.isna(s): + return None + + s = str(s).strip().lower() + match = re.search(r'(\d+\.?\d*)\s*([a-zA-Z]+)', s) + + if not match: + if 'pack' in s or 'each' in s: + return 1 + return None + + num_str = match.group(1) + unit = match.group(2).lower() + + try: + num = float(num_str) + except ValueError: + if 'pack' in s or 'each' in s: + return 1 + return None + + # Unit conversions + if unit in ['g', 'gram', 'grams']: + return num + elif unit in ['kg', 'kilogram', 'kilograms']: + return num * 1000 + elif unit in ['ml', 'milliliter', 'milliliters']: + return num # assume 1g/ml + elif unit in ['l', 'liter', 'liters']: + return num * 1000 + elif unit in ['pack', 'each']: + return num + else: + return None + +# %% +# HOLIDAY AND EVENT DEFINITIONS +# ============================ + +# Major holidays affecting grocery pricing +holidays = { + 'New Year': [ + datetime(2023, 1, 1), datetime(2024, 1, 1), datetime(2025, 1, 1) + ], + 'Australia Day': [ + datetime(2023, 1, 26), datetime(2024, 1, 26), datetime(2025, 1, 26) + ], + 'Easter': [ + datetime(2023, 4, 7), datetime(2023, 4, 10), + datetime(2024, 3, 29), datetime(2024, 4, 1), + datetime(2025, 4, 18), datetime(2025, 4, 21) + ], + 'Anzac Day': [ + datetime(2023, 4, 25), datetime(2024, 4, 25), datetime(2025, 4, 25) + ], + 'Christmas': [ + datetime(2023, 12, 25), datetime(2023, 12, 26), + datetime(2024, 12, 25), datetime(2024, 12, 26), + datetime(2025, 12, 25), datetime(2025, 12, 26) + ], + 'Chinese New Year': [ + datetime(2023, 2, 22), datetime(2024, 2, 10), datetime(2025, 1, 29) + ], + 'Mother Day': [ + datetime(2023, 5, 14), datetime(2024, 5, 12), datetime(2025, 5, 11) + ], + 'Father Day': [ + datetime(2023, 9, 3), datetime(2024, 9, 1), datetime(2025, 9, 7) + ], + 'Labour Day': [ + datetime(2023, 10, 2), datetime(2024, 10, 7), datetime(2025, 10, 6) + ], + 'King Birthday': [ + datetime(2023, 6, 12), datetime(2024, 6, 10), datetime(2025, 6, 9) + ], + 'Melbourne Cup': [ + datetime(2023, 11, 7), datetime(2024, 11, 5), datetime(2025, 11, 4) + ], + 'Diwali': [ + datetime(2023, 11, 12), datetime(2024, 11, 1), datetime(2025, 10, 20) + ], + 'Halloween': [ + datetime(2023, 10, 31), datetime(2024, 10, 
31), datetime(2025, 10, 31) + ], + 'Valentine Day': [ + datetime(2023, 2, 14), datetime(2024, 2, 14), datetime(2025, 2, 14) + ], + 'Back to School': [ + datetime(2023, 1, 30), datetime(2023, 7, 24), + datetime(2024, 1, 29), datetime(2024, 7, 22), + datetime(2025, 1, 27), datetime(2025, 7, 21) + ], + 'School Holidays': [ + datetime(2023, 4, 10), datetime(2023, 7, 1), datetime(2023, 9, 25), datetime(2023, 12, 18), + datetime(2024, 3, 28), datetime(2024, 7, 6), datetime(2024, 9, 23), datetime(2024, 12, 16), + datetime(2025, 4, 14), datetime(2025, 7, 5), datetime(2025, 9, 22), datetime(2025, 12, 15) + ] +} + +# Supply chain disruption events +supply_chain_events = { + 'Suez Canal': [datetime(2023, 3, 25), datetime(2023, 3, 29)], + 'Shipping Delays': [datetime(2023, 8, 15), datetime(2023, 8, 25)], + 'Port Strikes': [datetime(2024, 2, 10), datetime(2024, 2, 20)], + 'Fuel Price Spike': [datetime(2024, 9, 1), datetime(2024, 9, 15)], + 'Container Shortage': [datetime(2023, 11, 1), datetime(2023, 11, 30)], + 'COVID Lockdown': [datetime(2023, 5, 1), datetime(2023, 5, 14)], + 'Truck Driver Strike': [datetime(2024, 6, 15), datetime(2024, 6, 25)], + 'Factory Fire': [datetime(2024, 11, 10), datetime(2024, 11, 20)] +} + +# Weather events affecting agriculture and supply +weather_events = { + 'Flood Queensland': [datetime(2023, 2, 15), datetime(2023, 3, 15)], + 'Drought NSW': [datetime(2023, 6, 1), datetime(2023, 8, 31)], + 'Cyclone WA': [datetime(2024, 1, 20), datetime(2024, 2, 5)], + 'Heatwave Victoria': [datetime(2024, 12, 15), datetime(2025, 1, 15)], + 'Frost Tasmania': [datetime(2023, 9, 1), datetime(2023, 9, 30)], + 'Bushfire NSW': [datetime(2024, 10, 1), datetime(2024, 10, 31)], + 'Heavy Rain Melbourne': [datetime(2024, 3, 10), datetime(2024, 3, 25)], + 'Extreme Heat Adelaide': [datetime(2025, 2, 1), datetime(2025, 2, 14)] +} + +# Disease/pest outbreaks +disease_events = { + 'Avian Flu': [datetime(2023, 7, 1), datetime(2023, 9, 30)], + 'Foot and Mouth Scare': [datetime(2024, 4, 1), datetime(2024, 4, 30)], + 'White Spot Prawns': [datetime(2023, 10, 15), datetime(2023, 11, 15)], + 'Banana Disease': [datetime(2024, 8, 1), datetime(2024, 9, 15)], + 'Citrus Canker': [datetime(2025, 3, 1), datetime(2025, 4, 15)] +} + +# %% +# PRICING MULTIPLIERS BY CATEGORY +# =============================== + +# FIXED: Realistic category multipliers (reduced by 50-70%) +category_multipliers = { + 'Meat & Seafood': { + 'Christmas': 1.3, 'Easter': 1.2, 'Summer': 1.1, 'Winter': 0.95, + 'Avian Flu': 1.25, 'Foot and Mouth Scare': 1.15, 'White Spot Prawns': 1.2, + 'Drought NSW': 1.1, 'Flood Queensland': 1.08, + 'supply_chain_base': 1.15, 'weather_base': 1.1, 'disease_base': 1.2, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Fruit & Vegetables': { + 'Summer': 0.85, 'Winter': 1.2, 'Christmas': 1.1, 'Chinese New Year': 1.15, + 'Flood Queensland': 1.4, 'Drought NSW': 1.3, 'Cyclone WA': 1.25, + 'Heatwave Victoria': 1.15, 'Frost Tasmania': 1.2, 'Heavy Rain Melbourne': 1.1, + 'Banana Disease': 1.5, 'Citrus Canker': 1.4, + 'supply_chain_base': 1.08, 'weather_base': 1.3, 'disease_base': 1.4, + 'shock_prob': 0.12, 'shock_var': 0.25 + }, + 'Dairy, Eggs & Fridge': { + 'Christmas': 1.15, 'Easter': 1.1, 'Winter': 1.05, 'Back to School': 1.08, + 'Avian Flu': 1.3, 'Drought NSW': 1.15, 'Extreme Heat Adelaide': 1.1, + 'supply_chain_base': 1.1, 'weather_base': 1.08, 'disease_base': 1.25, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Bakery': { + 'Christmas': 1.2, 'Easter': 1.1, 'School Holidays': 1.08, 'Winter': 1.03, + 'supply_chain_base': 
1.05, 'weather_base': 1.03, + 'shock_prob': 0.05, 'shock_var': 0.08 + }, + 'Pantry': { + 'COVID Lockdown': 1.25, 'School Holidays': 1.1, 'Back to School': 1.15, + 'Container Shortage': 1.1, 'Shipping Delays': 1.08, + 'supply_chain_base': 1.08, 'weather_base': 1.03, + 'shock_prob': 0.05, 'shock_var': 0.1 + }, + 'Health & Beauty': { + 'New Year': 1.25, 'Valentine Day': 1.1, 'Winter': 1.08, + 'supply_chain_base': 1.03, 'shock_prob': 0.03, 'shock_var': 0.05 + }, + 'Drinks': { + 'Summer': 1.3, 'Christmas': 1.35, 'Australia Day': 1.15, 'Melbourne Cup': 1.1, + 'Heatwave Victoria': 1.2, 'Extreme Heat Adelaide': 1.18, + 'supply_chain_base': 1.08, 'weather_base': 1.1, + 'shock_prob': 0.05, 'shock_var': 0.1 + }, + 'Frozen': { + 'Summer': 1.5, 'Heatwave Victoria': 1.3, 'Extreme Heat Adelaide': 1.25, + 'Christmas': 1.3, 'School Holidays': 1.2, + 'supply_chain_base': 1.1, 'weather_base': 1.25, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Deli': { + 'Christmas': 1.4, 'Easter': 1.25, 'Melbourne Cup': 1.18, 'King Birthday': 1.1, + 'supply_chain_base': 1.08, 'shock_prob': 0.06, 'shock_var': 0.12 + }, + 'Household': { + 'Back to School': 1.15, 'Spring': 1.1, 'COVID Lockdown': 1.2, + 'supply_chain_base': 1.05, 'shock_prob': 0.04, 'shock_var': 0.08 + } +} + +# %% +# SUBCATEGORY-SPECIFIC SEASONAL EFFECTS +# ==================================== + +# FIXED: Realistic subcategory seasonal effects (reduced) +subcat_seasonal_effects = { + 'Fruit': {'Summer': 0.7, 'Winter': 1.4}, # Reduced from 0.4/2.5 + 'Vegetables (Leafy/Salad)': {'Summer': 1.15, 'Winter': 0.95}, # Reduced from 1.4/0.9 + 'Vegetables (Root/Onion/Garlic)': {'Winter': 0.9, 'Summer': 1.08}, # Reduced from 0.8/1.2 + 'Vegetables (Fruiting)': {'Summer': 0.8, 'Winter': 1.2}, # Reduced from 0.6/1.5 + 'Lamb': {'Easter': 1.4, 'Christmas': 1.25}, # Reduced from 2.0/1.6 + 'Turkey': {'Christmas': 1.8, 'Easter': 1.05}, # Reduced from 3.0/1.2 + 'Fish': {'Christmas': 1.5, 'Easter': 1.3, 'Summer': 1.1}, # Reduced from 2.4/1.8/1.3 + 'Prawns': {'Christmas': 2.0, 'Chinese New Year': 1.6}, # Reduced from 3.5/2.2 + 'Ice Cream': {'Summer': 2.2, 'Heatwave Victoria': 1.8, 'Winter': 0.5}, # Reduced from 4.0/3.0/0.3 + 'Frozen Vegetables': {'Winter': 1.1, 'COVID Lockdown': 1.2} # Reduced from 1.3/1.5 +} + +# %% +# SUBCATEGORY PROMOTION PROBABILITIES +# ================================== + +# Dictionary of subcat apply probabilities +subcat_apply_probs = { + 'Pork': 0.4, 'Beef': 0.4, 'Chicken': 0.4, 'Prawns': 0.35, 'Pantry/Other': 0.15, + 'Lamb': 0.35, 'Mixed Meat': 0.3, 'Salmon': 0.35, 'Fish': 0.35, 'Turkey': 0.3, + 'Tuna': 0.3, 'Kangaroo': 0.3, 'Seafood': 0.35, 'Plant-Based': 0.25, 'Veal': 0.3, + 'Duck': 0.3, 'Trout': 0.35, 'Mussels': 0.35, 'Venison': 0.3, 'Wallaby': 0.3, + 'Crab': 0.35, 'Fruit': 0.25, 'Other Items (F&V Section)': 0.2, + 'Vegetables (Fruiting)': 0.25, 'Vegetables (Root/Onion/Garlic)': 0.2, + 'Vegetables (Stem/Flower/Pod)': 0.2, 'Vegetables (Leafy/Salad)': 0.25, + 'Mushrooms': 0.25, 'Herbs/Sprouts': 0.2, 'Value-Added Produce': 0.2, + 'Nuts/Seeds/Dried Fruit': 0.25, 'Yoghurt Specialty': 0.25, 'Butter Standard': 0.2, + 'Milk Specialty': 0.2, 'Cheese Standard': 0.2, 'Cheese Specialty': 0.25, + 'Butter Specialty': 0.2, 'Yoghurt Standard': 0.2, 'Outsider': 0.15, + 'Cream Standard': 0.2, 'Milk Standard': 0.15, 'Eggs Standard': 0.2, + 'Bread Loaves': 0.15, 'Wraps & Flatbreads': 0.2, 'Cakes & Slices': 0.3, + 'Rolls & Buns': 0.2, 'Savoury Bakery Items': 0.25, 'Sourdough & Artisan Breads': 0.25, + 'Sweet Pastries & Donuts': 0.3, 'Biscuits & Cookies': 0.3, 'Pancakes, 
Waffles & Crepes': 0.25, + 'Muffins & Cupcakes': 0.3, 'Seafood (Processed/Cooked)': 0.35, 'Bacon': 0.35, + 'Ham': 0.35, 'Platters/Kits': 0.3, 'Chicken (Processed/Cooked)': 0.35, 'Pantry': 0.15, + 'Salami/Pepperoni/Chorizo': 0.25, 'Crackers/Breadsticks': 0.25, 'Antipasto/Olives/Pickles': 0.25, + 'Turkey (Processed/Cooked)': 0.3, 'Beef (Processed/Cooked)': 0.35, 'Frankfurts/Sausages': 0.3, + 'Pork (Processed/Cooked)': 0.35, 'Cheese': 0.2, 'Bakery': 0.25, 'Confectionery': 0.3, + 'Other Deli': 0.2, 'Prepared Meals': 0.25, 'Dips/Pate': 0.25, 'Snacks (Sweet)': 0.3, + 'Canned Goods': 0.15, 'Meal Kits/Bases/Instant Meals': 0.25, 'Breakfast Cereals': 0.25, + 'Pasta/Rice/Noodles/Grains': 0.15, 'Snacks (Savoury)': 0.3, 'Beverages (Shelf-Stable)': 0.25, + 'Spreads/Oils/Condiments': 0.2, 'Baking Mixes': 0.2, 'Baking Ingredients': 0.2, + 'Other Pantry Items': 0.15, 'Juice/Smoothie': 0.25, 'Functional/Health Drink': 0.25, + 'Other Drinks': 0.2, 'Soft Drink/Mixer': 0.3, 'Water': 0.15, 'Milk': 0.15, + 'Non-Drink Item': 0.15, 'Tea': 0.2, 'Coffee': 0.25, 'Alcoholic Beverages (Low/No Alc)': 0.3, + 'Frozen Chips': 0.3, 'Ice Cream': 0.3, 'Frozen Desserts': 0.3, 'Frozen Meat': 0.3, + 'Frozen Poultry': 0.3, 'Frozen Fruits': 0.25, 'Frozen Vegetables': 0.25, + 'Frozen Pastry': 0.25, 'Frozen Meals': 0.25, 'Frozen Seafood': 0.3, 'Other Frozen': 0.2, + 'Stationery': 0.1, 'Dishwashing': 0.15, 'Bags': 0.15, 'Laundry Care': 0.2, + 'Kitchenware & Food Storage': 0.2, 'Paper Products': 0.15, 'Cleaning Solutions & Wipes': 0.2, + 'Cleaning Tools & Accessories': 0.15, 'Home Maintenance & General': 0.15, + 'Air Care & Pest Control': 0.2, 'Vitamins & Supplements': 0.2, 'Skincare': 0.25, + 'Wash Products': 0.2, 'First Aid & Wellness': 0.15, 'Health & Medicines': 0.15, + 'Feminine & Incontinence Care': 0.15, 'Oral Care': 0.2, 'Deodorants & Antiperspirants': 0.25, + 'Hair Care': 0.2, 'Shaving & Hair Removal': 0.2, 'First Aid & Wellness Accessories': 0.15, + 'Deodorants & Body Sprays': 0.25, 'Medicines & Health Treatments': 0.15, + "Shaving & Men's Grooming": 0.2, +} + +# %% +# CORE PRICING AND ANALYSIS FUNCTIONS +# =================================== + +def get_fortnight_col(date): + """Get fortnight column name for discount lookup""" + month_abbr = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + month_idx = date.month - 1 + month_name = month_abbr[month_idx] + half = '01' if date.day <= 15 else '02' + return f"{month_name}-{half}" + +def get_season(date): + """Get season for Southern Hemisphere""" + month = date.month + if month in [12, 1, 2]: + return 'Summer' + elif month in [3, 4, 5]: + return 'Autumn' + elif month in [6, 7, 8]: + return 'Winter' + else: + return 'Spring' + +def is_event_active(date, event_dates, days_impact=7): + """Check if date is within impact period of an event""" + for event_date in event_dates: + if isinstance(event_date, list): + if len(event_date) == 2: + start_date, end_date = event_date + if start_date <= date <= end_date + timedelta(days=days_impact): + return True + else: + if abs((date - event_date).days) <= days_impact: + return True + return False + +def apply_enhanced_factors(date, category, subcat, base_price, base_discount_pct): + """Apply comprehensive pricing factors including all real-world events - FIXED VERSION""" + multipliers = category_multipliers.get(category, {}) + subcat_effects = subcat_seasonal_effects.get(subcat, {}) + + mult = 1.0 + event_description = [] + + # Seasonal effects + season = get_season(date) + if season in multipliers: + 
seasonal_mult = multipliers[season] + mult *= seasonal_mult + if seasonal_mult != 1.0: + event_description.append(f"{season} effect: {seasonal_mult:.2f}x") + + # Subcat seasonal effects (override category if more specific) + if season in subcat_effects: + subcat_mult = subcat_effects[season] + mult = mult / multipliers.get(season, 1.0) * subcat_mult + event_description.append(f"{subcat} {season}: {subcat_mult:.2f}x") + + # Holiday effects - REDUCED PROBABILITY + for holiday, dates in holidays.items(): + if is_event_active(date, dates, days_impact=14): + # Only apply if random chance (reduce from 100% to 40%) + if np.random.rand() < 0.4: + if holiday in multipliers: + holiday_mult = multipliers[holiday] + mult *= holiday_mult + event_description.append(f"{holiday}: {holiday_mult:.2f}x") + elif holiday in subcat_effects: + holiday_mult = subcat_effects[holiday] + mult *= holiday_mult + event_description.append(f"{subcat} {holiday}: {holiday_mult:.2f}x") + + # Supply chain disruptions - REDUCED PROBABILITY + for event, dates in supply_chain_events.items(): + if is_event_active(date, dates, days_impact=21): + # Only apply if random chance (reduce from 100% to 20%) + if np.random.rand() < 0.2: + supply_mult = multipliers.get('supply_chain_base', 1.08) + if event in multipliers: + supply_mult = multipliers[event] + mult *= supply_mult + event_description.append(f"Supply chain ({event}): {supply_mult:.2f}x") + + # Weather events - CATEGORY-SPECIFIC AND REDUCED PROBABILITY + for event, dates in weather_events.items(): + if is_event_active(date, dates, days_impact=30): + # Only apply to relevant categories and reduce probability + weather_categories = ['Fruit & Vegetables', 'Meat & Seafood', 'Dairy, Eggs & Fridge'] + if category in weather_categories and np.random.rand() < 0.25: + weather_mult = multipliers.get('weather_base', 1.1) + if event in multipliers: + weather_mult = multipliers[event] + mult *= weather_mult + event_description.append(f"Weather ({event}): {weather_mult:.2f}x") + + # Disease/pest events - CATEGORY-SPECIFIC AND REDUCED PROBABILITY + for event, dates in disease_events.items(): + if is_event_active(date, dates, days_impact=60): + # Only apply to relevant categories + disease_categories = { + 'Avian Flu': ['Meat & Seafood', 'Dairy, Eggs & Fridge'], + 'Foot and Mouth Scare': ['Meat & Seafood', 'Dairy, Eggs & Fridge'], + 'White Spot Prawns': ['Meat & Seafood'], + 'Banana Disease': ['Fruit & Vegetables'], + 'Citrus Canker': ['Fruit & Vegetables'] + } + relevant_categories = disease_categories.get(event, []) + if category in relevant_categories and np.random.rand() < 0.3: + disease_mult = multipliers.get('disease_base', 1.2) + if event in multipliers: + disease_mult = multipliers[event] + mult *= disease_mult + event_description.append(f"Disease ({event}): {disease_mult:.2f}x") + + # Random market shocks - REDUCED PROBABILITY + shock_prob = multipliers.get('shock_prob', 0.05) + if np.random.rand() < shock_prob: + shock = np.random.normal(0, multipliers.get('shock_var', 0.1)) + shock_mult = 1 + shock + mult *= shock_mult + if abs(shock) > 0.05: # Only log significant shocks + event_description.append(f"Market shock: {shock_mult:.2f}x") + + # Inflation trend (1.5-3% annually) - REDUCED + current_date = datetime(2025, 8, 19) + years_back = (current_date - date).days / 365.25 + inflation_rate = np.random.uniform(0.015, 0.03) # Reduced from 0.02-0.04 + trend_mult = (1 + inflation_rate) ** years_back + mult *= trend_mult + + # Competitor effects (random promotions) - REDUCED PROBABILITY + 
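# Modeled as an occasional 2-8% price cut (5% chance per product-week), independent of the promo_flag discounts applied downstream +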
if np.random.rand() < 0.05: # Reduced from 0.1 + competitor_effect = np.random.uniform(0.92, 0.98) # Reduced impact + mult *= competitor_effect + event_description.append(f"Competitor pressure: {competitor_effect:.2f}x") + + # CRITICAL FIX: Enforce realistic bounds + mult = np.clip(mult, 0.5, 2.5) # Prevent extreme multipliers + + # Calculate final prices + normal_price = base_price * mult + + # CRITICAL FIX: Ensure minimum price + normal_price = max(normal_price, base_price * 0.3) # Never less than 30% of base + + # Discount adjustments with realistic bounds + if mult > 1.2: + discount_multiplier = 1.1 # Reduced from 1.2 + elif mult < 0.9: + discount_multiplier = 0.9 # Reduced from 0.8 + else: + discount_multiplier = 1.0 + + adjusted_discount = base_discount_pct * discount_multiplier + adjusted_discount = max(0, min(0.7, adjusted_discount)) # Max 70% discount + + return normal_price, adjusted_discount, event_description + +# %% +# HISTORICAL DATA GENERATION +# ========================= + +def generate_initial_discounts(): + """Generate initial discount data (first code block functionality)""" + + # Load the main dataset + df = pd.read_csv('/content/coles_new.csv') + df.columns = df.columns.str.strip() + + # Load the fortnightly discounts dataset + discounts_df = pd.read_csv('/content/fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] # Fortnight cols like Jan-01 + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Set the current date + current_date = datetime(2025, 8, 5) + df['date'] = current_date + + # Parse weights to grams + df['grams'] = df['weights'].apply(parse_to_grams) + + # Compute price per 100g + price_col = 'item_price' + if price_col in df.columns: + df['price_per_100g'] = df.apply( + lambda row: (row[price_col] / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + axis=1 + ) + + # Get current fortnight column + fortnight_col = get_fortnight_col(current_date) + + # Initialize discount columns + df['promo_flag'] = 0 + df['discount_pct'] = 0.0 + df['discounted_price'] = df[price_col] + + # Apply discounts per item + for idx, row in df.iterrows(): + subcat = row['subcat'] + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + base_discount_pct = rule.get(fortnight_col, 0) / 100 + apply_prob = subcat_apply_probs.get(subcat, 0.2) + promo_flag = (base_discount_pct > 0) and (np.random.rand() < apply_prob) + + if promo_flag: + discount_pct = base_discount_pct + np.random.normal(0, 0.05) + discount_pct = np.clip(discount_pct, 0, 0.9) + df.at[idx, 'promo_flag'] = 1 + df.at[idx, 'discount_pct'] = discount_pct + df.at[idx, 'discounted_price'] = row[price_col] * (1 - discount_pct) + + # Save to new CSV + df.to_csv('coles_with_discounts.csv', index=False) + print("Initial discount data generated and saved to 'coles_with_discounts.csv'") + + return df + +def generate_enhanced_historical_data(): + """Main function to generate enhanced historical data""" + + # Load the main dataset + df = pd.read_csv('/content/coles_new.csv') + 
df.columns = df.columns.str.strip() + + # Load the fortnightly discounts dataset + discounts_df = pd.read_csv('/content/fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] # Fortnight cols like Jan-01 + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Set the current date + current_date = datetime(2025, 8, 19) + + # Generate historical dates (104 weeks = 2 years) + dates = pd.date_range(end=current_date, periods=104, freq='W') + + # Parse weights and calculate price per 100g + df['grams'] = df['weights'].apply(parse_to_grams) + df['price_per_100g'] = df.apply( + lambda row: (row['item_price'] / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + axis=1 + ) + + print(f"Generating enhanced historical data for {len(df)} products over {len(dates)} weeks...") + + all_historical_data = [] + + for idx, (_, row) in enumerate(df.iterrows()): + if idx % 1000 == 0: + print(f"Processing product {idx+1}/{len(df)}") + + subcat = row['subcat'] + category = row['category'] + base_price = row['item_price'] + + # Get discount rule for this subcat + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + apply_prob = subcat_apply_probs.get(subcat, 0.2) + + for date in dates: + # Get base discount for this fortnight + fortnight = get_fortnight_col(date) + base_discount_pct = float(rule.get(fortnight, 0)) / 100 + + # Apply enhanced factors + normal_price, discount_pct, events = apply_enhanced_factors( + date, category, subcat, base_price, base_discount_pct + ) + + # Determine if promotion is active + promo_flag = (base_discount_pct > 0) and (np.random.rand() < apply_prob) + if promo_flag: + discount_pct = discount_pct + np.random.normal(0, 0.05) + discount_pct = np.clip(discount_pct, 0, 0.9) + final_discount = discount_pct + else: + final_discount = 0 + + discounted_price = normal_price * (1 - final_discount) + + # Store the record + record = { + 'date': date, + 'product_code': row['product_code'], + 'category': category, + 'essential_flag': row['essential_flag'], + 'item_name': row['item_name'], + 'subcat': subcat, + 'weights': row['weights'], + 'unit_price': row.get('unit_price', 0), + 'brand_name': row['brand_name'], + 'grams': row['grams'], + 'price_per_100g': (discounted_price / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + 'normal_price': round(normal_price, 2), + 'promo_flag': int(promo_flag), + 'discount_pct': round(final_discount, 4), + 'discounted_price': round(discounted_price, 2), + 'price_multiplier': round(normal_price / base_price, 3), + 'events_active': '; '.join(events) if events else 'None' + } + + all_historical_data.append(record) + + # Convert to DataFrame + historical_df = pd.DataFrame(all_historical_data) + + print(f"Generated {len(historical_df):,} historical records") + return historical_df + +# %% +# SIMPLIFIED CATEGORY-BASED GENERATION (ALTERNATIVE APPROACH) +# ========================================================== + +def generate_synthetic_for_category(category_df, dates): + """Generate 
synthetic data for a specific category""" + synthetic_dfs = [] + + for _, row in category_df.iterrows(): + subcat = row['subcat'] + category = row['category'] + base_price = row['item_price'] + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + apply_prob = subcat_apply_probs.get(subcat, 0.2) + + item_df = pd.DataFrame({'date': dates}) + item_df['product_code'] = row['product_code'] + item_df['item_name'] = row['item_name'] + item_df['brand_name'] = row['brand_name'] + item_df['weights'] = row['weights'] + item_df['grams'] = row['grams'] + item_df['price_per_100g'] = row['price_per_100g'] + item_df['subcat'] = subcat + item_df['category'] = category + + item_df['normal_price'] = np.nan + item_df['discount_pct'] = 0.0 + item_df['promo_flag'] = 0 + item_df['discounted_price'] = np.nan + + for i, date in enumerate(dates): + fortnight = get_fortnight_col(date) + base_discount_pct = float(rule.get(fortnight, 0)) / 100 + normal_price, discount_pct = apply_enhanced_factors(date, category, subcat, base_price, base_discount_pct)[:2] + + promo_flag = (discount_pct > 0) and (np.random.rand() < apply_prob) + item_df.at[i, 'normal_price'] = normal_price + item_df.at[i, 'promo_flag'] = 1 if promo_flag else 0 + item_df.at[i, 'discount_pct'] = discount_pct if promo_flag else 0 + item_df.at[i, 'discounted_price'] = normal_price * (1 - item_df.at[i, 'discount_pct']) + + synthetic_dfs.append(item_df) + return pd.concat(synthetic_dfs, ignore_index=True) + +# %% +# EXPLORATORY DATA ANALYSIS FUNCTIONS +# =================================== + +def create_essential_eda(df): + """Create 5 essential EDA plots""" + + print("Creating Essential EDA Plots...") + + # Setup modern style + sns.set_style("whitegrid") + plt.rcParams['figure.figsize'] = (12, 6) + plt.rcParams['font.size'] = 12 + + # 1. Time Series Analysis - Average Prices by Category + monthly_avg = df.groupby([df['date'].dt.to_period('M'), 'category'])['discounted_price'].mean().reset_index() + monthly_avg['date'] = monthly_avg['date'].dt.to_timestamp() + + fig1 = px.line( + monthly_avg, + x='date', + y='discounted_price', + color='category', + title='1. Average Monthly Prices by Category Over Time', + labels={'discounted_price': 'Average Price (AUD)', 'date': 'Date'} + ) + fig1.update_layout(height=600, hovermode='x unified') + fig1.show() + + # 2. Event Impact Analysis + event_impact = df.copy() + event_impact['has_events'] = event_impact['events_active'] != 'None' + comparison = event_impact.groupby(['category', 'has_events'])['price_multiplier'].mean().reset_index() + comparison['event_status'] = comparison['has_events'].map({True: 'With Events', False: 'Normal'}) + + fig2 = px.bar( + comparison, + x='category', + y='price_multiplier', + color='event_status', + title='2. Price Impact: Normal vs Event Periods', + labels={'price_multiplier': 'Average Price Multiplier'}, + barmode='group' + ) + fig2.update_layout(height=600) + fig2.update_xaxes(tickangle=45) + fig2.show() + + # 3. 
Seasonal Patterns Heatmap + df['month'] = df['date'].dt.month + seasonal_data = df.groupby(['category', 'month'])['discounted_price'].mean().reset_index() + seasonal_pivot = seasonal_data.pivot(index='category', columns='month', values='discounted_price') + + fig3 = go.Figure(data=go.Heatmap( + z=seasonal_pivot.values, + x=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], + y=seasonal_pivot.index, + colorscale='RdYlBu_r', + text=np.round(seasonal_pivot.values, 2), + texttemplate="%{text}", + textfont={"size": 10}, + hoverongaps=False + )) + fig3.update_layout( + title='3. Seasonal Price Patterns by Category', + height=600, + xaxis_title="Month", + yaxis_title="Category" + ) + fig3.show() + + # 4. Price Volatility by Category + volatility = df.groupby(['category', df['date'].dt.to_period('M')])['discounted_price'].agg(['mean', 'std']).reset_index() + volatility['date'] = volatility['date'].dt.to_timestamp() + volatility['cv'] = volatility['std'] / volatility['mean'] * 100 # Coefficient of variation + + fig4 = px.line( + volatility, + x='date', + y='cv', + color='category', + title='4. Price Volatility (Coefficient of Variation) by Category', + labels={'cv': 'Coefficient of Variation (%)', 'date': 'Date'} + ) + fig4.update_layout(height=600) + fig4.show() + + # 5. Distribution Overview + fig5 = make_subplots( + rows=2, cols=2, + subplot_titles=('Price Distribution', 'Discount Distribution', 'Promotion Rate by Category', 'Price Multiplier Distribution'), + specs=[[{'type': 'histogram'}, {'type': 'histogram'}], + [{'type': 'bar'}, {'type': 'histogram'}]] + ) + + # Price distribution + fig5.add_trace( + go.Histogram(x=df['discounted_price'], nbinsx=50, name='Price Distribution'), + row=1, col=1 + ) + + # Discount distribution (only when discount > 0) + discount_data = df[df['discount_pct'] > 0]['discount_pct'] * 100 + fig5.add_trace( + go.Histogram(x=discount_data, nbinsx=30, name='Discount Distribution'), + row=1, col=2 + ) + + # Promotion rates by category + promo_rates = df.groupby('category')['promo_flag'].mean() * 100 + fig5.add_trace( + go.Bar(x=promo_rates.index, y=promo_rates.values, name='Promotion Rate %'), + row=2, col=1 + ) + + # Price multiplier distribution + fig5.add_trace( + go.Histogram(x=df['price_multiplier'], nbinsx=50, name='Price Multiplier'), + row=2, col=2 + ) + + fig5.update_layout( + title="5. 
Data Distribution Overview", + height=800, + showlegend=False + ) + fig5.update_xaxes(tickangle=45, row=2, col=1) + fig5.show() + + # Summary Statistics + print("\nSUMMARY STATISTICS:") + print("="*50) + + for category in df['category'].unique(): + cat_data = df[df['category'] == category] + print(f"\n{category}:") + print(f" • Records: {len(cat_data):,}") + print(f" • Avg Price: ${cat_data['discounted_price'].mean():.2f}") + print(f" • Price Range: ${cat_data['discounted_price'].min():.2f} - ${cat_data['discounted_price'].max():.2f}") + print(f" • Promotion Rate: {cat_data['promo_flag'].mean()*100:.1f}%") + print(f" • Avg Discount: {cat_data['discount_pct'].mean()*100:.1f}%") + print(f" • Avg Price Multiplier: {cat_data['price_multiplier'].mean():.2f}x") + +def analyze_time_series_decomposition(df, category='Meat & Seafood'): + """Perform time series decomposition for a specific category""" + + print(f"\nTime Series Decomposition Analysis for {category}") + + # Aggregate data by date for the category + ts_data = df[df['category'] == category].groupby('date')['discounted_price'].mean() + ts_data = ts_data.asfreq('W', method='ffill') + + if len(ts_data) >= 52: # Need at least 1 year + try: + decomposition = seasonal_decompose(ts_data, model='multiplicative', period=52) + + fig = make_subplots( + rows=4, cols=1, + subplot_titles=('Original', 'Trend', 'Seasonal', 'Residual'), + vertical_spacing=0.08 + ) + + fig.add_trace(go.Scatter(x=ts_data.index, y=ts_data.values, + mode='lines', name='Original'), row=1, col=1) + fig.add_trace(go.Scatter(x=decomposition.trend.index, y=decomposition.trend.values, + mode='lines', name='Trend'), row=2, col=1) + fig.add_trace(go.Scatter(x=decomposition.seasonal.index, y=decomposition.seasonal.values, + mode='lines', name='Seasonal'), row=3, col=1) + fig.add_trace(go.Scatter(x=decomposition.resid.index, y=decomposition.resid.values, + mode='lines', name='Residual'), row=4, col=1) + + fig.update_layout( + title=f'Time Series Decomposition - {category}', + height=1000, + showlegend=False + ) + fig.show() + + print(f"Decomposition completed for {category}") + return decomposition + + except Exception as e: + print(f"Decomposition failed: {e}") + return None + else: + print(f"Insufficient data for decomposition ({len(ts_data)} weeks)") + return None + +# %% +# MAIN EXECUTION FUNCTIONS +# ======================== + +def run_complete_analysis(): + """Run the complete enhanced historical data generation and analysis""" + + print("Starting Enhanced Historical Data Generation") + print("="*60) + + # Generate enhanced historical data + historical_df = generate_enhanced_historical_data() + + # Save to CSV + output_file = 'enhanced_historical_data.csv' + historical_df.to_csv(output_file, index=False) + print(f"Saved to {output_file}") + + # Create essential EDA plots + print("\nCreating Essential EDA Plots...") + create_essential_eda(historical_df) + + # Event analysis summary + event_records = historical_df[historical_df['events_active'] != 'None'] + if len(event_records) > 0: + print(f"\nEvent Impact Summary:") + print(f" • {len(event_records):,} records affected by events ({len(event_records)/len(historical_df)*100:.1f}%)") + + # Count events + all_events = [] + for events in event_records['events_active']: + all_events.extend([e.split(':')[0].strip() for e in events.split(';')]) + + from collections import Counter + event_counts = Counter(all_events) + print("\nTop 10 Most Frequent Events:") + for event, count in event_counts.most_common(10): + print(f" • {event}: 
{count:,} occurrences") + + print(f"\nAnalysis Complete!") + print(f"Generated {len(historical_df):,} records for ARIMA/LSTM modeling") + print(f"Date range: {historical_df['date'].min().date()} to {historical_df['date'].max().date()}") + + return historical_df + +def run_category_based_generation(): + """Alternative approach: Generate data by category""" + + # Load data + df = pd.read_csv('coles_with_discounts.csv') + df.columns = df.columns.str.strip() + + # Load discounts dict + discounts_df = pd.read_csv('/content/fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + global discounts_dict + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Current date and historical dates + current_date = datetime(2025, 8, 19) + dates = pd.date_range(end=current_date, periods=104, freq='W') + + # Unique categories from dataset + unique_categories = df['category'].unique() + + # Process categories + all_category_data = [] + for cat in unique_categories: + cat_df = df[df['category'] == cat] + if not cat_df.empty: + synth_cat = generate_synthetic_for_category(cat_df, dates) + synth_cat.to_csv(f'synthetic_{cat.replace(" ", "_").replace("&", "and")}.csv', index=False) + all_category_data.append(synth_cat) + print(f"Generated for {cat}: {len(synth_cat)} rows") + + # Combine all categories + full_synth = pd.concat(all_category_data, ignore_index=True) + full_synth.to_csv('synthetic_historical_data_v6.csv', index=False) + + print(f"Complete dataset saved: {len(full_synth)} records") + return full_synth + +# %% +# DATA INSPECTION AND EDA OUTPUT CELL +# =================================== + +def inspect_and_analyze_data(): + """ + Complete data inspection and EDA output to see how the data is performing + This function will show you everything about your dataset + """ + + print("=" * 80) + print("GROCERY PRICE ANALYSIS - COMPLETE DATA INSPECTION") + print("=" * 80) + + try: + # Try to load existing enhanced data first + print("\n1. LOADING DATA...") + try: + df = pd.read_csv('enhanced_historical_data.csv') + df['date'] = pd.to_datetime(df['date']) + print("✓ Loaded existing enhanced historical data") + data_source = "Enhanced Historical Data" + except FileNotFoundError: + try: + df = pd.read_csv('synthetic_historical_data_v6.csv') + df['date'] = pd.to_datetime(df['date']) + print("✓ Loaded existing synthetic data") + data_source = "Synthetic Historical Data" + except FileNotFoundError: + print("⚠ No existing data found. Generating new data...") + df = run_complete_analysis() + data_source = "Newly Generated Data" + + print(f"Data Source: {data_source}") + print(f"Data Shape: {df.shape[0]:,} rows × {df.shape[1]} columns") + + # 2. DATA QUALITY ASSESSMENT + print("\n" + "=" * 80) + print("2. 
DATA QUALITY ASSESSMENT") + print("=" * 80) + + print(f"\nDATE RANGE:") + print(f" • Start Date: {df['date'].min().strftime('%Y-%m-%d')}") + print(f" • End Date: {df['date'].max().strftime('%Y-%m-%d')}") + print(f" • Total Weeks: {df['date'].nunique()}") + print(f" • Date Coverage: {(df['date'].max() - df['date'].min()).days} days") + + print(f"\nMISSING VALUES:") + missing = df.isnull().sum() + missing_pct = (missing / len(df) * 100).round(2) + for col in df.columns: + if missing[col] > 0: + print(f" • {col}: {missing[col]:,} ({missing_pct[col]}%)") + + print(f"\nDATA TYPES:") + for col, dtype in df.dtypes.items(): + unique_count = df[col].nunique() + print(f" • {col}: {dtype} ({unique_count:,} unique values)") + + print(f"\nCATEGORY BREAKDOWN:") + category_counts = df['category'].value_counts() + for cat, count in category_counts.items(): + pct = (count / len(df) * 100) + print(f" • {cat}: {count:,} records ({pct:.1f}%)") + + # 3. PRICE ANALYSIS + print("\n" + "=" * 80) + print("3. PRICE ANALYSIS") + print("=" * 80) + + print(f"\nOVERALL PRICE STATISTICS:") + price_stats = df['discounted_price'].describe() + print(f" • Mean Price: ${price_stats['mean']:.2f}") + print(f" • Median Price: ${price_stats['50%']:.2f}") + print(f" • Price Range: ${price_stats['min']:.2f} - ${price_stats['max']:.2f}") + print(f" • Standard Deviation: ${price_stats['std']:.2f}") + print(f" • 25th Percentile: ${price_stats['25%']:.2f}") + print(f" • 75th Percentile: ${price_stats['75%']:.2f}") + + print(f"\nPRICE BY CATEGORY:") + for category in df['category'].unique(): + cat_data = df[df['category'] == category]['discounted_price'] + print(f" • {category}:") + print(f" - Mean: ${cat_data.mean():.2f}") + print(f" - Median: ${cat_data.median():.2f}") + print(f" - Range: ${cat_data.min():.2f} - ${cat_data.max():.2f}") + + # 4. PROMOTION AND DISCOUNT ANALYSIS + print("\n" + "=" * 80) + print("4. PROMOTION AND DISCOUNT ANALYSIS") + print("=" * 80) + + overall_promo_rate = df['promo_flag'].mean() * 100 + print(f"\nOVERALL PROMOTION RATE: {overall_promo_rate:.1f}%") + + promoted_items = df[df['promo_flag'] == 1] + if len(promoted_items) > 0: + avg_discount = promoted_items['discount_pct'].mean() * 100 + print(f"AVERAGE DISCOUNT (when promoted): {avg_discount:.1f}%") + + print(f"\nPROMOTION RATES BY CATEGORY:") + for category in df['category'].unique(): + cat_promo_rate = df[df['category'] == category]['promo_flag'].mean() * 100 + cat_avg_discount = df[(df['category'] == category) & (df['promo_flag'] == 1)]['discount_pct'].mean() * 100 + print(f" • {category}: {cat_promo_rate:.1f}% promo rate, {cat_avg_discount:.1f}% avg discount") + + # 5. EVENT IMPACT ANALYSIS + print("\n" + "=" * 80) + print("5. 
EVENT IMPACT ANALYSIS") + print("=" * 80) + + if 'events_active' in df.columns: + event_records = df[df['events_active'] != 'None'] + event_impact_rate = len(event_records) / len(df) * 100 + print(f"\nEVENT IMPACT RATE: {event_impact_rate:.1f}% of records affected by events") + + if len(event_records) > 0: + # Event frequency analysis + all_events = [] + for events in event_records['events_active']: + if pd.notna(events) and events != 'None': + all_events.extend([e.split(':')[0].strip() for e in str(events).split(';')]) + + if all_events: + from collections import Counter + event_counts = Counter(all_events) + print(f"\nTOP 10 MOST FREQUENT EVENTS:") + for event, count in event_counts.most_common(10): + pct = (count / len(df) * 100) + print(f" • {event}: {count:,} occurrences ({pct:.2f}%)") + + # Price impact of events + normal_prices = df[df['events_active'] == 'None']['discounted_price'].mean() + event_prices = event_records['discounted_price'].mean() + price_impact = ((event_prices - normal_prices) / normal_prices * 100) + print(f"\nPRICE IMPACT OF EVENTS:") + print(f" • Normal periods avg price: ${normal_prices:.2f}") + print(f" • Event periods avg price: ${event_prices:.2f}") + print(f" • Price increase during events: {price_impact:.1f}%") + + # 6. TEMPORAL PATTERNS + print("\n" + "=" * 80) + print("6. TEMPORAL PATTERNS") + print("=" * 80) + + df['month'] = df['date'].dt.month + df['year'] = df['date'].dt.year + df['quarter'] = df['date'].dt.quarter + + print(f"\nMONTHLY PRICE PATTERNS:") + monthly_prices = df.groupby('month')['discounted_price'].mean() + month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', + 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + for month, price in monthly_prices.items(): + print(f" • {month_names[month-1]}: ${price:.2f}") + + print(f"\nQUARTERLY TRENDS:") + quarterly_prices = df.groupby('quarter')['discounted_price'].mean() + for quarter, price in quarterly_prices.items(): + print(f" • Q{quarter}: ${price:.2f}") + + # 7. CREATE ALL VISUALIZATIONS + print("\n" + "=" * 80) + print("7. CREATING VISUALIZATIONS") + print("=" * 80) + + create_essential_eda(df) + + # 8. ADDITIONAL INSIGHTS + print("\n" + "=" * 80) + print("8. KEY INSIGHTS AND RECOMMENDATIONS") + print("=" * 80) + + # Price volatility analysis + price_volatility = df.groupby('category')['discounted_price'].std().sort_values(ascending=False) + print(f"\nMOST VOLATILE CATEGORIES (by price std dev):") + for category, volatility in price_volatility.head().items(): + print(f" • {category}: ${volatility:.2f} std dev") + + # Best promotion opportunities + low_promo_categories = df.groupby('category')['promo_flag'].mean().sort_values().head() + print(f"\nCATEGORIES WITH LOWEST PROMOTION RATES (opportunities):") + for category, rate in low_promo_categories.items(): + print(f" • {category}: {rate*100:.1f}% promotion rate") + + # Seasonal opportunities + summer_winter_diff = df[df['month'].isin([12, 1, 2])]['discounted_price'].mean() - df[df['month'].isin([6, 7, 8])]['discounted_price'].mean() + print(f"\nSEASONAL PRICE DIFFERENCE:") + print(f" • Summer vs Winter avg price difference: ${summer_winter_diff:.2f}") + + print("\n" + "=" * 80) + print("9. 
DATA QUALITY SUMMARY") + print("=" * 80) + + # Data completeness score + completeness = (1 - df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100 + print(f"\nDATA COMPLETENESS: {completeness:.1f}%") + + # Data distribution health + price_outliers = len(df[df['discounted_price'] > df['discounted_price'].quantile(0.99)]) + print(f"PRICE OUTLIERS (>99th percentile): {price_outliers:,} records ({price_outliers/len(df)*100:.2f}%)") + + # Time series regularity + date_gaps = df['date'].drop_duplicates().sort_values().diff().dropna() + regular_intervals = (date_gaps == pd.Timedelta(days=7)).mean() * 100 + print(f"TIME SERIES REGULARITY: {regular_intervals:.1f}% regular weekly intervals") + + print(f"\n" + "=" * 80) + print("ANALYSIS COMPLETE!") + print("=" * 80) + print(f"Dataset is ready for ARIMA/LSTM modeling") + print(f"Key features available: prices, promotions, events, seasonality") + print(f"Recommended next steps:") + print(f" 1. Use price_multiplier and events_active as external regressors") + print(f" 2. Consider category-specific models due to different volatility patterns") + print(f" 3. Leverage seasonal decomposition for feature engineering") + + return df + + except Exception as e: + print(f" Error during analysis: {str(e)}") + import traceback + traceback.print_exc() + return None + +# %% +# RUN COMPLETE INSPECTION AND ANALYSIS +# ==================================== + +# Execute this cell to see all data performance metrics and visualizations +df_analyzed = inspect_and_analyze_data() + +# If you want to run time series decomposition on specific categories: +if df_analyzed is not None and len(df_analyzed) > 0: + print("\n" + "="*80) + print("BONUS: TIME SERIES DECOMPOSITION") + print("="*80) + + # Run decomposition for top 3 categories + top_categories = df_analyzed['category'].value_counts().head(3).index + + for category in top_categories: + print(f"\nAnalyzing {category}...") + decomp_result = analyze_time_series_decomposition(df_analyzed, category) + if decomp_result is not None: + print(f"✓ Decomposition completed for {category}") + else: + print(f"⚠ Could not decompose {category} - insufficient data") + +print("\n🎉 Complete analysis finished! All outputs are displayed above.") + +# %% +# EXECUTION INSTRUCTIONS +# ====================== + +print("Code loaded and ready to run!") +print("\nMain execution cell above will show you:") +print(" • Complete data quality assessment") +print(" • Price analysis by category") +print(" • Promotion and discount patterns") +print(" • Event impact analysis") +print(" • Temporal patterns and seasonality") +print(" • All EDA visualizations") +print(" • Key insights and recommendations") +print("\nAlternative execution options:") +print("1. historical_data = run_complete_analysis()") +print("2. category_data = run_category_based_generation()") +print("3. 
initial_data = generate_initial_discounts()") + +# Enhanced Grocery Price Analysis with Seasonal and Event-Based Modeling +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.17.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 + +# %% +import pandas as pd +import numpy as np +import re +from datetime import datetime, timedelta +import matplotlib.pyplot as plt +import seaborn as sns +import plotly.express as px +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from statsmodels.tsa.seasonal import seasonal_decompose +import warnings +warnings.filterwarnings('ignore') + +# %% +# CONFIGURATION AND DATA STRUCTURES +# ================================= + +# Function to parse weights to grams +def parse_to_grams(s): + """Convert weight strings to grams for standardization""" + if pd.isna(s): + return None + + s = str(s).strip().lower() + match = re.search(r'(\d+\.?\d*)\s*([a-zA-Z]+)', s) + + if not match: + if 'pack' in s or 'each' in s: + return 1 + return None + + num_str = match.group(1) + unit = match.group(2).lower() + + try: + num = float(num_str) + except ValueError: + if 'pack' in s or 'each' in s: + return 1 + return None + + # Unit conversions + if unit in ['g', 'gram', 'grams']: + return num + elif unit in ['kg', 'kilogram', 'kilograms']: + return num * 1000 + elif unit in ['ml', 'milliliter', 'milliliters']: + return num # assume 1g/ml + elif unit in ['l', 'liter', 'liters']: + return num * 1000 + elif unit in ['pack', 'each']: + return num + else: + return None + +# %% +# HOLIDAY AND EVENT DEFINITIONS +# ============================ + +# Major holidays affecting grocery pricing +holidays = { + 'New Year': [ + datetime(2023, 1, 1), datetime(2024, 1, 1), datetime(2025, 1, 1) + ], + 'Australia Day': [ + datetime(2023, 1, 26), datetime(2024, 1, 26), datetime(2025, 1, 26) + ], + 'Easter': [ + datetime(2023, 4, 7), datetime(2023, 4, 10), + datetime(2024, 3, 29), datetime(2024, 4, 1), + datetime(2025, 4, 18), datetime(2025, 4, 21) + ], + 'Anzac Day': [ + datetime(2023, 4, 25), datetime(2024, 4, 25), datetime(2025, 4, 25) + ], + 'Christmas': [ + datetime(2023, 12, 25), datetime(2023, 12, 26), + datetime(2024, 12, 25), datetime(2024, 12, 26), + datetime(2025, 12, 25), datetime(2025, 12, 26) + ], + 'Chinese New Year': [ + datetime(2023, 2, 22), datetime(2024, 2, 10), datetime(2025, 1, 29) + ], + 'Mother Day': [ + datetime(2023, 5, 14), datetime(2024, 5, 12), datetime(2025, 5, 11) + ], + 'Father Day': [ + datetime(2023, 9, 3), datetime(2024, 9, 1), datetime(2025, 9, 7) + ], + 'Labour Day': [ + datetime(2023, 10, 2), datetime(2024, 10, 7), datetime(2025, 10, 6) + ], + 'King Birthday': [ + datetime(2023, 6, 12), datetime(2024, 6, 10), datetime(2025, 6, 9) + ], + 'Melbourne Cup': [ + datetime(2023, 11, 7), datetime(2024, 11, 5), datetime(2025, 11, 4) + ], + 'Diwali': [ + datetime(2023, 11, 12), datetime(2024, 11, 1), datetime(2025, 10, 20) + ], + 'Halloween': [ + datetime(2023, 10, 31), datetime(2024, 10, 31), datetime(2025, 10, 31) + ], + 'Valentine Day': [ + datetime(2023, 2, 14), datetime(2024, 2, 14), datetime(2025, 2, 14) + ], + 'Back to School': [ + datetime(2023, 1, 30), datetime(2023, 7, 24), + datetime(2024, 1, 29), datetime(2024, 7, 22), + datetime(2025, 1, 27), datetime(2025, 7, 21) + ], + 'School Holidays': [ + datetime(2023, 4, 10), datetime(2023, 7, 1), datetime(2023, 9, 25), datetime(2023, 12, 18), + datetime(2024, 3, 28), 
datetime(2024, 7, 6), datetime(2024, 9, 23), datetime(2024, 12, 16), + datetime(2025, 4, 14), datetime(2025, 7, 5), datetime(2025, 9, 22), datetime(2025, 12, 15) + ] +} + +# Supply chain disruption events +supply_chain_events = { + 'Suez Canal': [datetime(2023, 3, 25), datetime(2023, 3, 29)], + 'Shipping Delays': [datetime(2023, 8, 15), datetime(2023, 8, 25)], + 'Port Strikes': [datetime(2024, 2, 10), datetime(2024, 2, 20)], + 'Fuel Price Spike': [datetime(2024, 9, 1), datetime(2024, 9, 15)], + 'Container Shortage': [datetime(2023, 11, 1), datetime(2023, 11, 30)], + 'COVID Lockdown': [datetime(2023, 5, 1), datetime(2023, 5, 14)], + 'Truck Driver Strike': [datetime(2024, 6, 15), datetime(2024, 6, 25)], + 'Factory Fire': [datetime(2024, 11, 10), datetime(2024, 11, 20)] +} + +# Weather events affecting agriculture and supply +weather_events = { + 'Flood Queensland': [datetime(2023, 2, 15), datetime(2023, 3, 15)], + 'Drought NSW': [datetime(2023, 6, 1), datetime(2023, 8, 31)], + 'Cyclone WA': [datetime(2024, 1, 20), datetime(2024, 2, 5)], + 'Heatwave Victoria': [datetime(2024, 12, 15), datetime(2025, 1, 15)], + 'Frost Tasmania': [datetime(2023, 9, 1), datetime(2023, 9, 30)], + 'Bushfire NSW': [datetime(2024, 10, 1), datetime(2024, 10, 31)], + 'Heavy Rain Melbourne': [datetime(2024, 3, 10), datetime(2024, 3, 25)], + 'Extreme Heat Adelaide': [datetime(2025, 2, 1), datetime(2025, 2, 14)] +} + +# Disease/pest outbreaks +disease_events = { + 'Avian Flu': [datetime(2023, 7, 1), datetime(2023, 9, 30)], + 'Foot and Mouth Scare': [datetime(2024, 4, 1), datetime(2024, 4, 30)], + 'White Spot Prawns': [datetime(2023, 10, 15), datetime(2023, 11, 15)], + 'Banana Disease': [datetime(2024, 8, 1), datetime(2024, 9, 15)], + 'Citrus Canker': [datetime(2025, 3, 1), datetime(2025, 4, 15)] +} + +# %% +# PRICING MULTIPLIERS BY CATEGORY +# =============================== + +# FIXED: Realistic category multipliers (reduced by 50-70%) +category_multipliers = { + 'Meat & Seafood': { + 'Christmas': 1.3, 'Easter': 1.2, 'Summer': 1.1, 'Winter': 0.95, + 'Avian Flu': 1.25, 'Foot and Mouth Scare': 1.15, 'White Spot Prawns': 1.2, + 'Drought NSW': 1.1, 'Flood Queensland': 1.08, + 'supply_chain_base': 1.15, 'weather_base': 1.1, 'disease_base': 1.2, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Fruit & Vegetables': { + 'Summer': 0.85, 'Winter': 1.2, 'Christmas': 1.1, 'Chinese New Year': 1.15, + 'Flood Queensland': 1.4, 'Drought NSW': 1.3, 'Cyclone WA': 1.25, + 'Heatwave Victoria': 1.15, 'Frost Tasmania': 1.2, 'Heavy Rain Melbourne': 1.1, + 'Banana Disease': 1.5, 'Citrus Canker': 1.4, + 'supply_chain_base': 1.08, 'weather_base': 1.3, 'disease_base': 1.4, + 'shock_prob': 0.12, 'shock_var': 0.25 + }, + 'Dairy, Eggs & Fridge': { + 'Christmas': 1.15, 'Easter': 1.1, 'Winter': 1.05, 'Back to School': 1.08, + 'Avian Flu': 1.3, 'Drought NSW': 1.15, 'Extreme Heat Adelaide': 1.1, + 'supply_chain_base': 1.1, 'weather_base': 1.08, 'disease_base': 1.25, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Bakery': { + 'Christmas': 1.2, 'Easter': 1.1, 'School Holidays': 1.08, 'Winter': 1.03, + 'supply_chain_base': 1.05, 'weather_base': 1.03, + 'shock_prob': 0.05, 'shock_var': 0.08 + }, + 'Pantry': { + 'COVID Lockdown': 1.25, 'School Holidays': 1.1, 'Back to School': 1.15, + 'Container Shortage': 1.1, 'Shipping Delays': 1.08, + 'supply_chain_base': 1.08, 'weather_base': 1.03, + 'shock_prob': 0.05, 'shock_var': 0.1 + }, + 'Health & Beauty': { + 'New Year': 1.25, 'Valentine Day': 1.1, 'Winter': 1.08, + 'supply_chain_base': 1.03, 'shock_prob': 0.03, 
'shock_var': 0.05 + }, + 'Drinks': { + 'Summer': 1.3, 'Christmas': 1.35, 'Australia Day': 1.15, 'Melbourne Cup': 1.1, + 'Heatwave Victoria': 1.2, 'Extreme Heat Adelaide': 1.18, + 'supply_chain_base': 1.08, 'weather_base': 1.1, + 'shock_prob': 0.05, 'shock_var': 0.1 + }, + 'Frozen': { + 'Summer': 1.5, 'Heatwave Victoria': 1.3, 'Extreme Heat Adelaide': 1.25, + 'Christmas': 1.3, 'School Holidays': 1.2, + 'supply_chain_base': 1.1, 'weather_base': 1.25, + 'shock_prob': 0.08, 'shock_var': 0.15 + }, + 'Deli': { + 'Christmas': 1.4, 'Easter': 1.25, 'Melbourne Cup': 1.18, 'King Birthday': 1.1, + 'supply_chain_base': 1.08, 'shock_prob': 0.06, 'shock_var': 0.12 + }, + 'Household': { + 'Back to School': 1.15, 'Spring': 1.1, 'COVID Lockdown': 1.2, + 'supply_chain_base': 1.05, 'shock_prob': 0.04, 'shock_var': 0.08 + } +} + +# %% +# SUBCATEGORY-SPECIFIC SEASONAL EFFECTS +# ==================================== + +# FIXED: Realistic subcategory seasonal effects (reduced) +subcat_seasonal_effects = { + 'Fruit': {'Summer': 0.7, 'Winter': 1.4}, # Reduced from 0.4/2.5 + 'Vegetables (Leafy/Salad)': {'Summer': 1.15, 'Winter': 0.95}, # Reduced from 1.4/0.9 + 'Vegetables (Root/Onion/Garlic)': {'Winter': 0.9, 'Summer': 1.08}, # Reduced from 0.8/1.2 + 'Vegetables (Fruiting)': {'Summer': 0.8, 'Winter': 1.2}, # Reduced from 0.6/1.5 + 'Lamb': {'Easter': 1.4, 'Christmas': 1.25}, # Reduced from 2.0/1.6 + 'Turkey': {'Christmas': 1.8, 'Easter': 1.05}, # Reduced from 3.0/1.2 + 'Fish': {'Christmas': 1.5, 'Easter': 1.3, 'Summer': 1.1}, # Reduced from 2.4/1.8/1.3 + 'Prawns': {'Christmas': 2.0, 'Chinese New Year': 1.6}, # Reduced from 3.5/2.2 + 'Ice Cream': {'Summer': 2.2, 'Heatwave Victoria': 1.8, 'Winter': 0.5}, # Reduced from 4.0/3.0/0.3 + 'Frozen Vegetables': {'Winter': 1.1, 'COVID Lockdown': 1.2} # Reduced from 1.3/1.5 +} + +# %% +# SUBCATEGORY PROMOTION PROBABILITIES +# ================================== + +# Dictionary of subcat apply probabilities +subcat_apply_probs = { + 'Pork': 0.4, 'Beef': 0.4, 'Chicken': 0.4, 'Prawns': 0.35, 'Pantry/Other': 0.15, + 'Lamb': 0.35, 'Mixed Meat': 0.3, 'Salmon': 0.35, 'Fish': 0.35, 'Turkey': 0.3, + 'Tuna': 0.3, 'Kangaroo': 0.3, 'Seafood': 0.35, 'Plant-Based': 0.25, 'Veal': 0.3, + 'Duck': 0.3, 'Trout': 0.35, 'Mussels': 0.35, 'Venison': 0.3, 'Wallaby': 0.3, + 'Crab': 0.35, 'Fruit': 0.25, 'Other Items (F&V Section)': 0.2, + 'Vegetables (Fruiting)': 0.25, 'Vegetables (Root/Onion/Garlic)': 0.2, + 'Vegetables (Stem/Flower/Pod)': 0.2, 'Vegetables (Leafy/Salad)': 0.25, + 'Mushrooms': 0.25, 'Herbs/Sprouts': 0.2, 'Value-Added Produce': 0.2, + 'Nuts/Seeds/Dried Fruit': 0.25, 'Yoghurt Specialty': 0.25, 'Butter Standard': 0.2, + 'Milk Specialty': 0.2, 'Cheese Standard': 0.2, 'Cheese Specialty': 0.25, + 'Butter Specialty': 0.2, 'Yoghurt Standard': 0.2, 'Outsider': 0.15, + 'Cream Standard': 0.2, 'Milk Standard': 0.15, 'Eggs Standard': 0.2, + 'Bread Loaves': 0.15, 'Wraps & Flatbreads': 0.2, 'Cakes & Slices': 0.3, + 'Rolls & Buns': 0.2, 'Savoury Bakery Items': 0.25, 'Sourdough & Artisan Breads': 0.25, + 'Sweet Pastries & Donuts': 0.3, 'Biscuits & Cookies': 0.3, 'Pancakes, Waffles & Crepes': 0.25, + 'Muffins & Cupcakes': 0.3, 'Seafood (Processed/Cooked)': 0.35, 'Bacon': 0.35, + 'Ham': 0.35, 'Platters/Kits': 0.3, 'Chicken (Processed/Cooked)': 0.35, 'Pantry': 0.15, + 'Salami/Pepperoni/Chorizo': 0.25, 'Crackers/Breadsticks': 0.25, 'Antipasto/Olives/Pickles': 0.25, + 'Turkey (Processed/Cooked)': 0.3, 'Beef (Processed/Cooked)': 0.35, 'Frankfurts/Sausages': 0.3, + 'Pork (Processed/Cooked)': 0.35, 'Cheese': 0.2, 
'Bakery': 0.25, 'Confectionery': 0.3, + 'Other Deli': 0.2, 'Prepared Meals': 0.25, 'Dips/Pate': 0.25, 'Snacks (Sweet)': 0.3, + 'Canned Goods': 0.15, 'Meal Kits/Bases/Instant Meals': 0.25, 'Breakfast Cereals': 0.25, + 'Pasta/Rice/Noodles/Grains': 0.15, 'Snacks (Savoury)': 0.3, 'Beverages (Shelf-Stable)': 0.25, + 'Spreads/Oils/Condiments': 0.2, 'Baking Mixes': 0.2, 'Baking Ingredients': 0.2, + 'Other Pantry Items': 0.15, 'Juice/Smoothie': 0.25, 'Functional/Health Drink': 0.25, + 'Other Drinks': 0.2, 'Soft Drink/Mixer': 0.3, 'Water': 0.15, 'Milk': 0.15, + 'Non-Drink Item': 0.15, 'Tea': 0.2, 'Coffee': 0.25, 'Alcoholic Beverages (Low/No Alc)': 0.3, + 'Frozen Chips': 0.3, 'Ice Cream': 0.3, 'Frozen Desserts': 0.3, 'Frozen Meat': 0.3, + 'Frozen Poultry': 0.3, 'Frozen Fruits': 0.25, 'Frozen Vegetables': 0.25, + 'Frozen Pastry': 0.25, 'Frozen Meals': 0.25, 'Frozen Seafood': 0.3, 'Other Frozen': 0.2, + 'Stationery': 0.1, 'Dishwashing': 0.15, 'Bags': 0.15, 'Laundry Care': 0.2, + 'Kitchenware & Food Storage': 0.2, 'Paper Products': 0.15, 'Cleaning Solutions & Wipes': 0.2, + 'Cleaning Tools & Accessories': 0.15, 'Home Maintenance & General': 0.15, + 'Air Care & Pest Control': 0.2, 'Vitamins & Supplements': 0.2, 'Skincare': 0.25, + 'Wash Products': 0.2, 'First Aid & Wellness': 0.15, 'Health & Medicines': 0.15, + 'Feminine & Incontinence Care': 0.15, 'Oral Care': 0.2, 'Deodorants & Antiperspirants': 0.25, + 'Hair Care': 0.2, 'Shaving & Hair Removal': 0.2, 'First Aid & Wellness Accessories': 0.15, + 'Deodorants & Body Sprays': 0.25, 'Medicines & Health Treatments': 0.15, + "Shaving & Men's Grooming": 0.2, +} + +# %% +# CORE PRICING AND ANALYSIS FUNCTIONS +# =================================== + +def get_fortnight_col(date): + """Get fortnight column name for discount lookup""" + month_abbr = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + month_idx = date.month - 1 + month_name = month_abbr[month_idx] + half = '01' if date.day <= 15 else '02' + return f"{month_name}-{half}" + +def get_season(date): + """Get season for Southern Hemisphere""" + month = date.month + if month in [12, 1, 2]: + return 'Summer' + elif month in [3, 4, 5]: + return 'Autumn' + elif month in [6, 7, 8]: + return 'Winter' + else: + return 'Spring' + +def is_event_active(date, event_dates, days_impact=7): + """Check if date is within impact period of an event""" + for event_date in event_dates: + if isinstance(event_date, list): + if len(event_date) == 2: + start_date, end_date = event_date + if start_date <= date <= end_date + timedelta(days=days_impact): + return True + else: + if abs((date - event_date).days) <= days_impact: + return True + return False + +def apply_enhanced_factors(date, category, subcat, base_price, base_discount_pct): + """Apply comprehensive pricing factors including all real-world events - FIXED VERSION""" + multipliers = category_multipliers.get(category, {}) + subcat_effects = subcat_seasonal_effects.get(subcat, {}) + + mult = 1.0 + event_description = [] + + # Seasonal effects + season = get_season(date) + if season in multipliers: + seasonal_mult = multipliers[season] + mult *= seasonal_mult + if seasonal_mult != 1.0: + event_description.append(f"{season} effect: {seasonal_mult:.2f}x") + + # Subcat seasonal effects (override category if more specific) + if season in subcat_effects: + subcat_mult = subcat_effects[season] + mult = mult / multipliers.get(season, 1.0) * subcat_mult + event_description.append(f"{subcat} {season}: {subcat_mult:.2f}x") + + # Holiday effects - 
REDUCED PROBABILITY + for holiday, dates in holidays.items(): + if is_event_active(date, dates, days_impact=14): + # Only apply if random chance (reduce from 100% to 40%) + if np.random.rand() < 0.4: + if holiday in multipliers: + holiday_mult = multipliers[holiday] + mult *= holiday_mult + event_description.append(f"{holiday}: {holiday_mult:.2f}x") + elif holiday in subcat_effects: + holiday_mult = subcat_effects[holiday] + mult *= holiday_mult + event_description.append(f"{subcat} {holiday}: {holiday_mult:.2f}x") + + # Supply chain disruptions - REDUCED PROBABILITY + for event, dates in supply_chain_events.items(): + if is_event_active(date, dates, days_impact=21): + # Only apply if random chance (reduce from 100% to 20%) + if np.random.rand() < 0.2: + supply_mult = multipliers.get('supply_chain_base', 1.08) + if event in multipliers: + supply_mult = multipliers[event] + mult *= supply_mult + event_description.append(f"Supply chain ({event}): {supply_mult:.2f}x") + + # Weather events - CATEGORY-SPECIFIC AND REDUCED PROBABILITY + for event, dates in weather_events.items(): + if is_event_active(date, dates, days_impact=30): + # Only apply to relevant categories and reduce probability + weather_categories = ['Fruit & Vegetables', 'Meat & Seafood', 'Dairy, Eggs & Fridge'] + if category in weather_categories and np.random.rand() < 0.25: + weather_mult = multipliers.get('weather_base', 1.1) + if event in multipliers: + weather_mult = multipliers[event] + mult *= weather_mult + event_description.append(f"Weather ({event}): {weather_mult:.2f}x") + + # Disease/pest events - CATEGORY-SPECIFIC AND REDUCED PROBABILITY + for event, dates in disease_events.items(): + if is_event_active(date, dates, days_impact=60): + # Only apply to relevant categories + disease_categories = { + 'Avian Flu': ['Meat & Seafood', 'Dairy, Eggs & Fridge'], + 'Foot and Mouth Scare': ['Meat & Seafood', 'Dairy, Eggs & Fridge'], + 'White Spot Prawns': ['Meat & Seafood'], + 'Banana Disease': ['Fruit & Vegetables'], + 'Citrus Canker': ['Fruit & Vegetables'] + } + relevant_categories = disease_categories.get(event, []) + if category in relevant_categories and np.random.rand() < 0.3: + disease_mult = multipliers.get('disease_base', 1.2) + if event in multipliers: + disease_mult = multipliers[event] + mult *= disease_mult + event_description.append(f"Disease ({event}): {disease_mult:.2f}x") + + # Random market shocks - REDUCED PROBABILITY + shock_prob = multipliers.get('shock_prob', 0.05) + if np.random.rand() < shock_prob: + shock = np.random.normal(0, multipliers.get('shock_var', 0.1)) + shock_mult = 1 + shock + mult *= shock_mult + if abs(shock) > 0.05: # Only log significant shocks + event_description.append(f"Market shock: {shock_mult:.2f}x") + + # Inflation trend (1.5-3% annually) - REDUCED + current_date = datetime(2025, 8, 19) + years_back = (current_date - date).days / 365.25 + inflation_rate = np.random.uniform(0.015, 0.03) # Reduced from 0.02-0.04 + trend_mult = (1 + inflation_rate) ** years_back + mult *= trend_mult + + # Competitor effects (random promotions) - REDUCED PROBABILITY + if np.random.rand() < 0.05: # Reduced from 0.1 + competitor_effect = np.random.uniform(0.92, 0.98) # Reduced impact + mult *= competitor_effect + event_description.append(f"Competitor pressure: {competitor_effect:.2f}x") + + # CRITICAL FIX: Enforce realistic bounds + mult = np.clip(mult, 0.5, 2.5) # Prevent extreme multipliers + + # Calculate final prices + normal_price = base_price * mult + + # CRITICAL FIX: Ensure minimum price + 
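+    # The 0.3 floor below is a heuristic guard: together with the
+    # np.clip(mult, 0.5, 2.5) applied above, it keeps synthetic shelf prices
+    # within a plausible band around the catalogue base price.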
normal_price = max(normal_price, base_price * 0.3) # Never less than 30% of base + + # Discount adjustments with realistic bounds + if mult > 1.2: + discount_multiplier = 1.1 # Reduced from 1.2 + elif mult < 0.9: + discount_multiplier = 0.9 # Reduced from 0.8 + else: + discount_multiplier = 1.0 + + adjusted_discount = base_discount_pct * discount_multiplier + adjusted_discount = max(0, min(0.7, adjusted_discount)) # Max 70% discount + + return normal_price, adjusted_discount, event_description + +# %% +# HISTORICAL DATA GENERATION +# ========================= + +def generate_initial_discounts(): + """Generate initial discount data (first code block functionality)""" + + # Load the main dataset + df = pd.read_csv('/Users/rajpatel/Desktop/coles_new.csv') + df.columns = df.columns.str.strip() + + # Load the fortnightly discounts dataset + discounts_df = pd.read_csv('fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] # Fortnight cols like Jan-01 + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Set the current date + current_date = datetime(2025, 8, 5) + df['date'] = current_date + + # Parse weights to grams + df['grams'] = df['weights'].apply(parse_to_grams) + + # Compute price per 100g + price_col = 'item_price' + if price_col in df.columns: + df['price_per_100g'] = df.apply( + lambda row: (row[price_col] / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + axis=1 + ) + + # Get current fortnight column + fortnight_col = get_fortnight_col(current_date) + + # Initialize discount columns + df['promo_flag'] = 0 + df['discount_pct'] = 0.0 + df['discounted_price'] = df[price_col] + + # Apply discounts per item + for idx, row in df.iterrows(): + subcat = row['subcat'] + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + base_discount_pct = rule.get(fortnight_col, 0) / 100 + apply_prob = subcat_apply_probs.get(subcat, 0.2) + promo_flag = (base_discount_pct > 0) and (np.random.rand() < apply_prob) + + if promo_flag: + discount_pct = base_discount_pct + np.random.normal(0, 0.05) + discount_pct = np.clip(discount_pct, 0, 0.9) + df.at[idx, 'promo_flag'] = 1 + df.at[idx, 'discount_pct'] = discount_pct + df.at[idx, 'discounted_price'] = row[price_col] * (1 - discount_pct) + + # Save to new CSV + df.to_csv('coles_with_discounts.csv', index=False) + print("Initial discount data generated and saved to 'coles_with_discounts.csv'") + + return df + +def generate_enhanced_historical_data(): + """Main function to generate enhanced historical data""" + + # Load the main dataset + df = pd.read_csv('/Users/rajpatel/Desktop/coles_new.csv') + df.columns = df.columns.str.strip() + + # Load the fortnightly discounts dataset + discounts_df = pd.read_csv('fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] # Fortnight cols like Jan-01 + for 
col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Set the current date + current_date = datetime(2025, 8, 19) + + # Generate historical dates (104 weeks = 2 years) + dates = pd.date_range(end=current_date, periods=104, freq='W') + + # Parse weights and calculate price per 100g + df['grams'] = df['weights'].apply(parse_to_grams) + df['price_per_100g'] = df.apply( + lambda row: (row['item_price'] / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + axis=1 + ) + + print(f"Generating enhanced historical data for {len(df)} products over {len(dates)} weeks...") + + all_historical_data = [] + + for idx, (_, row) in enumerate(df.iterrows()): + if idx % 1000 == 0: + print(f"Processing product {idx+1}/{len(df)}") + + subcat = row['subcat'] + category = row['category'] + base_price = row['item_price'] + + # Get discount rule for this subcat + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + apply_prob = subcat_apply_probs.get(subcat, 0.2) + + for date in dates: + # Get base discount for this fortnight + fortnight = get_fortnight_col(date) + base_discount_pct = float(rule.get(fortnight, 0)) / 100 + + # Apply enhanced factors + normal_price, discount_pct, events = apply_enhanced_factors( + date, category, subcat, base_price, base_discount_pct + ) + + # Determine if promotion is active + promo_flag = (base_discount_pct > 0) and (np.random.rand() < apply_prob) + if promo_flag: + discount_pct = discount_pct + np.random.normal(0, 0.05) + discount_pct = np.clip(discount_pct, 0, 0.9) + final_discount = discount_pct + else: + final_discount = 0 + + discounted_price = normal_price * (1 - final_discount) + + # Store the record + record = { + 'date': date, + 'product_code': row['product_code'], + 'category': category, + 'essential_flag': row['essential_flag'], + 'item_name': row['item_name'], + 'subcat': subcat, + 'weights': row['weights'], + 'unit_price': row.get('unit_price', 0), + 'brand_name': row['brand_name'], + 'grams': row['grams'], + 'price_per_100g': (discounted_price / row['grams']) * 100 if pd.notna(row['grams']) and row['grams'] > 0 else None, + 'normal_price': round(normal_price, 2), + 'promo_flag': int(promo_flag), + 'discount_pct': round(final_discount, 4), + 'discounted_price': round(discounted_price, 2), + 'price_multiplier': round(normal_price / base_price, 3), + 'events_active': '; '.join(events) if events else 'None' + } + + all_historical_data.append(record) + + # Convert to DataFrame + historical_df = pd.DataFrame(all_historical_data) + + print(f"Generated {len(historical_df):,} historical records") + return historical_df + +# %% +# SIMPLIFIED CATEGORY-BASED GENERATION (ALTERNATIVE APPROACH) +# ========================================================== + +def generate_synthetic_for_category(category_df, dates): + """Generate synthetic data for a specific category""" + synthetic_dfs = [] + + for _, row in category_df.iterrows(): + subcat = row['subcat'] + category = row['category'] + base_price = row['item_price'] + rule = discounts_dict.get(subcat, discounts_dict.get('Pantry/Other', {})) + apply_prob = subcat_apply_probs.get(subcat, 0.2) + + item_df = pd.DataFrame({'date': dates}) + item_df['product_code'] = row['product_code'] + 
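+        # Static catalogue attributes are repeated on every weekly row; only the
+        # price, promo and discount columns vary over time.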
item_df['item_name'] = row['item_name'] + item_df['brand_name'] = row['brand_name'] + item_df['weights'] = row['weights'] + item_df['grams'] = row['grams'] + item_df['price_per_100g'] = row['price_per_100g'] + item_df['subcat'] = subcat + item_df['category'] = category + + item_df['normal_price'] = np.nan + item_df['discount_pct'] = 0.0 + item_df['promo_flag'] = 0 + item_df['discounted_price'] = np.nan + + for i, date in enumerate(dates): + fortnight = get_fortnight_col(date) + base_discount_pct = float(rule.get(fortnight, 0)) / 100 + normal_price, discount_pct = apply_enhanced_factors(date, category, subcat, base_price, base_discount_pct)[:2] + + promo_flag = (discount_pct > 0) and (np.random.rand() < apply_prob) + item_df.at[i, 'normal_price'] = normal_price + item_df.at[i, 'promo_flag'] = 1 if promo_flag else 0 + item_df.at[i, 'discount_pct'] = discount_pct if promo_flag else 0 + item_df.at[i, 'discounted_price'] = normal_price * (1 - item_df.at[i, 'discount_pct']) + + synthetic_dfs.append(item_df) + return pd.concat(synthetic_dfs, ignore_index=True) + +# %% +# EXPLORATORY DATA ANALYSIS FUNCTIONS +# =================================== + +def create_essential_eda(df): + """Create 5 essential EDA plots""" + + print("Creating Essential EDA Plots...") + + # Setup modern style + sns.set_style("whitegrid") + plt.rcParams['figure.figsize'] = (12, 6) + plt.rcParams['font.size'] = 12 + + # 1. Time Series Analysis - Average Prices by Category + monthly_avg = df.groupby([df['date'].dt.to_period('M'), 'category'])['discounted_price'].mean().reset_index() + monthly_avg['date'] = monthly_avg['date'].dt.to_timestamp() + + fig1 = px.line( + monthly_avg, + x='date', + y='discounted_price', + color='category', + title='1. Average Monthly Prices by Category Over Time', + labels={'discounted_price': 'Average Price (AUD)', 'date': 'Date'} + ) + fig1.update_layout(height=600, hovermode='x unified') + fig1.show() + + # 2. Event Impact Analysis + event_impact = df.copy() + event_impact['has_events'] = event_impact['events_active'] != 'None' + comparison = event_impact.groupby(['category', 'has_events'])['price_multiplier'].mean().reset_index() + comparison['event_status'] = comparison['has_events'].map({True: 'With Events', False: 'Normal'}) + + fig2 = px.bar( + comparison, + x='category', + y='price_multiplier', + color='event_status', + title='2. Price Impact: Normal vs Event Periods', + labels={'price_multiplier': 'Average Price Multiplier'}, + barmode='group' + ) + fig2.update_layout(height=600) + fig2.update_xaxes(tickangle=45) + fig2.show() + + # 3. Seasonal Patterns Heatmap + df['month'] = df['date'].dt.month + seasonal_data = df.groupby(['category', 'month'])['discounted_price'].mean().reset_index() + seasonal_pivot = seasonal_data.pivot(index='category', columns='month', values='discounted_price') + + fig3 = go.Figure(data=go.Heatmap( + z=seasonal_pivot.values, + x=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], + y=seasonal_pivot.index, + colorscale='RdYlBu_r', + text=np.round(seasonal_pivot.values, 2), + texttemplate="%{text}", + textfont={"size": 10}, + hoverongaps=False + )) + fig3.update_layout( + title='3. Seasonal Price Patterns by Category', + height=600, + xaxis_title="Month", + yaxis_title="Category" + ) + fig3.show() + + # 4. 
Price Volatility by Category + volatility = df.groupby(['category', df['date'].dt.to_period('M')])['discounted_price'].agg(['mean', 'std']).reset_index() + volatility['date'] = volatility['date'].dt.to_timestamp() + volatility['cv'] = volatility['std'] / volatility['mean'] * 100 # Coefficient of variation + + fig4 = px.line( + volatility, + x='date', + y='cv', + color='category', + title='4. Price Volatility (Coefficient of Variation) by Category', + labels={'cv': 'Coefficient of Variation (%)', 'date': 'Date'} + ) + fig4.update_layout(height=600) + fig4.show() + + # 5. Distribution Overview + fig5 = make_subplots( + rows=2, cols=2, + subplot_titles=('Price Distribution', 'Discount Distribution', 'Promotion Rate by Category', 'Price Multiplier Distribution'), + specs=[[{'type': 'histogram'}, {'type': 'histogram'}], + [{'type': 'bar'}, {'type': 'histogram'}]] + ) + + # Price distribution + fig5.add_trace( + go.Histogram(x=df['discounted_price'], nbinsx=50, name='Price Distribution'), + row=1, col=1 + ) + + # Discount distribution (only when discount > 0) + discount_data = df[df['discount_pct'] > 0]['discount_pct'] * 100 + fig5.add_trace( + go.Histogram(x=discount_data, nbinsx=30, name='Discount Distribution'), + row=1, col=2 + ) + + # Promotion rates by category + promo_rates = df.groupby('category')['promo_flag'].mean() * 100 + fig5.add_trace( + go.Bar(x=promo_rates.index, y=promo_rates.values, name='Promotion Rate %'), + row=2, col=1 + ) + + # Price multiplier distribution + fig5.add_trace( + go.Histogram(x=df['price_multiplier'], nbinsx=50, name='Price Multiplier'), + row=2, col=2 + ) + + fig5.update_layout( + title="5. Data Distribution Overview", + height=800, + showlegend=False + ) + fig5.update_xaxes(tickangle=45, row=2, col=1) + fig5.show() + + # Summary Statistics + print("\nSUMMARY STATISTICS:") + print("="*50) + + for category in df['category'].unique(): + cat_data = df[df['category'] == category] + print(f"\n{category}:") + print(f" • Records: {len(cat_data):,}") + print(f" • Avg Price: ${cat_data['discounted_price'].mean():.2f}") + print(f" • Price Range: ${cat_data['discounted_price'].min():.2f} - ${cat_data['discounted_price'].max():.2f}") + print(f" • Promotion Rate: {cat_data['promo_flag'].mean()*100:.1f}%") + print(f" • Avg Discount: {cat_data['discount_pct'].mean()*100:.1f}%") + print(f" • Avg Price Multiplier: {cat_data['price_multiplier'].mean():.2f}x") + +def analyze_time_series_decomposition(df, category='Meat & Seafood'): + """Perform time series decomposition for a specific category""" + + print(f"\nTime Series Decomposition Analysis for {category}") + + # Aggregate data by date for the category + ts_data = df[df['category'] == category].groupby('date')['discounted_price'].mean() + ts_data = ts_data.asfreq('W', method='ffill') + + if len(ts_data) >= 52: # Need at least 1 year + try: + decomposition = seasonal_decompose(ts_data, model='multiplicative', period=52) + + fig = make_subplots( + rows=4, cols=1, + subplot_titles=('Original', 'Trend', 'Seasonal', 'Residual'), + vertical_spacing=0.08 + ) + + fig.add_trace(go.Scatter(x=ts_data.index, y=ts_data.values, + mode='lines', name='Original'), row=1, col=1) + fig.add_trace(go.Scatter(x=decomposition.trend.index, y=decomposition.trend.values, + mode='lines', name='Trend'), row=2, col=1) + fig.add_trace(go.Scatter(x=decomposition.seasonal.index, y=decomposition.seasonal.values, + mode='lines', name='Seasonal'), row=3, col=1) + fig.add_trace(go.Scatter(x=decomposition.resid.index, y=decomposition.resid.values, + 
mode='lines', name='Residual'), row=4, col=1) + + fig.update_layout( + title=f'Time Series Decomposition - {category}', + height=1000, + showlegend=False + ) + fig.show() + + print(f"Decomposition completed for {category}") + return decomposition + + except Exception as e: + print(f"Decomposition failed: {e}") + return None + else: + print(f"Insufficient data for decomposition ({len(ts_data)} weeks)") + return None + +# %% +# MAIN EXECUTION FUNCTIONS +# ======================== + +def run_complete_analysis(): + """Run the complete enhanced historical data generation and analysis""" + + print("Starting Enhanced Historical Data Generation") + print("="*60) + + # Generate enhanced historical data + historical_df = generate_enhanced_historical_data() + + # Save to CSV + output_file = 'enhanced_historical_data.csv' + historical_df.to_csv(output_file, index=False) + print(f"Saved to {output_file}") + + # Create essential EDA plots + print("\nCreating Essential EDA Plots...") + create_essential_eda(historical_df) + + # Event analysis summary + event_records = historical_df[historical_df['events_active'] != 'None'] + if len(event_records) > 0: + print(f"\nEvent Impact Summary:") + print(f" • {len(event_records):,} records affected by events ({len(event_records)/len(historical_df)*100:.1f}%)") + + # Count events + all_events = [] + for events in event_records['events_active']: + all_events.extend([e.split(':')[0].strip() for e in events.split(';')]) + + from collections import Counter + event_counts = Counter(all_events) + print("\nTop 10 Most Frequent Events:") + for event, count in event_counts.most_common(10): + print(f" • {event}: {count:,} occurrences") + + print(f"\nAnalysis Complete!") + print(f"Generated {len(historical_df):,} records for ARIMA/LSTM modeling") + print(f"Date range: {historical_df['date'].min().date()} to {historical_df['date'].max().date()}") + + return historical_df + +def run_category_based_generation(): + """Alternative approach: Generate data by category""" + + # Load data + df = pd.read_csv('coles_with_discounts.csv') + df.columns = df.columns.str.strip() + + # Load discounts dict + discounts_df = pd.read_csv('/Users/rajpatel/Desktop/fortnightly_discounts.csv') + discounts_df['Subcategory'] = discounts_df['Subcategory'].str.strip().str.replace('chiken', 'Chicken', regex=False) + + # Clean numeric columns - fix the data type issue + discount_columns = [col for col in discounts_df.columns if '-' in col] + for col in discount_columns: + # Clean any problematic values like "25 25" or other string issues + discounts_df[col] = discounts_df[col].astype(str).str.replace(' ', '', regex=False) + discounts_df[col] = pd.to_numeric(discounts_df[col], errors='coerce').fillna(0) + + global discounts_dict + discounts_dict = discounts_df.set_index('Subcategory').to_dict('index') + + # Current date and historical dates + current_date = datetime(2025, 8, 19) + dates = pd.date_range(end=current_date, periods=104, freq='W') + + # Unique categories from dataset + unique_categories = df['category'].unique() + + # Process categories + all_category_data = [] + for cat in unique_categories: + cat_df = df[df['category'] == cat] + if not cat_df.empty: + synth_cat = generate_synthetic_for_category(cat_df, dates) + synth_cat.to_csv(f'synthetic_{cat.replace(" ", "_").replace("&", "and")}.csv', index=False) + all_category_data.append(synth_cat) + print(f"Generated for {cat}: {len(synth_cat)} rows") + + # Combine all categories + full_synth = pd.concat(all_category_data, ignore_index=True) + 
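+    # Sanity-check sketch (assumption: product_code uniquely identifies an item):
+    # every product should contribute exactly len(dates) weekly rows to the
+    # combined panel, so a mismatch usually means duplicated or dropped rows
+    # during the per-category generation above.
+    rows_per_product = full_synth.groupby('product_code').size()
+    n_uneven = int((rows_per_product != len(dates)).sum())
+    if n_uneven:
+        print(f"⚠ {n_uneven} products do not have {len(dates)} weekly rows")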
full_synth.to_csv('synthetic_historical_data_v6.csv', index=False) + + print(f"Complete dataset saved: {len(full_synth)} records") + return full_synth + +# %% +# DATA INSPECTION AND EDA OUTPUT CELL +# =================================== + +def inspect_and_analyze_data(): + """ + Complete data inspection and EDA output to see how the data is performing + This function will show you everything about your dataset + """ + + print("=" * 80) + print("GROCERY PRICE ANALYSIS - COMPLETE DATA INSPECTION") + print("=" * 80) + + try: + # Try to load existing enhanced data first + print("\n1. LOADING DATA...") + try: + df = pd.read_csv('enhanced_historical_data.csv') + df['date'] = pd.to_datetime(df['date']) + print("✓ Loaded existing enhanced historical data") + data_source = "Enhanced Historical Data" + except FileNotFoundError: + try: + df = pd.read_csv('synthetic_historical_data_v6.csv') + df['date'] = pd.to_datetime(df['date']) + print("✓ Loaded existing synthetic data") + data_source = "Synthetic Historical Data" + except FileNotFoundError: + print("⚠ No existing data found. Generating new data...") + df = run_complete_analysis() + data_source = "Newly Generated Data" + + print(f"Data Source: {data_source}") + print(f"Data Shape: {df.shape[0]:,} rows × {df.shape[1]} columns") + + # 2. DATA QUALITY ASSESSMENT + print("\n" + "=" * 80) + print("2. DATA QUALITY ASSESSMENT") + print("=" * 80) + + print(f"\nDATE RANGE:") + print(f" • Start Date: {df['date'].min().strftime('%Y-%m-%d')}") + print(f" • End Date: {df['date'].max().strftime('%Y-%m-%d')}") + print(f" • Total Weeks: {df['date'].nunique()}") + print(f" • Date Coverage: {(df['date'].max() - df['date'].min()).days} days") + + print(f"\nMISSING VALUES:") + missing = df.isnull().sum() + missing_pct = (missing / len(df) * 100).round(2) + for col in df.columns: + if missing[col] > 0: + print(f" • {col}: {missing[col]:,} ({missing_pct[col]}%)") + + print(f"\nDATA TYPES:") + for col, dtype in df.dtypes.items(): + unique_count = df[col].nunique() + print(f" • {col}: {dtype} ({unique_count:,} unique values)") + + print(f"\nCATEGORY BREAKDOWN:") + category_counts = df['category'].value_counts() + for cat, count in category_counts.items(): + pct = (count / len(df) * 100) + print(f" • {cat}: {count:,} records ({pct:.1f}%)") + + # 3. PRICE ANALYSIS + print("\n" + "=" * 80) + print("3. PRICE ANALYSIS") + print("=" * 80) + + print(f"\nOVERALL PRICE STATISTICS:") + price_stats = df['discounted_price'].describe() + print(f" • Mean Price: ${price_stats['mean']:.2f}") + print(f" • Median Price: ${price_stats['50%']:.2f}") + print(f" • Price Range: ${price_stats['min']:.2f} - ${price_stats['max']:.2f}") + print(f" • Standard Deviation: ${price_stats['std']:.2f}") + print(f" • 25th Percentile: ${price_stats['25%']:.2f}") + print(f" • 75th Percentile: ${price_stats['75%']:.2f}") + + print(f"\nPRICE BY CATEGORY:") + for category in df['category'].unique(): + cat_data = df[df['category'] == category]['discounted_price'] + print(f" • {category}:") + print(f" - Mean: ${cat_data.mean():.2f}") + print(f" - Median: ${cat_data.median():.2f}") + print(f" - Range: ${cat_data.min():.2f} - ${cat_data.max():.2f}") + + # 4. PROMOTION AND DISCOUNT ANALYSIS + print("\n" + "=" * 80) + print("4. 
PROMOTION AND DISCOUNT ANALYSIS") + print("=" * 80) + + overall_promo_rate = df['promo_flag'].mean() * 100 + print(f"\nOVERALL PROMOTION RATE: {overall_promo_rate:.1f}%") + + promoted_items = df[df['promo_flag'] == 1] + if len(promoted_items) > 0: + avg_discount = promoted_items['discount_pct'].mean() * 100 + print(f"AVERAGE DISCOUNT (when promoted): {avg_discount:.1f}%") + + print(f"\nPROMOTION RATES BY CATEGORY:") + for category in df['category'].unique(): + cat_promo_rate = df[df['category'] == category]['promo_flag'].mean() * 100 + cat_avg_discount = df[(df['category'] == category) & (df['promo_flag'] == 1)]['discount_pct'].mean() * 100 + print(f" • {category}: {cat_promo_rate:.1f}% promo rate, {cat_avg_discount:.1f}% avg discount") + + # 5. EVENT IMPACT ANALYSIS + print("\n" + "=" * 80) + print("5. EVENT IMPACT ANALYSIS") + print("=" * 80) + + if 'events_active' in df.columns: + event_records = df[df['events_active'] != 'None'] + event_impact_rate = len(event_records) / len(df) * 100 + print(f"\nEVENT IMPACT RATE: {event_impact_rate:.1f}% of records affected by events") + + if len(event_records) > 0: + # Event frequency analysis + all_events = [] + for events in event_records['events_active']: + if pd.notna(events) and events != 'None': + all_events.extend([e.split(':')[0].strip() for e in str(events).split(';')]) + + if all_events: + from collections import Counter + event_counts = Counter(all_events) + print(f"\nTOP 10 MOST FREQUENT EVENTS:") + for event, count in event_counts.most_common(10): + pct = (count / len(df) * 100) + print(f" • {event}: {count:,} occurrences ({pct:.2f}%)") + + # Price impact of events + normal_prices = df[df['events_active'] == 'None']['discounted_price'].mean() + event_prices = event_records['discounted_price'].mean() + price_impact = ((event_prices - normal_prices) / normal_prices * 100) + print(f"\nPRICE IMPACT OF EVENTS:") + print(f" • Normal periods avg price: ${normal_prices:.2f}") + print(f" • Event periods avg price: ${event_prices:.2f}") + print(f" • Price increase during events: {price_impact:.1f}%") + + # 6. TEMPORAL PATTERNS + print("\n" + "=" * 80) + print("6. TEMPORAL PATTERNS") + print("=" * 80) + + df['month'] = df['date'].dt.month + df['year'] = df['date'].dt.year + df['quarter'] = df['date'].dt.quarter + + print(f"\nMONTHLY PRICE PATTERNS:") + monthly_prices = df.groupby('month')['discounted_price'].mean() + month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', + 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + for month, price in monthly_prices.items(): + print(f" • {month_names[month-1]}: ${price:.2f}") + + print(f"\nQUARTERLY TRENDS:") + quarterly_prices = df.groupby('quarter')['discounted_price'].mean() + for quarter, price in quarterly_prices.items(): + print(f" • Q{quarter}: ${price:.2f}") + + # 7. CREATE ALL VISUALIZATIONS + print("\n" + "=" * 80) + print("7. CREATING VISUALIZATIONS") + print("=" * 80) + + create_essential_eda(df) + + # 8. ADDITIONAL INSIGHTS + print("\n" + "=" * 80) + print("8. 
KEY INSIGHTS AND RECOMMENDATIONS") + print("=" * 80) + + # Price volatility analysis + price_volatility = df.groupby('category')['discounted_price'].std().sort_values(ascending=False) + print(f"\nMOST VOLATILE CATEGORIES (by price std dev):") + for category, volatility in price_volatility.head().items(): + print(f" • {category}: ${volatility:.2f} std dev") + + # Best promotion opportunities + low_promo_categories = df.groupby('category')['promo_flag'].mean().sort_values().head() + print(f"\nCATEGORIES WITH LOWEST PROMOTION RATES (opportunities):") + for category, rate in low_promo_categories.items(): + print(f" • {category}: {rate*100:.1f}% promotion rate") + + # Seasonal opportunities + summer_winter_diff = df[df['month'].isin([12, 1, 2])]['discounted_price'].mean() - df[df['month'].isin([6, 7, 8])]['discounted_price'].mean() + print(f"\nSEASONAL PRICE DIFFERENCE:") + print(f" • Summer vs Winter avg price difference: ${summer_winter_diff:.2f}") + + print("\n" + "=" * 80) + print("9. DATA QUALITY SUMMARY") + print("=" * 80) + + # Data completeness score + completeness = (1 - df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100 + print(f"\nDATA COMPLETENESS: {completeness:.1f}%") + + # Data distribution health + price_outliers = len(df[df['discounted_price'] > df['discounted_price'].quantile(0.99)]) + print(f"PRICE OUTLIERS (>99th percentile): {price_outliers:,} records ({price_outliers/len(df)*100:.2f}%)") + + # Time series regularity + date_gaps = df['date'].drop_duplicates().sort_values().diff().dropna() + regular_intervals = (date_gaps == pd.Timedelta(days=7)).mean() * 100 + print(f"TIME SERIES REGULARITY: {regular_intervals:.1f}% regular weekly intervals") + + print(f"\n" + "=" * 80) + print("ANALYSIS COMPLETE!") + print("=" * 80) + print(f"Dataset is ready for ARIMA/LSTM modeling") + print(f"Key features available: prices, promotions, events, seasonality") + print(f"Recommended next steps:") + print(f" 1. Use price_multiplier and events_active as external regressors") + print(f" 2. Consider category-specific models due to different volatility patterns") + print(f" 3. Leverage seasonal decomposition for feature engineering") + + return df + + except Exception as e: + print(f" Error during analysis: {str(e)}") + import traceback + traceback.print_exc() + return None + +# %% +# RUN COMPLETE INSPECTION AND ANALYSIS +# ==================================== + +# Execute this cell to see all data performance metrics and visualizations +df_analyzed = inspect_and_analyze_data() + +# If you want to run time series decomposition on specific categories: +if df_analyzed is not None and len(df_analyzed) > 0: + print("\n" + "="*80) + print("BONUS: TIME SERIES DECOMPOSITION") + print("="*80) + + # Run decomposition for top 3 categories + top_categories = df_analyzed['category'].value_counts().head(3).index + + for category in top_categories: + print(f"\nAnalyzing {category}...") + decomp_result = analyze_time_series_decomposition(df_analyzed, category) + if decomp_result is not None: + print(f"✓ Decomposition completed for {category}") + else: + print(f"⚠ Could not decompose {category} - insufficient data") + +print("\n🎉 Complete analysis finished! 
All outputs are displayed above.") + +# %% +# FEATURE ENGINEERING MODULE +# ========================== + +def create_lag_features(df, price_col='discounted_price', product_id_col='product_code'): + """Create lag features for time series modeling""" + + print("Creating lag features...") + df = df.sort_values([product_id_col, 'date']).copy() + + # Create lag features per product + lag_features = ['price_lag_1', 'price_lag_7', 'price_lag_52'] + lag_periods = [1, 7, 52] # 1 week, 7 weeks, 52 weeks (yearly) + + for feature, lag in zip(lag_features, lag_periods): + df[feature] = df.groupby(product_id_col)[price_col].shift(lag) + + # Calculate price changes + df['price_change_1w'] = df[price_col] - df['price_lag_1'] + df['price_change_7w'] = df[price_col] - df['price_lag_7'] + df['price_change_52w'] = df[price_col] - df['price_lag_52'] + + # Calculate percentage changes + df['price_pct_change_1w'] = (df['price_change_1w'] / df['price_lag_1']).fillna(0) + df['price_pct_change_7w'] = (df['price_change_7w'] / df['price_lag_7']).fillna(0) + df['price_pct_change_52w'] = (df['price_change_52w'] / df['price_lag_52']).fillna(0) + + print(f"✓ Created {len(lag_features)} lag features and 6 change features") + return df + +def create_moving_averages(df, price_col='discounted_price', product_id_col='product_code'): + """Create moving average features""" + + print("Creating moving average features...") + df = df.sort_values([product_id_col, 'date']).copy() + + # Define moving average windows + windows = [7, 30, 90] # 7, 30, 90 days (converted to weeks: ~1, 4, 13 weeks) + week_windows = [1, 4, 13] + + for window, week_window in zip(windows, week_windows): + col_name = f'ma_{window}d' + df[col_name] = df.groupby(product_id_col)[price_col].rolling( + window=week_window, min_periods=1 + ).mean().reset_index(0, drop=True) + + # Calculate deviation from moving average + df[f'price_dev_ma_{window}d'] = df[price_col] - df[col_name] + df[f'price_dev_pct_ma_{window}d'] = (df[f'price_dev_ma_{window}d'] / df[col_name]).fillna(0) + + print(f"✓ Created {len(windows)} moving averages and {len(windows)*2} deviation features") + return df + +def create_event_indicators(df, events_col='events_active'): + """Create binary indicators from events_active column""" + + print("Creating event indicator features...") + + # Initialize event indicator columns + event_indicators = { + 'has_seasonal_event': 0, + 'has_holiday_event': 0, + 'has_weather_event': 0, + 'has_supply_chain_event': 0, + 'has_disease_event': 0, + 'has_market_shock': 0, + 'has_competitor_pressure': 0, + 'event_count': 0 + } + + # Add columns + for col in event_indicators.keys(): + df[col] = 0 + + # Process each row + for idx, row in df.iterrows(): + events = str(row[events_col]).lower() + event_count = 0 + + if events != 'none' and events != 'nan': + event_list = [e.strip() for e in events.split(';')] + event_count = len(event_list) + + for event in event_list: + if any(season in event for season in ['summer', 'winter', 'spring', 'autumn']): + df.at[idx, 'has_seasonal_event'] = 1 + elif any(holiday in event for holiday in ['christmas', 'easter', 'new year', 'valentine', 'mother', 'father']): + df.at[idx, 'has_holiday_event'] = 1 + elif 'weather' in event: + df.at[idx, 'has_weather_event'] = 1 + elif 'supply chain' in event: + df.at[idx, 'has_supply_chain_event'] = 1 + elif 'disease' in event: + df.at[idx, 'has_disease_event'] = 1 + elif 'market shock' in event: + df.at[idx, 'has_market_shock'] = 1 + elif 'competitor' in event: + df.at[idx, 'has_competitor_pressure'] 
= 1 + + df.at[idx, 'event_count'] = event_count + + print(f"✓ Created {len(event_indicators)} event indicator features") + return df + +def create_calendar_features(df, date_col='date'): + """Create calendar-based features""" + + print("Creating calendar features...") + + # Basic calendar features + df['year'] = df[date_col].dt.year + df['quarter'] = df[date_col].dt.quarter + df['month'] = df[date_col].dt.month + df['week_of_year'] = df[date_col].dt.isocalendar().week + df['day_of_year'] = df[date_col].dt.dayofyear + df['is_month_start'] = df[date_col].dt.is_month_start.astype(int) + df['is_month_end'] = df[date_col].dt.is_month_end.astype(int) + df['is_quarter_start'] = df[date_col].dt.is_quarter_start.astype(int) + df['is_quarter_end'] = df[date_col].dt.is_quarter_end.astype(int) + + # Season indicators (Southern Hemisphere) + def get_season_indicator(month): + if month in [12, 1, 2]: + return 'summer' + elif month in [3, 4, 5]: + return 'autumn' + elif month in [6, 7, 8]: + return 'winter' + else: + return 'spring' + + df['season'] = df['month'].apply(get_season_indicator) + + # Create season dummies + for season in ['summer', 'autumn', 'winter', 'spring']: + df[f'is_{season}'] = (df['season'] == season).astype(int) + + # Holiday indicators + def is_holiday_period(date): + # Check if date is within 2 weeks of major holidays + month, day = date.month, date.day + + # Christmas/New Year period + if (month == 12 and day >= 15) or (month == 1 and day <= 15): + return 1 + # Easter period (approximate - around April) + elif month == 4 and 1 <= day <= 30: + return 1 + # School holiday periods (approximate) + elif (month == 7 and 1 <= day <= 31) or (month == 1 and 16 <= day <= 31): + return 1 + return 0 + + df['is_holiday_period'] = df[date_col].apply(is_holiday_period) + + # School holiday flags + def is_school_holiday(date): + month, day = date.month, date.day + # Summer holidays: Dec 15 - Jan 31 + if (month == 12 and day >= 15) or (month == 1): + return 1 + # Winter holidays: July + elif month == 7: + return 1 + # Spring holidays: late September/early October + elif month == 9 and day >= 20: + return 1 + elif month == 10 and day <= 10: + return 1 + # Autumn holidays: April + elif month == 4: + return 1 + return 0 + + df['school_holiday_flag'] = df[date_col].apply(is_school_holiday) + + # Days to next major holiday + def days_to_holiday(date): + month, day = date.month, date.day + current_date = date.replace(hour=0, minute=0, second=0, microsecond=0) + + # Define major holidays for current year + year = date.year + holidays = [ + datetime(year, 1, 1), # New Year + datetime(year, 4, 15), # Easter (approximate) + datetime(year, 7, 1), # School holidays + datetime(year, 12, 25), # Christmas + ] + + # Find next holiday + future_holidays = [h for h in holidays if h >= current_date] + if not future_holidays: + # If no holidays left this year, add next year's New Year + future_holidays = [datetime(year + 1, 1, 1)] + + next_holiday = min(future_holidays) + return (next_holiday - current_date).days + + df['days_to_holiday'] = df[date_col].apply(days_to_holiday) + + # Cyclical encoding for month and week + df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12) + df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12) + df['week_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52) + df['week_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52) + + print("✓ Created 25+ calendar-based features") + return df + +def create_volatility_measures(df, price_col='discounted_price', product_id_col='product_code'): + 
"""Calculate price volatility measures per category and product""" + + print("Creating volatility measures...") + + # Sort data + df = df.sort_values([product_id_col, 'date']).copy() + + # Rolling volatility (standard deviation) over different windows + windows = [4, 13, 26] # 1 month, 3 months, 6 months (in weeks) + + for window in windows: + # Rolling standard deviation + df[f'volatility_{window}w'] = df.groupby(product_id_col)[price_col].rolling( + window=window, min_periods=2 + ).std().reset_index(0, drop=True) + + # Coefficient of variation (volatility relative to mean) + rolling_mean = df.groupby(product_id_col)[price_col].rolling( + window=window, min_periods=2 + ).mean().reset_index(0, drop=True) + + df[f'cv_{window}w'] = (df[f'volatility_{window}w'] / rolling_mean).fillna(0) + + # Category-level volatility measures + category_volatility = df.groupby(['category', 'date'])[price_col].agg(['mean', 'std']).reset_index() + category_volatility['category_cv'] = category_volatility['std'] / category_volatility['mean'] + category_volatility = category_volatility[['category', 'date', 'category_cv']].fillna(0) + + # Merge back to main dataframe + df = df.merge(category_volatility, on=['category', 'date'], how='left') + + # Price dispersion within category-date + df['price_rank_in_category'] = df.groupby(['category', 'date'])[price_col].rank(pct=True) + + print(f"✓ Created {len(windows)*2 + 2} volatility and ranking features") + return df + +def engineer_all_features(df): + """Main function to engineer all features""" + + print("=" * 60) + print("FEATURE ENGINEERING PIPELINE") + print("=" * 60) + + print(f"Starting with {df.shape[0]:,} records and {df.shape[1]} features") + + # Apply all feature engineering steps + df = create_lag_features(df) + df = create_moving_averages(df) + df = create_event_indicators(df) + df = create_calendar_features(df) + df = create_volatility_measures(df) + + print("=" * 60) + print(f"Feature engineering complete!") + print(f"Final shape: {df.shape[0]:,} records and {df.shape[1]} features") + print(f"Added {df.shape[1] - 18} new features") # Original had 18 columns + + # Show feature summary + new_features = [col for col in df.columns if col not in [ + 'date', 'product_code', 'category', 'essential_flag', 'item_name', + 'subcat', 'weights', 'unit_price', 'brand_name', 'grams', + 'price_per_100g', 'normal_price', 'promo_flag', 'discount_pct', + 'discounted_price', 'price_multiplier', 'events_active', 'month' + ]] + + print(f"\nNew features created:") + feature_categories = { + 'Lag Features': [f for f in new_features if 'lag' in f or 'change' in f], + 'Moving Averages': [f for f in new_features if 'ma_' in f or 'dev' in f], + 'Event Indicators': [f for f in new_features if 'has_' in f or 'event_count' in f], + 'Calendar Features': [f for f in new_features if any(x in f for x in ['year', 'quarter', 'week', 'day', 'season', 'holiday', 'sin', 'cos'])], + 'Volatility Measures': [f for f in new_features if 'volatility' in f or 'cv_' in f or 'rank' in f] + } + + for category, features in feature_categories.items(): + print(f" • {category}: {len(features)} features") + + # Check for missing values in new features + missing_summary = df[new_features].isnull().sum() + missing_features = missing_summary[missing_summary > 0] + + if len(missing_features) > 0: + print(f"\nMissing values in new features:") + for feature, count in missing_features.items(): + print(f" • {feature}: {count:,} ({count/len(df)*100:.1f}%)") + else: + print(f"\n✓ No missing values in engineered 
features") + + return df + +# %% +# FEATURE ENGINEERING EXECUTION CELL +# ================================== + +def run_feature_engineering(): + """Execute feature engineering on the enhanced dataset""" + + try: + # Load the enhanced dataset + print("Loading enhanced historical data...") + df = pd.read_csv('enhanced_historical_data.csv') + df['date'] = pd.to_datetime(df['date']) + + print(f"Loaded {len(df):,} records from enhanced_historical_data.csv") + + # Run feature engineering + df_engineered = engineer_all_features(df) + + # Save the engineered dataset + output_file = 'grocery_data_with_features.csv' + df_engineered.to_csv(output_file, index=False) + + print(f"\n✓ Feature-engineered dataset saved to: {output_file}") + + # Quick validation + print(f"\nFeature Engineering Validation:") + print(f" • Original features: 18") + print(f" • New features: {df_engineered.shape[1] - 18}") + print(f" • Total features: {df_engineered.shape[1]}") + print(f" • Data completeness: {(1 - df_engineered.isnull().sum().sum() / (len(df_engineered) * len(df_engineered.columns))) * 100:.1f}%") + + return df_engineered + + except FileNotFoundError: + print("❌ enhanced_historical_data.csv not found.") + print("Please run the data generation first using: df = run_complete_analysis()") + return None + except Exception as e: + print(f"❌ Error during feature engineering: {str(e)}") + import traceback + traceback.print_exc() + return None + +print("Feature engineering module loaded!") +print("Execute: df_with_features = run_feature_engineering()") + +df_with_features = run_feature_engineering() + +# %% +# ADVANCED MODEL ARCHITECTURE MODULE +# ================================== + +import numpy as np +import pandas as pd +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error +from sklearn.model_selection import TimeSeriesSplit +import warnings +warnings.filterwarnings('ignore') + +# Time series modeling imports +try: + from statsmodels.tsa.arima.model import ARIMA + from statsmodels.tsa.statespace.sarimax import SARIMAX + from statsmodels.tsa.vector_ar.var_model import VAR + from statsmodels.stats.diagnostic import acorr_ljungbox + from statsmodels.tsa.stattools import adfuller + HAS_STATSMODELS = True +except ImportError: + print("⚠ statsmodels not available. Install with: pip install statsmodels") + HAS_STATSMODELS = False + +# Deep learning imports +try: + import tensorflow as tf + from tensorflow.keras.models import Sequential + from tensorflow.keras.layers import LSTM, Dense, Dropout, Input + from tensorflow.keras.optimizers import Adam + from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau + HAS_TENSORFLOW = True +except ImportError: + print("⚠ TensorFlow not available. 
Install with: pip install tensorflow") + HAS_TENSORFLOW = False + +# %% +# DATA PREPARATION FOR MODELING +# ============================= + +class ModelDataPreparator: + """Prepare data for different model types""" + + def __init__(self, df): + self.df = df.copy() + self.scalers = {} + + def prepare_arimax_data(self, category, min_weeks=52): + """Prepare data for ARIMAX modeling""" + + # Filter category and products with sufficient history + cat_data = self.df[self.df['category'] == category].copy() + + # FIX 3: Filter Training Data - Only use complete records + cat_data = cat_data[cat_data['price_lag_52'].notna()] # Ensure 52-week history + + if len(cat_data) == 0: + print(f" No data available for {category} with 52-week history") + return None + + # Aggregate to category level (weekly averages) + weekly_data = cat_data.groupby('date').agg({ + 'discounted_price': 'mean', + 'promo_flag': 'mean', + 'has_holiday_event': 'max', + 'has_seasonal_event': 'max', + 'has_weather_event': 'max', + 'has_supply_chain_event': 'max', + 'school_holiday_flag': 'max', + 'month_sin': 'first', + 'month_cos': 'first', + 'event_count': 'mean' + }).reset_index().sort_values('date') + + return weekly_data + + def prepare_lstm_data(self, category, sequence_length=12, test_size=0.2): + """Prepare data for LSTM modeling with sequences""" + + cat_data = self.df[self.df['category'] == category].copy() + + # FIX 3: Filter Training Data - Only use complete records + cat_data = cat_data[cat_data['price_lag_52'].notna()] + + if len(cat_data) == 0: + return None, None, None, None + + # Select features for LSTM + feature_cols = [ + 'discounted_price', 'price_lag_1', 'price_lag_7', 'ma_7d', 'ma_30d', + 'promo_flag', 'has_holiday_event', 'has_seasonal_event', + 'month_sin', 'month_cos', 'volatility_4w' + ] + + # Aggregate to weekly category level + weekly_data = cat_data.groupby('date')[feature_cols].mean().reset_index() + weekly_data = weekly_data.sort_values('date') + + # Scale features + scaler = MinMaxScaler() + scaled_features = scaler.fit_transform(weekly_data[feature_cols]) + self.scalers[f'{category}_lstm'] = scaler + + # Create sequences + X, y = [], [] + for i in range(sequence_length, len(scaled_features)): + X.append(scaled_features[i-sequence_length:i]) # Past sequence_length weeks + y.append(scaled_features[i, 0]) # Current price (first column) + + X, y = np.array(X), np.array(y) + + # Train/test split (chronological) + split_idx = int(len(X) * (1 - test_size)) + X_train, X_test = X[:split_idx], X[split_idx:] + y_train, y_test = y[:split_idx], y[split_idx:] + + return X_train, X_test, y_train, y_test + + def prepare_var_data(self, related_categories, min_weeks=52): + """Prepare data for Vector Autoregression (related categories)""" + + # Filter data for related categories + related_data = self.df[self.df['category'].isin(related_categories)].copy() + related_data = related_data[related_data['price_lag_52'].notna()] + + # Create wide format (categories as columns) + pivot_data = related_data.groupby(['date', 'category'])['discounted_price'].mean().unstack() + pivot_data = pivot_data.dropna() # Remove rows with missing categories + + if len(pivot_data) < min_weeks: + print(f" Insufficient data for VAR model: {len(pivot_data)} weeks") + return None + + return pivot_data +# %% +# ARIMAX MODEL IMPLEMENTATION +# =========================== + +class ARIMAXModel: + """ARIMAX model with external regressors""" + + def __init__(self, order=(1,1,1), seasonal_order=(1,1,1,52)): + self.order = order + self.seasonal_order = 
seasonal_order + self.model = None + self.fitted_model = None + + def fit(self, data, target_col='discounted_price', exog_cols=None): + """Fit ARIMAX model""" + + if not HAS_STATSMODELS: + print(" statsmodels required for ARIMAX") + return False + + try: + y = data[target_col] + exog = data[exog_cols] if exog_cols else None + + # FIX 2: Address Stationarity - Apply differencing if needed + adf_result = adfuller(y.dropna()) + print(f"Stationarity test p-value: {adf_result[1]:.4f}") + + if adf_result[1] > 0.05: + print("⚠ Series not stationary, applying first differencing") + y_diff = y.diff().dropna() + + # Adjust exogenous variables to match differenced series + if exog is not None: + exog = exog.iloc[1:] # Remove first row to match differenced series + + # Update order to (1,0,1) since we manually differenced + model_order = (self.order[0], 0, self.order[2]) + y_model = y_diff + else: + print("✓ Series is stationary") + model_order = self.order + y_model = y + + # Fit SARIMAX model + self.model = SARIMAX( + y_model, + exog=exog, + order=model_order, + seasonal_order=self.seasonal_order, + enforce_stationarity=False, + enforce_invertibility=False + ) + + self.fitted_model = self.model.fit(disp=False, maxiter=200) + + print(f"✓ ARIMAX model fitted successfully") + return True + + except Exception as e: + print(f" Error fitting ARIMAX: {str(e)}") + return False + + def forecast(self, steps, exog=None): + """Generate forecasts""" + if self.fitted_model is None: + print(" Model not fitted") + return None + + try: + forecast = self.fitted_model.forecast(steps=steps, exog=exog) + conf_int = self.fitted_model.get_forecast(steps=steps, exog=exog).conf_int() + + return { + 'forecast': forecast, + 'conf_int': conf_int, + 'model_summary': self.fitted_model.summary() + } + except Exception as e: + print(f" Error forecasting: {str(e)}") + return None + + def get_diagnostics(self): + """Get model diagnostics""" + if self.fitted_model is None: + return None + + residuals = self.fitted_model.resid + + # Ljung-Box test for autocorrelation in residuals + lb_test = acorr_ljungbox(residuals, lags=10, return_df=True) + + return { + 'aic': self.fitted_model.aic, + 'bic': self.fitted_model.bic, + 'ljung_box': lb_test, + 'residuals': residuals + } + +# %% +# LSTM MODEL IMPLEMENTATION +# ======================== + +class LSTMModel: + """LSTM model for non-linear time series patterns""" + + def __init__(self, lstm_units=50, dropout=0.2, dense_units=25): + self.lstm_units = lstm_units + self.dropout = dropout + self.dense_units = dense_units + self.model = None + self.history = None + + def build_model(self, input_shape): + """Build LSTM architecture""" + + if not HAS_TENSORFLOW: + print(" TensorFlow required for LSTM") + return False + + self.model = Sequential([ + Input(shape=input_shape), + LSTM(self.lstm_units, return_sequences=True, dropout=self.dropout), + LSTM(self.lstm_units//2, dropout=self.dropout), + Dense(self.dense_units, activation='relu'), + Dropout(self.dropout), + Dense(1) + ]) + + self.model.compile( + optimizer=Adam(learning_rate=0.001), + loss='mse', + metrics=['mae'] + ) + + return True + + def fit(self, X_train, y_train, X_val=None, y_val=None, epochs=100, batch_size=32): + """Train LSTM model""" + + if self.model is None: + print(" Model not built") + return False + + # Callbacks + callbacks = [ + EarlyStopping(patience=15, restore_best_weights=True), + ReduceLROnPlateau(patience=8, factor=0.5, min_lr=1e-7) + ] + + # Validation data + validation_data = (X_val, y_val) if X_val is not None else 
None + + try: + self.history = self.model.fit( + X_train, y_train, + validation_data=validation_data, + epochs=epochs, + batch_size=batch_size, + callbacks=callbacks, + verbose=0 + ) + + print(f"✓ LSTM model trained for {len(self.history.history['loss'])} epochs") + return True + + except Exception as e: + print(f" Error training LSTM: {str(e)}") + return False + + def predict(self, X): + """Generate predictions""" + if self.model is None: + print(" Model not trained") + return None + + return self.model.predict(X, verbose=0) + + def get_training_history(self): + """Get training history""" + if self.history is None: + return None + + return { + 'loss': self.history.history['loss'], + 'val_loss': self.history.history.get('val_loss', []), + 'mae': self.history.history['mae'], + 'val_mae': self.history.history.get('val_mae', []) + } + +# %% +# ENSEMBLE MODEL IMPLEMENTATION +# ============================= + +class EnsembleModel: + """Ensemble combining ARIMA and LSTM predictions""" + + def __init__(self, arimax_weight=0.6, lstm_weight=0.4): + self.arimax_weight = arimax_weight + self.lstm_weight = lstm_weight + self.arimax_model = None + self.lstm_model = None + + def add_models(self, arimax_model, lstm_model): + """Add component models""" + self.arimax_model = arimax_model + self.lstm_model = lstm_model + + def predict(self, arimax_data, lstm_data, steps=1): + """Generate ensemble predictions""" + + if self.arimax_model is None or self.lstm_model is None: + print(" Component models not provided") + return None + + try: + # ARIMAX predictions + arimax_forecast = self.arimax_model.forecast(steps=steps, exog=arimax_data) + arimax_pred = arimax_forecast['forecast'] if isinstance(arimax_forecast, dict) else arimax_forecast + + # LSTM predictions + lstm_pred = self.lstm_model.predict(lstm_data) + + # Combine predictions + ensemble_pred = (self.arimax_weight * arimax_pred + + self.lstm_weight * lstm_pred.flatten()) + + return { + 'ensemble': ensemble_pred, + 'arimax': arimax_pred, + 'lstm': lstm_pred.flatten(), + 'weights': {'arimax': self.arimax_weight, 'lstm': self.lstm_weight} + } + + except Exception as e: + print(f" Error in ensemble prediction: {str(e)}") + return None + +# %% +# VAR MODEL IMPLEMENTATION +# ======================= + +class VARModel: + """Vector Autoregression for related product categories""" + + def __init__(self, maxlags=12): + self.maxlags = maxlags + self.model = None + self.fitted_model = None + + def fit(self, data): + """Fit VAR model""" + + if not HAS_STATSMODELS: + print(" statsmodels required for VAR") + return False + + try: + # Select optimal lag order + var_model = VAR(data) + lag_order = var_model.select_order(maxlags=self.maxlags) + optimal_lags = lag_order.aic # Use AIC for selection + + print(f"Optimal lag order: {optimal_lags}") + + # Fit VAR model + self.fitted_model = var_model.fit(optimal_lags) + + print(f"✓ VAR model fitted with {optimal_lags} lags") + return True + + except Exception as e: + print(f" Error fitting VAR: {str(e)}") + return False + + def forecast(self, steps): + """Generate VAR forecasts""" + if self.fitted_model is None: + print(" Model not fitted") + return None + + try: + forecast = self.fitted_model.forecast( + self.fitted_model.y, + steps=steps + ) + + return { + 'forecast': forecast, + 'model_summary': self.fitted_model.summary() + } + + except Exception as e: + print(f" Error forecasting: {str(e)}") + return None + +# %% +# MODEL EVALUATION FRAMEWORK +# ========================= + +class ModelEvaluator: + """Comprehensive model 
evaluation""" + + @staticmethod + def calculate_metrics(y_true, y_pred): + """Calculate evaluation metrics""" + + # Remove any NaN values + mask = ~(np.isnan(y_true) | np.isnan(y_pred)) + y_true_clean = y_true[mask] + y_pred_clean = y_pred[mask] + + if len(y_true_clean) == 0: + return None + + return { + 'MAE': mean_absolute_error(y_true_clean, y_pred_clean), + 'RMSE': np.sqrt(mean_squared_error(y_true_clean, y_pred_clean)), + 'MAPE': mean_absolute_percentage_error(y_true_clean, y_pred_clean) * 100, + 'R²': 1 - (np.sum((y_true_clean - y_pred_clean)**2) / + np.sum((y_true_clean - np.mean(y_true_clean))**2)) + } + + @staticmethod + def compare_models(results_dict): + """Compare multiple models""" + + comparison = pd.DataFrame({ + model: metrics for model, metrics in results_dict.items() + }).T + + # Rank models (lower is better for MAE, RMSE, MAPE; higher for R²) + comparison['MAE_rank'] = comparison['MAE'].rank() + comparison['RMSE_rank'] = comparison['RMSE'].rank() + comparison['MAPE_rank'] = comparison['MAPE'].rank() + comparison['R²_rank'] = comparison['R²'].rank(ascending=False) + + comparison['avg_rank'] = (comparison['MAE_rank'] + comparison['RMSE_rank'] + + comparison['MAPE_rank'] + comparison['R²_rank']) / 4 + + return comparison.sort_values('avg_rank') + +print("Advanced modeling architecture loaded!") +print("Dependencies available:") +print(f" • statsmodels (ARIMAX/VAR): {HAS_STATSMODELS}") +print(f" • tensorflow (LSTM): {HAS_TENSORFLOW}") +print("\nNext: Load your feature-engineered data and run model pipeline") + +# %% +# COMPLETE MODEL PIPELINE EXECUTION +# ================================= + +def run_advanced_model_pipeline(target_categories=None, forecast_horizon=12): + """ + Execute complete advanced modeling pipeline + + Parameters: + - target_categories: List of categories to model (default: top 3 stable categories) + - forecast_horizon: Number of weeks to forecast ahead + """ + + print("=" * 80) + print("ADVANCED MODEL PIPELINE EXECUTION") + print("=" * 80) + + try: + # Load feature-engineered data + print("Loading feature-engineered dataset...") + df = pd.read_csv('grocery_data_with_features.csv') + df['date'] = pd.to_datetime(df['date']) + + print(f"Loaded {len(df):,} records with {df.shape[1]} features") + + # Select target categories (stable ones for initial modeling) + if target_categories is None: + # Choose stable categories with good data coverage + category_stability = df.groupby('category').agg({ + 'discounted_price': ['count', 'std'], + 'price_multiplier': 'mean' + }).round(3) + + category_stability.columns = ['record_count', 'price_std', 'avg_multiplier'] + category_stability['stability_score'] = ( + category_stability['record_count'] / category_stability['price_std'] + ) + + # Select top 3 most stable categories + target_categories = category_stability.nlargest(3, 'stability_score').index.tolist() + + print(f"Target categories for modeling: {target_categories}") + + # Initialize data preparator + preparator = ModelDataPreparator(df) + + # Store all results + all_results = {} + + # Process each category + for category in target_categories: + print(f"\n" + "="*60) + print(f"MODELING CATEGORY: {category}") + print("="*60) + + category_results = model_single_category( + preparator, category, forecast_horizon + ) + + if category_results: + all_results[category] = category_results + print(f"✓ {category} modeling completed successfully") + else: + print(f" {category} modeling failed") + + # Cross-category VAR modeling + print(f"\n" + "="*60) + print("VECTOR 
AUTOREGRESSION (VAR) MODELING") + print("="*60) + + var_results = run_var_modeling(preparator, target_categories) + if var_results: + all_results['VAR_cross_category'] = var_results + + # Generate comprehensive report + print(f"\n" + "="*60) + print("MODEL PERFORMANCE SUMMARY") + print("="*60) + + generate_model_report(all_results) + + print(f"\n" + "="*80) + print("PIPELINE EXECUTION COMPLETE") + print("="*80) + + return all_results + + except Exception as e: + print(f" Pipeline execution failed: {str(e)}") + import traceback + traceback.print_exc() + return None + +def model_single_category(preparator, category, forecast_horizon): + """Model a single category with all approaches""" + + results = {} + + # 1. ARIMAX Modeling + print(f"\n1. ARIMAX MODELING FOR {category}") + print("-" * 40) + + arimax_data = preparator.prepare_arimax_data(category) + if arimax_data is not None and len(arimax_data) >= 52: + + # Define external regressors + exog_cols = [ + 'promo_flag', 'has_holiday_event', 'has_seasonal_event', + 'school_holiday_flag', 'month_sin', 'month_cos' + ] + + # Train/test split (80/20) + split_idx = int(len(arimax_data) * 0.8) + train_data = arimax_data.iloc[:split_idx] + test_data = arimax_data.iloc[split_idx:] + + # Fit ARIMAX model + arimax_model = ARIMAXModel(order=(1,1,1), seasonal_order=(1,1,1,52)) + + if arimax_model.fit(train_data, 'discounted_price', exog_cols): + + # Generate forecasts + test_exog = test_data[exog_cols] if len(test_data) > 0 else None + forecast_result = arimax_model.forecast(len(test_data), test_exog) + + if forecast_result: + # Evaluate performance + if len(test_data) > 0: + y_true = test_data['discounted_price'].values + y_pred = forecast_result['forecast'].values + + arimax_metrics = ModelEvaluator.calculate_metrics(y_true, y_pred) + + results['ARIMAX'] = { + 'model': arimax_model, + 'metrics': arimax_metrics, + 'forecast': forecast_result, + 'diagnostics': arimax_model.get_diagnostics() + } + + print(f"ARIMAX Metrics: MAE={arimax_metrics['MAE']:.3f}, " + f"RMSE={arimax_metrics['RMSE']:.3f}, " + f"MAPE={arimax_metrics['MAPE']:.1f}%") + else: + print(f" Insufficient data for ARIMAX modeling") + + # 2. LSTM Modeling + print(f"\n2. 
LSTM MODELING FOR {category}") + print("-" * 40) + + X_train, X_test, y_train, y_test = preparator.prepare_lstm_data(category) + + if X_train is not None and len(X_train) > 20: + + # Build and train LSTM + lstm_model = LSTMModel(lstm_units=50, dropout=0.2) + + if lstm_model.build_model(X_train.shape[1:]): + + # Split training data for validation + val_split = int(len(X_train) * 0.8) + X_train_split = X_train[:val_split] + X_val_split = X_train[val_split:] + y_train_split = y_train[:val_split] + y_val_split = y_train[val_split:] + + if lstm_model.fit(X_train_split, y_train_split, X_val_split, y_val_split, epochs=50): + + # Generate predictions + if len(X_test) > 0: + y_pred_scaled = lstm_model.predict(X_test) + + # FIX 1: Fix Data Scaling - Inverse transform predictions + scaler = preparator.scalers[f'{category}_lstm'] + + # Create dummy array for inverse transform (scaler expects all features) + dummy_array = np.zeros((len(y_pred_scaled), scaler.n_features_in_)) + dummy_array[:, 0] = y_pred_scaled.flatten() # Price is first feature + + # Inverse transform to get actual price scale + y_pred_actual = scaler.inverse_transform(dummy_array)[:, 0] + + # Also inverse transform actual values for fair comparison + dummy_array_actual = np.zeros((len(y_test), scaler.n_features_in_)) + dummy_array_actual[:, 0] = y_test + y_test_actual = scaler.inverse_transform(dummy_array_actual)[:, 0] + + lstm_metrics = ModelEvaluator.calculate_metrics(y_test_actual, y_pred_actual) + + results['LSTM'] = { + 'model': lstm_model, + 'metrics': lstm_metrics, + 'training_history': lstm_model.get_training_history(), + 'scaler': scaler # Store scaler for future use + } + + print(f"LSTM Metrics: MAE={lstm_metrics['MAE']:.3f}, " + f"RMSE={lstm_metrics['RMSE']:.3f}, " + f"MAPE={lstm_metrics['MAPE']:.1f}%") + else: + print(f" Insufficient data for LSTM modeling") + + # 3. Ensemble Modeling + if 'ARIMAX' in results and 'LSTM' in results: + print(f"\n3. 
ENSEMBLE MODELING FOR {category}") + print("-" * 40) + + # Create ensemble + ensemble = EnsembleModel(arimax_weight=0.6, lstm_weight=0.4) + ensemble.add_models(results['ARIMAX']['model'], results['LSTM']['model']) + + # Note: Ensemble prediction would require aligned data + # This is a simplified implementation + arimax_pred = results['ARIMAX']['forecast']['forecast'].values + lstm_pred = results['LSTM']['metrics'] # Placeholder + + print("✓ Ensemble model created (weights: 60% ARIMAX, 40% LSTM)") + results['Ensemble'] = {'model': ensemble, 'weights': {'ARIMAX': 0.6, 'LSTM': 0.4}} + + return results + +def run_var_modeling(preparator, categories): + """Run Vector Autoregression on related categories""" + + # Group related categories + related_groups = [ + ['Meat & Seafood', 'Frozen'], # Protein products + ['Dairy, Eggs & Fridge', 'Bakery'], # Fresh products + ['Pantry', 'Household'] # Shelf-stable products + ] + + var_results = {} + + for group in related_groups: + available_categories = [cat for cat in group if cat in categories] + + if len(available_categories) >= 2: + print(f"VAR modeling for: {available_categories}") + + var_data = preparator.prepare_var_data(available_categories) + + if var_data is not None and len(var_data) >= 52: + + # Train/test split + split_idx = int(len(var_data) * 0.8) + train_data = var_data.iloc[:split_idx] + test_data = var_data.iloc[split_idx:] + + # Fit VAR model + var_model = VARModel(maxlags=8) + + if var_model.fit(train_data): + + # Generate forecasts + forecast_result = var_model.forecast(len(test_data)) + + if forecast_result: + var_results[f"VAR_{'_'.join(available_categories)}"] = { + 'model': var_model, + 'forecast': forecast_result, + 'categories': available_categories + } + + print(f"✓ VAR model fitted for {available_categories}") + + return var_results if var_results else None + +def generate_model_report(all_results): + """Generate comprehensive model performance report""" + + if not all_results: + print("No results to report") + return + + print("MODEL PERFORMANCE COMPARISON:") + print("=" * 50) + + # Collect metrics for comparison + metrics_comparison = {} + + for category, models in all_results.items(): + if category.startswith('VAR'): + continue # Skip VAR for metrics comparison + + print(f"\nCategory: {category}") + print("-" * 30) + + for model_name, model_data in models.items(): + if 'metrics' in model_data and model_data['metrics']: + metrics = model_data['metrics'] + metrics_comparison[f"{category}_{model_name}"] = metrics + + print(f"{model_name:>8}: MAE={metrics['MAE']:.3f}, " + f"RMSE={metrics['RMSE']:.3f}, " + f"MAPE={metrics['MAPE']:.1f}%, " + f"R²={metrics['R²']:.3f}") + + # Overall comparison + if metrics_comparison: + print(f"\nOVERALL BEST PERFORMING MODELS:") + print("-" * 40) + + comparison_df = ModelEvaluator.compare_models(metrics_comparison) + + print("Top 5 models by average rank:") + for idx, (model, row) in enumerate(comparison_df.head().iterrows()): + print(f"{idx+1}. 
{model}: avg_rank={row['avg_rank']:.2f}") + + # Save results summary + results_summary = { + 'categories_modeled': len([k for k in all_results.keys() if not k.startswith('VAR')]), + 'models_trained': sum(len(v) for k, v in all_results.items() if not k.startswith('VAR')), + 'metrics_comparison': metrics_comparison + } + + print(f"\nSUMMARY:") + print(f" • Categories modeled: {results_summary['categories_modeled']}") + print(f" • Total models trained: {results_summary['models_trained']}") + print(f" • VAR models: {len([k for k in all_results.keys() if k.startswith('VAR')])}") + +# %% +# EXECUTION CELL - RUN COMPLETE PIPELINE +# ====================================== + +print("Advanced modeling pipeline ready!") +print("\nTo execute the complete pipeline:") +print("results = run_advanced_model_pipeline()") +print("\nOr specify custom categories:") +print("results = run_advanced_model_pipeline(['Pantry', 'Bakery', 'Dairy, Eggs & Fridge'])") +print("\nThis will run:") +print(" • ARIMAX models with external regressors") +print(" • LSTM networks for non-linear patterns") +print(" • Ensemble combinations") +print(" • VAR models for cross-category relationships") +print(" • Comprehensive evaluation and comparison") + +results = run_advanced_model_pipeline() + +# Notebook-Based Forecasting Pipeline with Direct Output +# ===================================================== + +import numpy as np +import pandas as pd +import warnings +from datetime import datetime, timedelta +from sklearn.preprocessing import MinMaxScaler +from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error +import matplotlib.pyplot as plt +import seaborn as sns +from typing import Dict, List, Tuple, Optional +warnings.filterwarnings('ignore') + +# Deep learning imports +try: + import tensorflow as tf + from tensorflow.keras.models import Sequential + from tensorflow.keras.layers import LSTM, Dense, Dropout, Input + from tensorflow.keras.optimizers import Adam + from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau + HAS_TENSORFLOW = True +except ImportError: + print("⚠ TensorFlow not available. Install with: pip install tensorflow") + HAS_TENSORFLOW = False + +# Set plotting style +plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'default') +sns.set_palette("husl") + +# ======================================================== +# 1. 
WALK-FORWARD VALIDATION FRAMEWORK (NOTEBOOK VERSION) +# ======================================================== + +class NotebookWalkForwardValidator: + """Walk-forward validator with direct notebook output""" + + def __init__(self, min_train_size=52, test_size=4, step_size=4, sequence_length=12): + self.min_train_size = min_train_size + self.test_size = test_size + self.step_size = step_size + self.sequence_length = sequence_length + + def create_validation_splits(self, data, date_col='date'): + """Create walk-forward validation time splits""" + data = data.sort_values(date_col).reset_index(drop=True) + splits = [] + + total_weeks = len(data) + current_train_end = self.min_train_size + + while current_train_end + self.test_size <= total_weeks: + train_indices = list(range(0, current_train_end)) + test_indices = list(range(current_train_end, current_train_end + self.test_size)) + + splits.append({ + 'train_idx': train_indices, + 'test_idx': test_indices, + 'train_end_date': data.iloc[current_train_end - 1][date_col], + 'test_start_date': data.iloc[current_train_end][date_col], + 'test_end_date': data.iloc[current_train_end + self.test_size - 1][date_col] + }) + + current_train_end += self.step_size + + return splits + + def validate_lstm_model(self, data, category): + """Run walk-forward validation for LSTM models with notebook output""" + + print(f"WALK-FORWARD VALIDATION: {category}") + print("=" * 60) + + # Filter and prepare data + cat_data = data[data['category'] == category].copy() + cat_data = cat_data[cat_data['price_lag_52'].notna()] + + if len(cat_data) < self.min_train_size + self.test_size: + print(f"Insufficient data for {category}: {len(cat_data)} records") + print(f"Need at least {self.min_train_size + self.test_size} records for validation") + return None + + # Feature selection + feature_cols = [ + 'discounted_price', 'price_lag_1', 'price_lag_7', 'ma_7d', 'ma_30d', + 'promo_flag', 'has_holiday_event', 'has_seasonal_event', + 'month_sin', 'month_cos', 'volatility_4w' + ] + + # Aggregate to weekly level + weekly_data = cat_data.groupby('date')[feature_cols].mean().reset_index() + weekly_data = weekly_data.sort_values('date') + + print(f"Data Summary:") + print(f" • Total weeks: {len(weekly_data)}") + print(f" • Date range: {weekly_data['date'].min().strftime('%Y-%m-%d')} to {weekly_data['date'].max().strftime('%Y-%m-%d')}") + + # Create validation splits + splits = self.create_validation_splits(weekly_data) + + if len(splits) == 0: + print(f"Cannot create validation splits for {category}") + return None + + print(f" • Validation splits: {len(splits)}") + + # Run validation + validation_results = [] + all_predictions = [] + all_actuals = [] + all_dates = [] + + successful_splits = 0 + + for i, split in enumerate(splits): + print(f"\nSplit {i+1}/{len(splits)}: {split['test_start_date'].strftime('%Y-%m-%d')} to {split['test_end_date'].strftime('%Y-%m-%d')}") + + # Prepare data + train_data = weekly_data.iloc[split['train_idx']] + test_data = weekly_data.iloc[split['test_idx']] + + # Check minimum training data requirement + if len(train_data) < self.sequence_length + 10: + print(f" Insufficient training data: {len(train_data)} weeks") + continue + + # Scale data + scaler = MinMaxScaler() + train_scaled = scaler.fit_transform(train_data[feature_cols]) + test_scaled = scaler.transform(test_data[feature_cols]) + + # Create sequences + X_train, y_train = self._create_sequences(train_scaled, self.sequence_length) + X_test, y_test = self._create_sequences(test_scaled, 
self.sequence_length) + + if len(X_train) < 10: + print(f" Insufficient training sequences: {len(X_train)}") + continue + + if len(X_test) == 0: + print(f" No test sequences available") + continue + + # Build and train LSTM + model = self._build_lstm_model(X_train.shape[1:]) + + # Train model + try: + history = model.fit( + X_train, y_train, + epochs=30, + batch_size=min(16, len(X_train)//2), + validation_split=0.2 if len(X_train) > 5 else 0, + callbacks=[ + EarlyStopping(patience=5, restore_best_weights=True), + ReduceLROnPlateau(patience=3, factor=0.5) + ], + verbose=0 + ) + except Exception as e: + print(f" Training failed: {str(e)}") + continue + + # Generate predictions + try: + y_pred_scaled = model.predict(X_test, verbose=0) + + # Inverse transform + dummy_array = np.zeros((len(y_pred_scaled), len(feature_cols))) + dummy_array[:, 0] = y_pred_scaled.flatten() + y_pred_actual = scaler.inverse_transform(dummy_array)[:, 0] + + dummy_array_actual = np.zeros((len(y_test), len(feature_cols))) + dummy_array_actual[:, 0] = y_test + y_test_actual = scaler.inverse_transform(dummy_array_actual)[:, 0] + + # Calculate metrics + metrics = self._calculate_metrics(y_test_actual, y_pred_actual) + market_conditions = self._identify_market_conditions(test_data) + + # Store results + validation_results.append({ + 'split_id': i, + 'test_period': f"{split['test_start_date'].strftime('%Y-%m-%d')} to {split['test_end_date'].strftime('%Y-%m-%d')}", + 'metrics': metrics, + 'market_conditions': market_conditions + }) + + # Store for plotting + test_dates = test_data['date'].iloc[self.sequence_length:].values + if len(test_dates) == len(y_pred_actual): + all_predictions.extend(y_pred_actual) + all_actuals.extend(y_test_actual) + all_dates.extend(test_dates) + + successful_splits += 1 + print(f" MAE: {metrics['MAE']:.3f} | RMSE: {metrics['RMSE']:.3f} | MAPE: {metrics['MAPE']:.1f}%") + print(f" Conditions: {', '.join(market_conditions)}") + + except Exception as e: + print(f" Prediction failed: {str(e)}") + continue + + if successful_splits == 0: + print(f"\nNo successful validation splits for {category}") + return None + + print(f"\nSuccessful validation splits: {successful_splits}/{len(splits)}") + + # Display summary results + self._display_validation_summary(validation_results, category) + + # Create validation plots if we have data + if len(all_predictions) > 0 and len(all_actuals) > 0: + self._create_validation_plots(all_dates, all_actuals, all_predictions, category) + else: + print(f" Insufficient data for validation plots") + + return validation_results + + def _create_sequences(self, data, sequence_length): + """Create sequences for LSTM training""" + X, y = [], [] + for i in range(sequence_length, len(data)): + X.append(data[i-sequence_length:i]) + y.append(data[i, 0]) + return np.array(X), np.array(y) + + def _build_lstm_model(self, input_shape): + """Build LSTM model""" + model = Sequential([ + Input(shape=input_shape), + LSTM(50, return_sequences=True, dropout=0.2), + LSTM(25, dropout=0.2), + Dense(16, activation='relu'), + Dropout(0.2), + Dense(1) + ]) + + model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae']) + return model + + def _calculate_metrics(self, y_true, y_pred): + """Calculate standard forecasting metrics""" + return { + 'MAE': mean_absolute_error(y_true, y_pred), + 'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)), + 'MAPE': mean_absolute_percentage_error(y_true, y_pred) * 100, + 'R²': 1 - (np.sum((y_true - y_pred)**2) / np.sum((y_true - np.mean(y_true))**2)) 
+ } + + def _identify_market_conditions(self, test_data): + """Identify market conditions during test period""" + conditions = [] + + if 'promo_flag' in test_data.columns: + promo_rate = test_data['promo_flag'].mean() + if promo_rate > 0.3: + conditions.append('high_promo') + elif promo_rate > 0.1: + conditions.append('medium_promo') + else: + conditions.append('low_promo') + + if 'has_holiday_event' in test_data.columns and test_data['has_holiday_event'].any(): + conditions.append('holiday') + + if 'season' in test_data.columns: + season = test_data['season'].mode()[0] if len(test_data['season'].mode()) > 0 else 'unknown' + conditions.append(season) + + return conditions + + def _display_validation_summary(self, results, category): + """Display validation summary statistics""" + if not results: + return + + print(f"\n VALIDATION SUMMARY: {category}") + print("=" * 50) + + mae_scores = [r['metrics']['MAE'] for r in results] + mape_scores = [r['metrics']['MAPE'] for r in results] + rmse_scores = [r['metrics']['RMSE'] for r in results] + + print(f" Performance Metrics:") + print(f" • Number of validation periods: {len(results)}") + print(f" • Average MAE: {np.mean(mae_scores):.3f} ± {np.std(mae_scores):.3f}") + print(f" • Average MAPE: {np.mean(mape_scores):.1f}% ± {np.std(mape_scores):.1f}%") + print(f" • Average RMSE: {np.mean(rmse_scores):.3f} ± {np.std(rmse_scores):.3f}") + print(f" • Best MAE: {min(mae_scores):.3f}") + print(f" • Worst MAE: {max(mae_scores):.3f}") + + # Performance by market conditions + condition_performance = {} + for result in results: + for condition in result['market_conditions']: + if condition not in condition_performance: + condition_performance[condition] = [] + condition_performance[condition].append(result['metrics']['MAE']) + + print(f"\n Performance by Market Conditions:") + for condition, maes in condition_performance.items(): + print(f" • {condition}: {np.mean(maes):.3f} MAE ({len(maes)} periods)") + + def _create_validation_plots(self, dates, actuals, predictions, category): + """Create validation visualization plots""" + + # Check if we have enough data for plotting + if len(dates) == 0 or len(actuals) == 0 or len(predictions) == 0: + print(f" ⚠ Insufficient data for plotting {category} validation results") + return + + # Ensure all arrays have the same length + min_length = min(len(dates), len(actuals), len(predictions)) + dates = dates[:min_length] + actuals = actuals[:min_length] + predictions = predictions[:min_length] + + # Convert to numpy arrays for safety + actuals = np.array(actuals) + predictions = np.array(predictions) + + fig, axes = plt.subplots(2, 2, figsize=(15, 10)) + fig.suptitle(f'Walk-Forward Validation Results: {category}', fontsize=16, fontweight='bold') + + # Convert dates to pandas datetime if needed + if len(dates) > 0: + dates = pd.to_datetime(dates) + + # Plot 1: Actual vs Predicted over time + axes[0, 0].plot(dates, actuals, label='Actual', alpha=0.7, linewidth=2) + axes[0, 0].plot(dates, predictions, label='Predicted', alpha=0.7, linewidth=2) + axes[0, 0].set_title('Actual vs Predicted Prices Over Time') + axes[0, 0].set_xlabel('Date') + axes[0, 0].set_ylabel('Price ($)') + axes[0, 0].legend() + axes[0, 0].grid(True, alpha=0.3) + + # Plot 2: Prediction errors + errors = predictions - actuals + axes[0, 1].plot(dates, errors, color='red', alpha=0.7) + axes[0, 1].axhline(y=0, color='black', linestyle='--', alpha=0.5) + axes[0, 1].set_title('Prediction Errors Over Time') + axes[0, 1].set_xlabel('Date') + axes[0, 
1].set_ylabel('Error ($)') + axes[0, 1].grid(True, alpha=0.3) + + # Plot 3: Scatter plot + axes[1, 0].scatter(actuals, predictions, alpha=0.6) + + # Safe min/max calculation + if len(actuals) > 0 and len(predictions) > 0: + min_val = min(np.min(actuals), np.min(predictions)) + max_val = max(np.max(actuals), np.max(predictions)) + axes[1, 0].plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8) + + axes[1, 0].set_title('Actual vs Predicted (Scatter)') + axes[1, 0].set_xlabel('Actual Price ($)') + axes[1, 0].set_ylabel('Predicted Price ($)') + axes[1, 0].grid(True, alpha=0.3) + + # Plot 4: Error distribution + if len(errors) > 0: + axes[1, 1].hist(errors, bins=min(20, len(errors)), alpha=0.7, edgecolor='black') + axes[1, 1].axvline(x=np.mean(errors), color='red', linestyle='--', + label=f'Mean: {np.mean(errors):.3f}') + axes[1, 1].legend() + + axes[1, 1].set_title('Distribution of Prediction Errors') + axes[1, 1].set_xlabel('Error ($)') + axes[1, 1].set_ylabel('Frequency') + axes[1, 1].grid(True, alpha=0.3) + + plt.tight_layout() + plt.show() + +# ======================================================== +# 2. PRODUCTION FORECASTING PIPELINE (NOTEBOOK VERSION) +# ======================================================== + +class NotebookForecastingPipeline: + """Production forecasting pipeline with notebook output""" + + def __init__(self, forecast_horizon=12): + self.forecast_horizon = forecast_horizon + self.models = {} + self.scalers = {} + + def train_and_forecast(self, data, category): + """Train model and generate forecasts with notebook output""" + + print(f" PRODUCTION FORECASTING: {category}") + print("=" * 60) + + # Prepare data + cat_data = data[data['category'] == category].copy() + cat_data = cat_data[cat_data['price_lag_52'].notna()] + + if len(cat_data) < 104: + print(f" Insufficient data for {category}: {len(cat_data)} records") + return None + + # Feature selection + feature_cols = [ + 'discounted_price', 'price_lag_1', 'price_lag_7', 'ma_7d', 'ma_30d', + 'promo_flag', 'has_holiday_event', 'has_seasonal_event', + 'month_sin', 'month_cos', 'volatility_4w' + ] + + # Aggregate to weekly level + weekly_data = cat_data.groupby('date')[feature_cols].mean().reset_index() + weekly_data = weekly_data.sort_values('date') + + print(f" Training Data Summary:") + print(f" • Total weeks: {len(weekly_data)}") + print(f" • Date range: {weekly_data['date'].min().strftime('%Y-%m-%d')} to {weekly_data['date'].max().strftime('%Y-%m-%d')}") + print(f" • Average price: ${weekly_data['discounted_price'].mean():.2f}") + + # Scale data + scaler = MinMaxScaler() + scaled_data = scaler.fit_transform(weekly_data[feature_cols]) + + # Create sequences + sequence_length = 12 + X, y = [], [] + for i in range(sequence_length, len(scaled_data)): + X.append(scaled_data[i-sequence_length:i]) + y.append(scaled_data[i, 0]) + + X, y = np.array(X), np.array(y) + + # Train/validation split + split_idx = int(len(X) * 0.8) + X_train, X_val = X[:split_idx], X[split_idx:] + y_train, y_val = y[:split_idx], y[split_idx:] + + print(f" • Training sequences: {len(X_train)}") + print(f" • Validation sequences: {len(X_val)}") + + # Build model + model = Sequential([ + Input(shape=(sequence_length, len(feature_cols))), + LSTM(64, return_sequences=True, dropout=0.2), + LSTM(32, dropout=0.2), + Dense(16, activation='relu'), + Dropout(0.2), + Dense(1) + ]) + + model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae']) + + print(f"\n🔧 Training Model...") + + # Train model + history = model.fit( + 
X_train, y_train, + validation_data=(X_val, y_val), + epochs=50, + batch_size=16, + callbacks=[ + EarlyStopping(patience=10, restore_best_weights=True), + ReduceLROnPlateau(patience=5, factor=0.5) + ], + verbose=0 + ) + + # Evaluate model + val_loss = min(history.history['val_loss']) + val_mae = min(history.history['val_mae']) + + print(f" Training Complete:") + print(f" • Epochs trained: {len(history.history['loss'])}") + print(f" • Final validation loss: {val_loss:.6f}") + print(f" • Final validation MAE: {val_mae:.6f}") + + # Generate forecasts + print(f"\n Generating {self.forecast_horizon}-Week Forecast...") + + forecasts = [] + confidence_intervals = [] + current_sequence = scaled_data[-sequence_length:] + + for week in range(self.forecast_horizon): + # Predict next week + X_pred = current_sequence.reshape(1, sequence_length, len(feature_cols)) + y_pred_scaled = model.predict(X_pred, verbose=0)[0, 0] + + # Inverse transform prediction + dummy_array = np.zeros((1, len(feature_cols))) + dummy_array[0, 0] = y_pred_scaled + y_pred_actual = scaler.inverse_transform(dummy_array)[0, 0] + + forecasts.append(y_pred_actual) + + # Generate confidence interval (Monte Carlo dropout) + mc_predictions = [] + for _ in range(100): + mc_pred = model.predict(X_pred, verbose=0)[0, 0] + dummy_mc = np.zeros((1, len(feature_cols))) + dummy_mc[0, 0] = mc_pred + mc_actual = scaler.inverse_transform(dummy_mc)[0, 0] + mc_predictions.append(mc_actual) + + ci_lower = np.percentile(mc_predictions, 5) + ci_upper = np.percentile(mc_predictions, 95) + confidence_intervals.append((ci_lower, ci_upper)) + + # Update sequence for next prediction + next_week_features = current_sequence[-1].copy() + next_week_features[0] = y_pred_scaled + current_sequence = np.vstack([current_sequence[1:], next_week_features]) + + # Create forecast dates + last_date = weekly_data['date'].max() + forecast_dates = pd.date_range(start=last_date + timedelta(weeks=1), + periods=self.forecast_horizon, freq='W') + + # Display forecast results + self._display_forecast_results(weekly_data, forecasts, forecast_dates, confidence_intervals, category) + + # Create forecast visualization + self._create_forecast_plots(weekly_data, forecasts, forecast_dates, confidence_intervals, category) + + # Store models for reuse + self.models[category] = model + self.scalers[category] = scaler + + return { + 'category': category, + 'forecasts': forecasts, + 'forecast_dates': forecast_dates.tolist(), + 'confidence_intervals': confidence_intervals, + 'last_actual_price': weekly_data['discounted_price'].iloc[-1], + 'training_metrics': {'val_loss': val_loss, 'val_mae': val_mae} + } + + def _display_forecast_results(self, weekly_data, forecasts, forecast_dates, confidence_intervals, category): + """Display forecast results in notebook""" + + print(f"\n FORECAST RESULTS: {category}") + print("=" * 50) + + last_price = weekly_data['discounted_price'].iloc[-1] + + print(f" Price Forecasts:") + print(f" • Current price: ${last_price:.2f}") + print(f" • 4-week forecast: ${forecasts[3]:.2f}") + print(f" • 8-week forecast: ${forecasts[7]:.2f}") + print(f" • 12-week forecast: ${forecasts[11]:.2f}") + + print(f"\n Confidence Intervals (90%):") + print(f" • 4-week: ${confidence_intervals[3][0]:.2f} - ${confidence_intervals[3][1]:.2f}") + print(f" • 8-week: ${confidence_intervals[7][0]:.2f} - ${confidence_intervals[7][1]:.2f}") + print(f" • 12-week: ${confidence_intervals[11][0]:.2f} - ${confidence_intervals[11][1]:.2f}") + + # Price change analysis + change_4w = ((forecasts[3] - 
last_price) / last_price) * 100 + change_12w = ((forecasts[11] - last_price) / last_price) * 100 + + print(f"\n Expected Price Changes:") + print(f" • 4-week change: {change_4w:+.1f}%") + print(f" • 12-week change: {change_12w:+.1f}%") + + # Trend analysis + if change_12w > 5: + trend = " Rising trend" + elif change_12w < -5: + trend = " Declining trend" + else: + trend = " Stable trend" + + print(f" • Overall trend: {trend}") + + def _create_forecast_plots(self, weekly_data, forecasts, forecast_dates, confidence_intervals, category): + """Create forecast visualization plots""" + + fig, axes = plt.subplots(2, 2, figsize=(16, 12)) + fig.suptitle(f'Production Forecasting Results: {category}', fontsize=16, fontweight='bold') + + # Prepare data for plotting + historical_dates = weekly_data['date'].tail(52) # Last year + historical_prices = weekly_data['discounted_price'].tail(52) + + # Plot 1: Main forecast chart + axes[0, 0].plot(historical_dates, historical_prices, label='Historical', linewidth=2, alpha=0.8) + axes[0, 0].plot(forecast_dates, forecasts, label='Forecast', linewidth=2, color='red') + + # Add confidence intervals + ci_lower, ci_upper = zip(*confidence_intervals) + axes[0, 0].fill_between(forecast_dates, ci_lower, ci_upper, alpha=0.3, color='red', label='90% CI') + + axes[0, 0].set_title('Price Forecast with Confidence Intervals') + axes[0, 0].set_xlabel('Date') + axes[0, 0].set_ylabel('Price ($)') + axes[0, 0].legend() + axes[0, 0].grid(True, alpha=0.3) + + # Plot 2: Forecast horizon detail + all_dates = list(historical_dates.tail(12)) + list(forecast_dates) + all_prices = list(historical_prices.tail(12)) + forecasts + + axes[0, 1].plot(all_dates[:12], all_prices[:12], 'b-', linewidth=2, label='Recent History') + axes[0, 1].plot(all_dates[11:], all_prices[11:], 'r-', linewidth=2, label='Forecast') + axes[0, 1].scatter([all_dates[11]], [all_prices[11]], color='black', s=50, zorder=5) + + axes[0, 1].set_title('Recent History vs Forecast') + axes[0, 1].set_xlabel('Date') + axes[0, 1].set_ylabel('Price ($)') + axes[0, 1].legend() + axes[0, 1].grid(True, alpha=0.3) + + # Plot 3: Forecast uncertainty + weeks = np.arange(1, len(forecasts) + 1) + uncertainty = np.array(ci_upper) - np.array(ci_lower) + + axes[1, 0].plot(weeks, uncertainty, marker='o', linewidth=2) + axes[1, 0].set_title('Forecast Uncertainty Over Time') + axes[1, 0].set_xlabel('Weeks Ahead') + axes[1, 0].set_ylabel('Uncertainty Range ($)') + axes[1, 0].grid(True, alpha=0.3) + + # Plot 4: Price change distribution + price_changes = [] + current_price = historical_prices.iloc[-1] + + for forecast in forecasts: + change = ((forecast - current_price) / current_price) * 100 + price_changes.append(change) + + axes[1, 1].bar(weeks, price_changes, alpha=0.7) + axes[1, 1].axhline(y=0, color='black', linestyle='--', alpha=0.5) + axes[1, 1].set_title('Expected Price Changes (%)') + axes[1, 1].set_xlabel('Weeks Ahead') + axes[1, 1].set_ylabel('Price Change (%)') + axes[1, 1].grid(True, alpha=0.3) + + plt.tight_layout() + plt.show() + +# ======================================================== +# 3. 
MAIN EXECUTION FUNCTIONS +# ======================================================== + +def run_notebook_validation(data, categories=None): + """Run validation framework with notebook output""" + + print(" WALK-FORWARD VALIDATION FRAMEWORK") + print("=" * 80) + + if categories is None: + categories = ['Pantry', 'Dairy, Eggs & Fridge', 'Health & Beauty'] + + validator = NotebookWalkForwardValidator() + all_results = {} + + for category in categories: + print(f"\n") + results = validator.validate_lstm_model(data, category) + if results: + all_results[category] = results + + # Overall comparison + if all_results: + print(f"\n OVERALL VALIDATION COMPARISON") + print("=" * 60) + + comparison_data = [] + for category, results in all_results.items(): + mae_scores = [r['metrics']['MAE'] for r in results] + mape_scores = [r['metrics']['MAPE'] for r in results] + + comparison_data.append({ + 'Category': category, + 'Avg MAE': np.mean(mae_scores), + 'Std MAE': np.std(mae_scores), + 'Avg MAPE': np.mean(mape_scores), + 'Best MAE': min(mae_scores), + 'Periods': len(results) + }) + + comparison_df = pd.DataFrame(comparison_data) + comparison_df = comparison_df.sort_values('Avg MAE') + + print("\n Performance Ranking:") + for i, row in comparison_df.iterrows(): + print(f" {comparison_df.index.get_loc(i)+1}. {row['Category']:<20} | " + f"MAE: {row['Avg MAE']:.3f} ± {row['Std MAE']:.3f} | " + f"MAPE: {row['Avg MAPE']:.1f}%") + + return all_results + +def run_notebook_forecasting(data, categories=None): + """Run production forecasting with notebook output""" + + print("PRODUCTION FORECASTING PIPELINE") + print("=" * 80) + + if categories is None: + categories = ['Pantry', 'Dairy, Eggs & Fridge', 'Health & Beauty'] + + pipeline = NotebookForecastingPipeline(forecast_horizon=12) + all_forecasts = {} + + for category in categories: + print(f"\n") + forecast_result = pipeline.train_and_forecast(data, category) + if forecast_result: + all_forecasts[category] = forecast_result + + # Summary comparison + if all_forecasts: + print(f"\nFORECAST SUMMARY COMPARISON") + print("=" * 60) + + summary_data = [] + for category, result in all_forecasts.items(): + current_price = result['last_actual_price'] + forecast_4w = result['forecasts'][3] + forecast_12w = result['forecasts'][11] + + change_4w = ((forecast_4w - current_price) / current_price) * 100 + change_12w = ((forecast_12w - current_price) / current_price) * 100 + + summary_data.append({ + 'Category': category, + 'Current ($)': current_price, + '4W Forecast ($)': forecast_4w, + '12W Forecast ($)': forecast_12w, + '4W Change (%)': change_4w, + '12W Change (%)': change_12w + }) + + summary_df = pd.DataFrame(summary_data) + + print("\nPrice Forecast Summary:") + for _, row in summary_df.iterrows(): + print(f" {row['Category']:<20} | " + f"Current: ${row['Current ($)']:.2f} | " + f"4W: ${row['4W Forecast ($)']:.2f} ({row['4W Change (%)']:+.1f}%) | " + f"12W: ${row['12W Forecast ($)']:.2f} ({row['12W Change (%)']:+.1f}%)") + + return all_forecasts + +def run_complete_notebook_pipeline(data_path='grocery_data_with_features.csv'): + """Execute complete pipeline with notebook output""" + + print("COMPLETE FORECASTING PIPELINE EXECUTION") + print("=" * 80) + + # Load data + print("Loading data...") + try: + data = pd.read_csv(data_path) + data['date'] = pd.to_datetime(data['date']) + print(f"Loaded {len(data):,} records from {data_path}") + except FileNotFoundError: + print(f"Data file not found: {data_path}") + print("Please ensure 'grocery_data_with_features.csv' exists in 
your current directory") + return None + + # Define categories + categories = ['Pantry', 'Dairy, Eggs & Fridge', 'Health & Beauty'] + print(f"Target categories: {categories}") + + # Phase 1: Validation + print(f"\nPHASE 1: WALK-FORWARD VALIDATION") + print("=" * 60) + + validation_results = run_notebook_validation(data, categories) + + # Phase 2: Production Forecasting + print(f"\nPHASE 2: PRODUCTION FORECASTING") + print("=" * 60) + + forecast_results = run_notebook_forecasting(data, categories) + + # Phase 3: Final Analysis + print(f"\nPHASE 3: ANALYSIS AND RECOMMENDATIONS") + print("=" * 60) + + if validation_results and forecast_results: + + print("\nModel Performance Analysis:") + print("-" * 30) + + for category in categories: + if category in validation_results: + val_results = validation_results[category] + mae_scores = [r['metrics']['MAE'] for r in val_results] + avg_mae = np.mean(mae_scores) + + if category in forecast_results: + print(f"{category}:") + print(f" Validation MAE: {avg_mae:.3f}") + print(f" Validation periods: {len(val_results)}") + + # Forecast confidence assessment + forecast_result = forecast_results[category] + ci_width_4w = forecast_result['confidence_intervals'][3][1] - forecast_result['confidence_intervals'][3][0] + ci_width_12w = forecast_result['confidence_intervals'][11][1] - forecast_result['confidence_intervals'][11][0] + + print(f" 4W CI width: ${ci_width_4w:.2f}") + print(f" 12W CI width: ${ci_width_12w:.2f}") + + print("\nKey Insights:") + print("-" * 15) + + # Find best performing category + best_category = None + best_mae = float('inf') + + for category in categories: + if category in validation_results: + val_results = validation_results[category] + mae_scores = [r['metrics']['MAE'] for r in val_results] + avg_mae = np.mean(mae_scores) + + if avg_mae < best_mae: + best_mae = avg_mae + best_category = category + + if best_category: + print(f" Best performing model: {best_category} (MAE: {best_mae:.3f})") + + # Forecast trend analysis + rising_categories = [] + stable_categories = [] + declining_categories = [] + + for category, result in forecast_results.items(): + current_price = result['last_actual_price'] + forecast_12w = result['forecasts'][11] + change_12w = ((forecast_12w - current_price) / current_price) * 100 + + if change_12w > 3: + rising_categories.append(f"{category} (+{change_12w:.1f}%)") + elif change_12w < -3: + declining_categories.append(f"{category} ({change_12w:.1f}%)") + else: + stable_categories.append(f"{category} ({change_12w:+.1f}%)") + + if rising_categories: + print(f" Rising prices (12W): {', '.join(rising_categories)}") + if declining_categories: + print(f" Declining prices (12W): {', '.join(declining_categories)}") + if stable_categories: + print(f" Stable prices (12W): {', '.join(stable_categories)}") + + print("\nRecommendations:") + print("-" * 16) + + recommendations = [] + + # Performance-based recommendations + for category in categories: + if category in validation_results: + val_results = validation_results[category] + mae_scores = [r['metrics']['MAE'] for r in val_results] + avg_mae = np.mean(mae_scores) + std_mae = np.std(mae_scores) + + if avg_mae > 1.0: + recommendations.append(f"Consider ensemble methods for {category} (high error rate)") + + if std_mae > avg_mae * 0.5: + recommendations.append(f"Investigate {category} prediction variability") + + # Forecast-based recommendations + for category, result in forecast_results.items(): + ci_width_12w = result['confidence_intervals'][11][1] - 
result['confidence_intervals'][11][0] + avg_price = result['last_actual_price'] + + if ci_width_12w / avg_price > 0.3: # CI width > 30% of price + recommendations.append(f"High uncertainty in {category} 12-week forecasts") + + if not recommendations: + recommendations.append("All models show good performance and reasonable uncertainty") + + for i, rec in enumerate(recommendations, 1): + print(f" {i}. {rec}") + + print(f"\nPIPELINE EXECUTION COMPLETE") + print("=" * 80) + print("All results displayed above. Models are ready for production use.") + + return { + 'validation_results': validation_results, + 'forecast_results': forecast_results, + 'execution_timestamp': datetime.now().isoformat() + } + +# ======================================================== +# 4. MONITORING AND ALERTS (NOTEBOOK VERSION) +# ======================================================== + +class NotebookForecastMonitor: + """Monitor forecast performance with notebook alerts""" + + def __init__(self): + self.alert_thresholds = { + 'mae_threshold': 2.0, + 'mape_threshold': 15.0, + 'bias_threshold': 1.0 + } + + def check_model_performance(self, validation_results): + """Check model performance and display alerts""" + + print("MODEL PERFORMANCE MONITORING") + print("=" * 40) + + alerts = [] + + for category, results in validation_results.items(): + if not results: + continue + + mae_scores = [r['metrics']['MAE'] for r in results] + mape_scores = [r['metrics']['MAPE'] for r in results] + + avg_mae = np.mean(mae_scores) + avg_mape = np.mean(mape_scores) + + # Check thresholds + if avg_mae > self.alert_thresholds['mae_threshold']: + severity = 'HIGH' if avg_mae > self.alert_thresholds['mae_threshold'] * 2 else 'MEDIUM' + alerts.append({ + 'category': category, + 'type': 'High MAE', + 'value': avg_mae, + 'severity': severity + }) + + if avg_mape > self.alert_thresholds['mape_threshold']: + severity = 'HIGH' if avg_mape > self.alert_thresholds['mape_threshold'] * 2 else 'MEDIUM' + alerts.append({ + 'category': category, + 'type': 'High MAPE', + 'value': avg_mape, + 'severity': severity + }) + + if alerts: + print("PERFORMANCE ALERTS:") + for alert in alerts: + print(f" [{alert['severity']}] {alert['category']}: {alert['type']} = {alert['value']:.3f}") + else: + print("No performance alerts. All models within acceptable thresholds.") + + return alerts + + def check_data_quality(self, data): + """Check data quality and display alerts""" + + print("\nDATA QUALITY MONITORING") + print("=" * 30) + + alerts = [] + + # Missing data check + missing_pct = data.isnull().sum().sum() / (len(data) * len(data.columns)) * 100 + if missing_pct > 5: + alerts.append(f"High missing data rate: {missing_pct:.1f}%") + + # Price outlier check + if 'discounted_price' in data.columns: + q99 = data['discounted_price'].quantile(0.99) + q01 = data['discounted_price'].quantile(0.01) + outlier_pct = ((data['discounted_price'] > q99) | + (data['discounted_price'] < q01)).mean() * 100 + + if outlier_pct > 3: + alerts.append(f"High price outlier rate: {outlier_pct:.1f}%") + + # Recent data check + if 'date' in data.columns: + latest_date = data['date'].max() + days_old = (datetime.now() - latest_date).days + + if days_old > 14: + alerts.append(f"Data is {days_old} days old") + + if alerts: + print("DATA QUALITY ALERTS:") + for alert in alerts: + print(f" WARNING: {alert}") + else: + print("Data quality checks passed.") + + return alerts + +# ======================================================== +# 5. 
EXECUTION EXAMPLE +# ======================================================== + +def execute_pipeline_example(): + """Example of how to run the complete pipeline""" + + print("FORECASTING PIPELINE EXECUTION EXAMPLE") + print("=" * 50) + print("\nTo run the complete pipeline, use:") + print("results = run_complete_notebook_pipeline()") + print("\nOr run components separately:") + print("validation_results = run_notebook_validation(data)") + print("forecast_results = run_notebook_forecasting(data)") + + print("\nThis will display:") + print(" - Walk-forward validation results with plots") + print(" - Production forecasts with confidence intervals") + print(" - Performance comparisons and recommendations") + print(" - All outputs directly in the notebook") + +# ======================================================== +# 6. READY TO EXECUTE +# ======================================================== + +print("NOTEBOOK FORECASTING PIPELINE LOADED") +print("=" * 40) +print("Ready to execute! Run:") +print("results = run_complete_notebook_pipeline()") +print("\nOr if you want to test individual components:") +print("validation_results = run_notebook_validation(df_with_features)") +print("forecast_results = run_notebook_forecasting(df_with_features)") + +# Execute the complete pipeline +results = run_complete_notebook_pipeline() +
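+
+# %%
+# OPTIONAL SKETCH 1: NON-DEGENERATE MONTE CARLO DROPOUT INTERVALS
+# ===============================================================
+# Keras runs `model.predict()` in inference mode, so Dropout layers (and the
+# `dropout` argument on LSTM layers) are inactive; repeated predict() calls on
+# the same input return identical values, and a Monte Carlo interval built that
+# way collapses to a point. The sketch below shows one way to draw stochastic
+# forward passes by calling the model directly with `training=True`. The helper
+# name `mc_dropout_interval` is illustrative, not part of the pipeline above,
+# and it assumes the target price is feature 0 of the fitted MinMaxScaler.
+
+def mc_dropout_interval(model, X_pred, scaler, n_features,
+                        n_samples=100, lower=5, upper=95):
+    """Return (mean, ci_lower, ci_upper) on the original price scale."""
+    samples = []
+    for _ in range(n_samples):
+        # training=True keeps dropout active, so each pass is a different draw
+        y_scaled = float(model(X_pred, training=True).numpy()[0, 0])
+        dummy = np.zeros((1, n_features))
+        dummy[0, 0] = y_scaled
+        samples.append(scaler.inverse_transform(dummy)[0, 0])
+    samples = np.array(samples)
+    return samples.mean(), np.percentile(samples, lower), np.percentile(samples, upper)
+
+# Example usage (assumes `pipeline` is a NotebookForecastingPipeline that has
+# already run train_and_forecast for 'Pantry', and that X_pred / feature_cols
+# are prepared exactly as inside train_and_forecast):
+# point, lo, hi = mc_dropout_interval(pipeline.models['Pantry'], X_pred,
+#                                     pipeline.scalers['Pantry'],
+#                                     n_features=len(feature_cols))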
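+
+# %%
+# OPTIONAL SKETCH 2: PERSIST FORECASTS FOR DOWNSTREAM USE
+# =======================================================
+# The pipeline above prints its results but does not write them to disk. A
+# minimal sketch, assuming `results` is the dict returned by
+# run_complete_notebook_pipeline() (keys as documented in that function); the
+# output filename is illustrative.
+
+def save_forecasts_to_csv(results, path='category_price_forecasts.csv'):
+    """Flatten per-category forecasts into one tidy CSV."""
+    if not results or not results.get('forecast_results'):
+        print("No forecast results to save")
+        return None
+    rows = []
+    for category, res in results['forecast_results'].items():
+        for week, (date, price, (lo, hi)) in enumerate(
+                zip(res['forecast_dates'], res['forecasts'],
+                    res['confidence_intervals']), start=1):
+            rows.append({
+                'category': category,
+                'weeks_ahead': week,
+                'forecast_date': pd.Timestamp(date).date(),
+                'forecast_price': round(float(price), 2),
+                'ci_lower': round(float(lo), 2),
+                'ci_upper': round(float(hi), 2),
+                'last_actual_price': round(float(res['last_actual_price']), 2),
+            })
+    forecast_df = pd.DataFrame(rows)
+    forecast_df.to_csv(path, index=False)
+    print(f"Saved {len(forecast_df)} forecast rows to {path}")
+    return forecast_df
+
+# Example usage:
+# forecast_df = save_forecasts_to_csv(results)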