From 7529e83a8a09b7bed579bd14ade6de92a6a0bab5 Mon Sep 17 00:00:00 2001 From: aumpatel Date: Fri, 28 Nov 2025 18:45:13 -0500 Subject: [PATCH 1/5] refactor: separate metric definition from metric value and share comparator instances - Add _MetricDefinition class to separate metric type (definition) from metric value (measurement) - Implement comparator caching via _get_shared_comparator() to ensure only one comparator object exists per unique (method, target, epsilon) combination - All solutions using the same comparison method now share the same comparator instance, reducing memory usage - Maintain full backward compatibility - no changes to Metric class API - Remove TODO comment as the refactoring addresses the concern about mixing metric and metric value concerns --- plexe/internal/models/entities/metric.py | 144 ++++++++++++++++++++--- 1 file changed, 127 insertions(+), 17 deletions(-) diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index f5355984..078954e9 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -22,6 +22,8 @@ from enum import Enum from functools import total_ordering +from typing import Optional +from weakref import WeakValueDictionary class ComparisonMethod(Enum): @@ -98,31 +100,138 @@ def compare(self, value1: float, value2: float) -> int: raise ValueError("Invalid comparison method.") -# todo: this class is a mess as it mixes concerns of a metric and a metric value; needs refactoring +# Internal cache for sharing MetricComparator instances across all metrics +# This ensures only one comparator object exists per unique (method, target, epsilon) combination +_comparator_cache: WeakValueDictionary = WeakValueDictionary() + + +def _get_shared_comparator(comparison_method: ComparisonMethod, target: Optional[float] = None, epsilon: float = 1e-9) -> MetricComparator: + """ + Get or create a shared MetricComparator instance. + + This function ensures that identical comparators are reused across all Metric instances, + reducing memory usage and ensuring consistency. + + :param comparison_method: The comparison method. + :param target: Optional target value for TARGET_IS_BETTER. + :param epsilon: Tolerance for floating-point comparisons. + :return: A shared MetricComparator instance. + """ + # Create a cache key from the comparator parameters + cache_key = (comparison_method, target, epsilon) + + # Try to get existing comparator from cache + if cache_key in _comparator_cache: + return _comparator_cache[cache_key] + + # Create new comparator and cache it + comparator = MetricComparator(comparison_method, target, epsilon) + _comparator_cache[cache_key] = comparator + return comparator + + +class _MetricDefinition: + """ + Internal class representing a metric type definition. + + This separates the metric definition (what it is) from the metric value (a measurement). + Metric definitions are immutable and can be shared across multiple metric values. + + This is an internal implementation detail - users should not interact with this class directly. + """ + + def __init__(self, name: str, comparator: MetricComparator): + """ + Initialize a metric definition. + + :param name: The name of the metric. + :param comparator: The shared comparator instance. 
+ """ + self._name = name + self._comparator = comparator + + @property + def name(self) -> str: + """The name of the metric.""" + return self._name + + @property + def comparator(self) -> MetricComparator: + """The shared comparator instance.""" + return self._comparator + + def __eq__(self, other) -> bool: + """Check if two metric definitions are equal.""" + if not isinstance(other, _MetricDefinition): + return False + return ( + self.name == other.name + and self.comparator.comparison_method == other.comparator.comparison_method + and self.comparator.target == other.comparator.target + ) + + def __hash__(self) -> int: + """Hash the metric definition.""" + return hash((self.name, self.comparator.comparison_method, self.comparator.target)) + + @total_ordering class Metric: """ Represents a metric with a name, a value, and a comparator for determining which metric is better. + This class internally separates the metric definition (type) from the metric value (measurement), + and automatically shares comparator instances to reduce memory usage. + Attributes: name (str): The name of the metric (e.g., 'accuracy', 'loss'). value (float): The numeric value of the metric. - comparator (MetricComparator): The comparison logic for the metric. + comparator (MetricComparator): The comparison logic for the metric (shared instance). """ def __init__(self, name: str, value: float = None, comparator: MetricComparator = None, is_worst: bool = False): """ Initializes a Metric object. + The comparator instance is automatically shared with other metrics that have the same + comparison method, target, and epsilon values, reducing memory usage. + :param name: The name of the metric. :param value: The numeric value of the metric. :param comparator: An instance of MetricComparator for comparison logic. :param is_worst: Indicates if the metric value is the worst possible value. """ - self.name = name + # Store the metric value (dynamic, instance-specific) self.value = value - self.comparator = comparator self.is_worst = is_worst or value is None + + # Get or create a shared comparator instance + if comparator is not None: + # Use the shared comparator cache to ensure we reuse identical comparators + # This is the key optimization: identical comparators are shared across all metrics + shared_comparator = _get_shared_comparator( + comparison_method=comparator.comparison_method, + target=comparator.target, + epsilon=comparator.epsilon + ) + else: + # If no comparator provided, raise an error as it's required for a valid metric + # This maintains the same behavior as before + raise ValueError("Metric requires a comparator. 
Provide a MetricComparator instance.") + + # Create internal metric definition (separates type from value) + # This is the key separation: definition (what it is) vs value (measurement) + self._definition = _MetricDefinition(name=name, comparator=shared_comparator) + + @property + def name(self) -> str: + """The name of the metric (for backward compatibility).""" + return self._definition.name + + @property + def comparator(self) -> MetricComparator: + """The shared comparator instance (for backward compatibility).""" + return self._definition.comparator def __gt__(self, other) -> bool: """ @@ -141,17 +250,18 @@ def __gt__(self, other) -> bool: if other.is_worst: return True - if self.name != other.name: - raise ValueError("Cannot compare metrics with different names.") - - if self.comparator.comparison_method != other.comparator.comparison_method: - raise ValueError("Cannot compare metrics with different comparison methods.") - - if ( - self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER - and self.comparator.target != other.comparator.target - ): - raise ValueError("Cannot compare 'TARGET_IS_BETTER' metrics with different target values.") + # Compare using definitions - this is cleaner and ensures consistency + if self._definition != other._definition: + # Provide detailed error message for backward compatibility + if self.name != other.name: + raise ValueError("Cannot compare metrics with different names.") + if self.comparator.comparison_method != other.comparator.comparison_method: + raise ValueError("Cannot compare metrics with different comparison methods.") + if ( + self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER + and self.comparator.target != other.comparator.target + ): + raise ValueError("Cannot compare 'TARGET_IS_BETTER' metrics with different target values.") return self.comparator.compare(self.value, other.value) < 0 @@ -171,9 +281,9 @@ def __eq__(self, other) -> bool: if self.is_worst or other.is_worst: return False + # Use definition equality for cleaner comparison return ( - self.name == other.name - and self.comparator.comparison_method == other.comparator.comparison_method + self._definition == other._definition and self.comparator.compare(self.value, other.value) == 0 ) From f413dd3405c609c3553bc6ba3db913f00dd9ba7b Mon Sep 17 00:00:00 2001 From: aumpatel Date: Fri, 28 Nov 2025 18:55:00 -0500 Subject: [PATCH 2/5] fix: include epsilon in MetricDefinition equality and hash methods - Add epsilon to __eq__ method to ensure metrics with different epsilon values are correctly differentiated - Add epsilon to __hash__ method to maintain hash contract (must include all fields used in __eq__) - Fix redundant condition in Metric.__gt__ method This fixes critical bugs identified in code review that could cause incorrect metric comparisons. 
--- plexe/internal/models/entities/metric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index 078954e9..0478b6db 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -168,11 +168,12 @@ def __eq__(self, other) -> bool: self.name == other.name and self.comparator.comparison_method == other.comparator.comparison_method and self.comparator.target == other.comparator.target + and self.comparator.epsilon == other.comparator.epsilon ) def __hash__(self) -> int: """Hash the metric definition.""" - return hash((self.name, self.comparator.comparison_method, self.comparator.target)) + return hash((self.name, self.comparator.comparison_method, self.comparator.target, self.comparator.epsilon)) @total_ordering @@ -244,7 +245,7 @@ def __gt__(self, other) -> bool: if not isinstance(other, Metric): return NotImplemented - if self.is_worst or (self.is_worst and other.is_worst): + if self.is_worst: return False if other.is_worst: From ac5284323911b43066ef2177e2f112aaf5ecf429 Mon Sep 17 00:00:00 2001 From: aumpatel Date: Fri, 28 Nov 2025 19:03:11 -0500 Subject: [PATCH 3/5] style: format metric.py with black - Reformat code to comply with black formatting standards - Fixes CI formatting check failure --- plexe/internal/models/entities/metric.py | 45 +++++++++++------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index 0478b6db..ce313f05 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -105,13 +105,15 @@ def compare(self, value1: float, value2: float) -> int: _comparator_cache: WeakValueDictionary = WeakValueDictionary() -def _get_shared_comparator(comparison_method: ComparisonMethod, target: Optional[float] = None, epsilon: float = 1e-9) -> MetricComparator: +def _get_shared_comparator( + comparison_method: ComparisonMethod, target: Optional[float] = None, epsilon: float = 1e-9 +) -> MetricComparator: """ Get or create a shared MetricComparator instance. - + This function ensures that identical comparators are reused across all Metric instances, reducing memory usage and ensuring consistency. - + :param comparison_method: The comparison method. :param target: Optional target value for TARGET_IS_BETTER. :param epsilon: Tolerance for floating-point comparisons. @@ -119,11 +121,11 @@ def _get_shared_comparator(comparison_method: ComparisonMethod, target: Optional """ # Create a cache key from the comparator parameters cache_key = (comparison_method, target, epsilon) - + # Try to get existing comparator from cache if cache_key in _comparator_cache: return _comparator_cache[cache_key] - + # Create new comparator and cache it comparator = MetricComparator(comparison_method, target, epsilon) _comparator_cache[cache_key] = comparator @@ -133,33 +135,33 @@ def _get_shared_comparator(comparison_method: ComparisonMethod, target: Optional class _MetricDefinition: """ Internal class representing a metric type definition. - + This separates the metric definition (what it is) from the metric value (a measurement). Metric definitions are immutable and can be shared across multiple metric values. - + This is an internal implementation detail - users should not interact with this class directly. """ - + def __init__(self, name: str, comparator: MetricComparator): """ Initialize a metric definition. 
- + :param name: The name of the metric. :param comparator: The shared comparator instance. """ self._name = name self._comparator = comparator - + @property def name(self) -> str: """The name of the metric.""" return self._name - + @property def comparator(self) -> MetricComparator: """The shared comparator instance.""" return self._comparator - + def __eq__(self, other) -> bool: """Check if two metric definitions are equal.""" if not isinstance(other, _MetricDefinition): @@ -170,7 +172,7 @@ def __eq__(self, other) -> bool: and self.comparator.target == other.comparator.target and self.comparator.epsilon == other.comparator.epsilon ) - + def __hash__(self) -> int: """Hash the metric definition.""" return hash((self.name, self.comparator.comparison_method, self.comparator.target, self.comparator.epsilon)) @@ -205,30 +207,28 @@ def __init__(self, name: str, value: float = None, comparator: MetricComparator # Store the metric value (dynamic, instance-specific) self.value = value self.is_worst = is_worst or value is None - + # Get or create a shared comparator instance if comparator is not None: # Use the shared comparator cache to ensure we reuse identical comparators # This is the key optimization: identical comparators are shared across all metrics shared_comparator = _get_shared_comparator( - comparison_method=comparator.comparison_method, - target=comparator.target, - epsilon=comparator.epsilon + comparison_method=comparator.comparison_method, target=comparator.target, epsilon=comparator.epsilon ) else: # If no comparator provided, raise an error as it's required for a valid metric # This maintains the same behavior as before raise ValueError("Metric requires a comparator. Provide a MetricComparator instance.") - + # Create internal metric definition (separates type from value) # This is the key separation: definition (what it is) vs value (measurement) self._definition = _MetricDefinition(name=name, comparator=shared_comparator) - + @property def name(self) -> str: """The name of the metric (for backward compatibility).""" return self._definition.name - + @property def comparator(self) -> MetricComparator: """The shared comparator instance (for backward compatibility).""" @@ -283,10 +283,7 @@ def __eq__(self, other) -> bool: return False # Use definition equality for cleaner comparison - return ( - self._definition == other._definition - and self.comparator.compare(self.value, other.value) == 0 - ) + return self._definition == other._definition and self.comparator.compare(self.value, other.value) == 0 def __repr__(self) -> str: """ From 05e63dbaa44e247f6cbf8337f1a745666f723d50 Mon Sep 17 00:00:00 2001 From: aumpatel Date: Tue, 2 Dec 2025 17:10:25 -0500 Subject: [PATCH 4/5] Generate Sample Data using LLM --- CODEBASE_STRUCTURE.md | 446 ++++++++++++++ COMPLETE_MODEL_BUILDING_WALKTHROUGH.md | 703 +++++++++++++++++++++++ plexe/internal/models/entities/metric.py | 152 +---- plexe/tools/datasets.py | 100 +++- 4 files changed, 1244 insertions(+), 157 deletions(-) create mode 100644 CODEBASE_STRUCTURE.md create mode 100644 COMPLETE_MODEL_BUILDING_WALKTHROUGH.md diff --git a/CODEBASE_STRUCTURE.md b/CODEBASE_STRUCTURE.md new file mode 100644 index 00000000..1c959314 --- /dev/null +++ b/CODEBASE_STRUCTURE.md @@ -0,0 +1,446 @@ +# Plexe Codebase Structure - Complete A-Z Documentation + +## Overview +Plexe is an agentic framework for building ML models from natural language. 
It uses a multi-agent system where specialized AI agents collaborate to analyze data, plan solutions, train models, and generate production-ready inference code. + +## Architecture Summary +- **Entry Points**: `main.py` (CLI), `server.py` (Web UI), `model_builder.py` (Core API) +- **Core System**: Object registry, state management, storage, interfaces +- **Agents**: 8 specialized agents for different ML tasks +- **Tools**: 14 tool modules providing agent capabilities +- **Internal**: Common utilities, models, datasets, schemas +- **Testing**: Unit, integration, and benchmark tests + +--- + +## A-Z File Structure and Dependencies + +### **A - Agents** (`plexe/agents/`) +**Purpose**: Multi-agent system for ML model building + +#### `agents.py` - Main Orchestrator +- **What it does**: Coordinates 8 specialized agents in ML workflow +- **Key classes**: `PlexeAgent`, `ModelGenerationResult` +- **Dependencies**: All other agent modules, tools, core modules +- **Flow**: Creates agents → Runs orchestrator → Extracts results + +#### `conversational.py` - User Interface Agent +- **What it does**: Guides users through ML requirements via chat +- **Key classes**: `ConversationalAgent` +- **Dependencies**: `tools.conversation`, `tools.datasets` +- **Flow**: User input → Validation → Model build initiation + +#### `dataset_analyser.py` - EDA Agent +- **What it does**: Performs exploratory data analysis on datasets +- **Key classes**: `EdaAgent` +- **Dependencies**: `tools.datasets`, `tools.schemas` +- **Flow**: Dataset → Analysis → EDA report registration + +#### `dataset_splitter.py` - Data Splitting Agent +- **What it does**: Intelligently splits datasets into train/val/test +- **Key classes**: `DatasetSplitterAgent` +- **Dependencies**: `tools.datasets` +- **Flow**: Dataset → Split strategy → Registered splits + +#### `feature_engineer.py` - Feature Engineering Agent +- **What it does**: Transforms raw data into optimized features +- **Key classes**: `FeatureEngineeringAgent` +- **Dependencies**: `tools.datasets`, `tools.execution`, `tools.validation` +- **Flow**: Raw data → Transformations → Enhanced datasets + +#### `model_packager.py` - MLOps Agent +- **What it does**: Creates production-ready inference code +- **Key classes**: `ModelPackagerAgent` +- **Dependencies**: `tools.context`, `tools.validation`, `tools.solutions` +- **Flow**: Training code → Inference code → Production package + +#### `model_planner.py` - ML Research Agent +- **What it does**: Plans ML approaches and solution strategies +- **Key classes**: `ModelPlannerAgent` +- **Dependencies**: `tools.datasets`, `tools.schemas`, `tools.solutions` +- **Flow**: Requirements → Analysis → Solution plans + +#### `model_trainer.py` - ML Engineer Agent +- **What it does**: Implements and executes training code +- **Key classes**: `ModelTrainerAgent` +- **Dependencies**: `tools.execution`, `tools.training`, `tools.validation` +- **Flow**: Plan → Code generation → Training → Validation + +#### `model_tester.py` - Model Testing Agent +- **What it does**: Tests and evaluates finalized models +- **Key classes**: `ModelTesterAgent` +- **Dependencies**: `tools.testing`, `tools.datasets`, `tools.schemas` +- **Flow**: Model → Testing → Evaluation report + +#### `schema_resolver.py` - Schema Resolution Agent +- **What it does**: Resolves and validates input/output schemas +- **Key classes**: `SchemaResolverAgent` +- **Dependencies**: `tools.schemas` +- **Flow**: Data analysis → Schema inference → Validation + +--- + +### **B - Core System** 
(`plexe/core/`) +**Purpose**: Fundamental types and functionality used across Plexe + +#### `state.py` - Model State Management +- **What it does**: Defines model lifecycle states +- **Key classes**: `ModelState` (Enum) +- **States**: DRAFT, BUILDING, READY, ERROR +- **Dependencies**: None (base enum) + +#### `storage.py` - Model Persistence +- **What it does**: Core implementation for saving/loading models and checkpoints +- **Key functions**: `_save_model_to_tar()`, `_load_model_data_from_tar()`, `_save_checkpoint_to_tar()` +- **Dependencies**: `core.state`, `config`, `internal.common.utils.pydantic_utils` +- **Flow**: Model data → Tar archive → File system + +#### `object_registry.py` - Global Object Registry +- **What it does**: Singleton registry for storing/retrieving objects by type and name +- **Key classes**: `ObjectRegistry`, `Item` +- **Dependencies**: None (core functionality) +- **Flow**: Register objects → Retrieve by type/name → Manage lifecycle + +#### `entities/solution.py` - Solution Container +- **What it does**: Represents complete ML solutions from planning to deployment +- **Key classes**: `Solution` +- **Dependencies**: `internal.models.entities.artifact`, `internal.models.entities.metric` +- **Flow**: Plan → Code → Execution → Results → Artifacts + +#### `interfaces/` - Core Interfaces +- **`predictor.py`**: Abstract base for inference code (`Predictor`) +- **`feature_transformer.py`**: Abstract base for feature transformers (`FeatureTransformer`) +- **Dependencies**: `internal.models.entities.artifact`, `pandas` +- **Purpose**: Define contracts for generated code + +--- + +### **C - Configuration** (`plexe/config.py`) +**Purpose**: Central configuration management + +- **What it does**: Loads configuration from YAML files and environment variables +- **Key components**: `config` object, `prompt_templates` object +- **Dependencies**: `yaml`, `pathlib` +- **Flow**: YAML files → Configuration object → Application settings + +--- + +### **D - Datasets** (`plexe/datasets.py`) +**Purpose**: Dataset generation and management + +- **What it does**: Provides `DatasetGenerator` class for synthetic data creation +- **Key classes**: `DatasetGenerator` +- **Dependencies**: `pandas`, `numpy`, `internal.common.datasets.interface` +- **Flow**: Schema → Synthetic data → TabularConvertible dataset + +--- + +### **E - Entry Points** +**Purpose**: Application entry points and main interfaces + +#### `main.py` - CLI Entry Point +- **What it does**: Launches Plexe assistant with web UI +- **Dependencies**: `server`, `uvicorn`, `webbrowser` +- **Flow**: Start server → Open browser → Keep running + +#### `server.py` - Web Server +- **What it does**: FastAPI server for conversational agent +- **Dependencies**: `agents.conversational`, `fastapi`, `websockets` +- **Flow**: WebSocket connection → Agent processing → Response + +#### `model_builder.py` - Core API +- **What it does**: Main API for building ML models programmatically +- **Key classes**: `ModelBuilder` +- **Dependencies**: All agents, core modules, tools, callbacks +- **Flow**: Intent + Data → Agent orchestration → Complete model + +--- + +### **F - File I/O** (`plexe/fileio.py`) +**Purpose**: High-level file operations for models and checkpoints + +- **What it does**: Provides user-friendly functions for model persistence +- **Key functions**: `save_model()`, `load_model()`, `save_checkpoint()`, `load_checkpoint()` +- **Dependencies**: `core.storage`, `models` +- **Flow**: Model object → Storage functions → File system + +--- + 
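+As a quick illustration of the registry pattern referenced throughout this document, the sketch below mirrors the `ObjectRegistry` calls that appear later in `COMPLETE_MODEL_BUILDING_WALKTHROUGH.md`. It is a minimal usage sketch, not code copied from the repository; the import path and exact signatures should be checked against `plexe/core/object_registry.py`.
+
+```python
+from plexe.core.object_registry import ObjectRegistry
+
+registry = ObjectRegistry()   # singleton: every call returns the same instance
+registry.clear()              # typically done at the start of a build
+
+# Objects are stored and retrieved by (type, name) pairs;
+# immutable=True marks entries that are not meant to be replaced later.
+registry.register(dict, "input_schema", {"bedrooms": "int", "bathrooms": "int"})
+registry.register(bool, "input_schema_is_locked", True, immutable=True)
+
+schema = registry.get(dict, "input_schema")
+locked = registry.get(bool, "input_schema_is_locked")
+```
+
+---
+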
+### **G - Internal Common** (`plexe/internal/common/`) +**Purpose**: Shared utilities and common functionality + +#### `provider.py` - LLM Provider Management +- **What it does**: Manages different LLM providers with retry logic +- **Key classes**: `ProviderConfig`, `Provider` +- **Dependencies**: `litellm`, `tenacity`, `pydantic` +- **Flow**: Provider config → API calls → Response with retries + +#### `datasets/` - Dataset Interfaces and Adapters +- **`interface.py`**: Core dataset interfaces (`Dataset`, `TabularConvertible`, etc.) +- **`adapter.py`**: Adapter for converting between dataset types +- **`tabular.py`**: Tabular dataset implementation +- **Dependencies**: `pandas`, `numpy`, `abc` +- **Flow**: Data → Interface implementation → Convertible formats + +#### `utils/` - Utility Functions +- **`agents.py`**: Agent utility functions +- **`chain_of_thought/`**: Chain of thought logging utilities +- **`dataset_storage.py`**: Dataset storage utilities +- **`dependency_utils.py`**: Dependency management +- **`markdown_utils.py`**: Markdown formatting +- **`model_state.py`**: Model state utilities +- **`model_utils.py`**: Model utility functions +- **`pandas_utils.py`**: Pandas utilities +- **`prompt_utils.py`**: Prompt template utilities +- **`pydantic_utils.py`**: Pydantic utilities +- **`response.py`**: Response formatting utilities + +--- + +### **H - Internal Models** (`plexe/internal/models/`) +**Purpose**: Internal model entities and execution logic + +#### `entities/` - Model Entities +- **`artifact.py`**: Represents model artifacts (files, data, handles) +- **`code.py`**: Represents code objects with performance metrics +- **`description.py`**: Model description entities +- **`metric.py`**: Metric comparison and evaluation logic +- **Dependencies**: `pathlib`, `io`, `dataclasses` + +#### `execution/` - Code Execution +- **`executor.py`**: Base executor interface +- **`process_executor.py`**: Process-based execution +- **`docker_executor.py`**: Docker-based execution +- **`ray_executor.py`**: Ray-based distributed execution +- **Dependencies**: `subprocess`, `docker`, `ray` + +#### `generation/` - Code Generation +- **`planning.py`**: Solution planning logic +- **`review.py`**: Code review functionality +- **`training.py`**: Training code generation +- **Dependencies**: `internal.common.provider`, `internal.common.utils` + +#### `validation/` - Code Validation +- **`validator.py`**: Main validation orchestrator +- **`composite.py`**: Composite validators +- **`primitives/`**: Basic validation primitives +- **Dependencies**: `ast`, `importlib`, `subprocess` + +#### `callbacks/` - Execution Callbacks +- **`chain_of_thought.py`**: Chain of thought logging +- **`checkpoint.py`**: Model checkpointing +- **`mlflow.py`**: MLflow integration +- **Dependencies**: `mlflow`, `plexe.core.object_registry` + +--- + +### **I - Internal Datasets** (`plexe/internal/datasets/`) +**Purpose**: Dataset generation and validation + +#### `core/` - Core Dataset Logic +- **`generation/`**: Dataset generation utilities +- **`validation/`**: Dataset validation logic +- **Dependencies**: `pandas`, `numpy`, `sklearn` + +#### `generator.py` - Dataset Generator +- **What it does**: Main dataset generation interface +- **Dependencies**: `core.generation`, `core.validation` + +#### `config.py` - Dataset Configuration +- **What it does**: Dataset generation configuration +- **Dependencies**: `pydantic` + +--- + +### **J - Internal Schemas** (`plexe/internal/schemas/`) +**Purpose**: Schema resolution and 
management + +#### `resolver.py` - Schema Resolver +- **What it does**: Resolves and validates schemas from data +- **Dependencies**: `pandas`, `pydantic`, `internal.common.utils` + +--- + +### **K - Models** (`plexe/models.py`) +**Purpose**: Main Model class and related functionality + +- **What it does**: Defines the main `Model` class representing a complete ML model +- **Key classes**: `Model` +- **Dependencies**: `core.state`, `core.interfaces`, `internal.models.entities` +- **Flow**: Model creation → Training → Inference → Persistence + +--- + +### **L - Tools** (`plexe/tools/`) +**Purpose**: Tool functions used by agents + +#### `code_analysis.py` - Code Analysis Tools +- **What it does**: Analyzes and extracts code from solutions +- **Dependencies**: `core.object_registry`, `internal.models.entities.code` + +#### `context.py` - Context Tools +- **What it does**: Provides context for code generation +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `conversation.py` - Conversation Tools +- **What it does**: Tools for conversational agent +- **Dependencies**: `core.object_registry`, `model_builder` + +#### `datasets.py` - Dataset Tools +- **What it does**: Dataset manipulation and registration tools +- **Dependencies**: `core.object_registry`, `internal.common.datasets` + +#### `evaluation.py` - Evaluation Tools +- **What it does**: Model evaluation and review tools +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `execution.py` - Execution Tools +- **What it does**: Code execution and training tools +- **Dependencies**: `core.object_registry`, `internal.models.execution` + +#### `metrics.py` - Metrics Tools +- **What it does**: Metric selection and comparison tools +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `response_formatting.py` - Response Formatting +- **What it does**: Formats agent responses +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `schemas.py` - Schema Tools +- **What it does**: Schema inference and validation tools +- **Dependencies**: `core.object_registry`, `internal.common.utils` + +#### `solutions.py` - Solution Management +- **What it does**: Solution creation and management tools +- **Dependencies**: `core.object_registry`, `core.entities.solution` + +#### `testing.py` - Testing Tools +- **What it does**: Model testing and evaluation tools +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `training.py` - Training Tools +- **What it does**: Training code generation and management +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `validation.py` - Validation Tools +- **What it does**: Code and model validation tools +- **Dependencies**: `core.object_registry`, `internal.models.validation` + +--- + +### **M - Templates** (`plexe/templates/`) +**Purpose**: Jinja2 templates for prompts and code generation + +#### `models/` - Model Templates +- **`feature_transformer.tmpl.py`**: Feature transformer code template +- **`predictor.tmpl.py`**: Predictor code template + +#### `prompts/` - Prompt Templates +- **`agent/`**: Agent-specific prompt templates +- **`planning/`**: Planning prompt templates +- **`review/`**: Review prompt templates +- **`schemas/`**: Schema prompt templates +- **`training/`**: Training prompt templates +- **`utils/`**: Utility prompt templates + +--- + +### **N - UI** (`plexe/ui/`) +**Purpose**: Web user interface + +#### `index.html` - Web Interface +- **What 
it does**: HTML interface for conversational agent +- **Dependencies**: WebSocket connection to server +- **Flow**: User input → WebSocket → Agent → Response + +--- + +### **O - Callbacks** (`plexe/callbacks.py`) +**Purpose**: Callback system for model building lifecycle + +- **What it does**: Defines callback interface and implementations +- **Key classes**: `Callback`, `MLFlowCallback`, `ModelCheckpointCallback` +- **Dependencies**: `mlflow`, `core.object_registry` +- **Flow**: Model events → Callback execution → External systems + +--- + +### **P - Tests** (`tests/`) +**Purpose**: Comprehensive test suite + +#### `unit/` - Unit Tests +- **What it does**: Tests individual components in isolation +- **Coverage**: Core modules, internal utilities, models +- **Dependencies**: `pytest`, individual modules + +#### `integration/` - Integration Tests +- **What it does**: End-to-end tests for complete ML workflows +- **Coverage**: Binary classification, regression, time series, etc. +- **Dependencies**: `pytest`, full plexe system + +#### `benchmark/` - Benchmark Tests +- **What it does**: Performance benchmarks using MLE-Bench +- **Coverage**: Kaggle competitions, model performance +- **Dependencies**: `mle-bench`, `kaggle` + +#### `fixtures/` - Test Fixtures +- **What it does**: Test data and model artifacts +- **Contents**: Legacy model files for compatibility testing + +--- + +## Dependency Flow Diagram + +``` +Entry Points (main.py, server.py, model_builder.py) + ↓ +Core System (state.py, storage.py, object_registry.py) + ↓ +Agents (8 specialized agents) + ↓ +Tools (14 tool modules) + ↓ +Internal Modules (common, models, datasets, schemas) + ↓ +External Dependencies (pandas, sklearn, litellm, etc.) +``` + +## Key Dependencies + +### External Dependencies +- **pandas**: Data manipulation +- **scikit-learn**: Machine learning algorithms +- **litellm**: LLM provider abstraction +- **smolagents**: Agent framework +- **pydantic**: Data validation +- **fastapi**: Web server +- **mlflow**: Experiment tracking + +### Internal Dependencies +- **ObjectRegistry**: Central object storage +- **Provider**: LLM provider management +- **Solution**: ML solution container +- **Artifact**: Model artifact representation +- **Metric**: Performance metric handling + +## Data Flow + +1. **Input**: User intent + datasets +2. **Analysis**: EDA agent analyzes data +3. **Planning**: Research agent creates solution plans +4. **Feature Engineering**: Feature agent transforms data +5. **Splitting**: Splitter agent creates train/val/test sets +6. **Training**: Engineer agent implements and trains models +7. **Packaging**: Ops agent creates inference code +8. **Testing**: Tester agent evaluates final model +9. 
**Output**: Complete model with artifacts and code + +## Architecture Patterns + +- **Multi-Agent System**: Specialized agents for different tasks +- **Object Registry**: Centralized object storage and retrieval +- **Tool-Based Architecture**: Agents use tools for capabilities +- **Template System**: Jinja2 templates for code generation +- **Callback System**: Lifecycle event handling +- **Interface Segregation**: Clear contracts between components +- **Dependency Injection**: Loose coupling between modules diff --git a/COMPLETE_MODEL_BUILDING_WALKTHROUGH.md b/COMPLETE_MODEL_BUILDING_WALKTHROUGH.md new file mode 100644 index 00000000..88a402b1 --- /dev/null +++ b/COMPLETE_MODEL_BUILDING_WALKTHROUGH.md @@ -0,0 +1,703 @@ +# Complete Model Building Walkthrough: House Price Prediction Example + +This document provides a step-by-step walkthrough of how the Plexe system builds a machine learning model, tracing every file and code path from user input to final prediction. + +## Example Scenario +**User Goal**: Build a model to predict house prices based on features like bedrooms, bathrooms, and square footage. + +**User Code**: +```python +from plexe import ModelBuilder +import pandas as pd + +# Load dataset +df = pd.read_csv("houses.csv") # Contains: bedrooms, bathrooms, square_footage, price + +# Build model +builder = ModelBuilder(provider="openai/gpt-4o-mini", verbose=True) +model = builder.build( + intent="Given a dataset of house features, predict the house price.", + datasets=[df], + input_schema={"bedrooms": int, "bathrooms": int, "square_footage": float}, + output_schema={"price": float}, + max_iterations=3 +) + +# Make prediction +prediction = model.predict({"bedrooms": 3, "bathrooms": 2, "square_footage": 1500.0}) +print(prediction) # {"price": 250000.0} +``` + +--- + +## Phase 1: User Initialization + +### Step 1.1: User Creates ModelBuilder +**File**: `plexe/model_builder.py` (lines 37-56) + +**What Happens**: +- User calls `ModelBuilder(provider="openai/gpt-4o-mini", verbose=True)` +- `__init__` method executes: + - Creates `ProviderConfig` from provider string (line 53) + - Sets `self.verbose = True` + - Sets `self.distributed = False` + - Calls `_create_working_dir()` which creates `./workdir/run-{timestamp}/` directory + - Working directory path stored in `self.working_dir` + +**Output**: ModelBuilder instance ready with provider config and working directory + +--- + +### Step 1.2: User Calls build() Method +**File**: `plexe/model_builder.py` (lines 66-254) + +**What Happens**: +1. **Registry Initialization** (lines 95-97): + - Gets singleton `ObjectRegistry()` instance + - Clears registry: `object_registry.clear()` + - Registry is now empty and ready + +2. **Parameter Validation** (lines 99-103): + - Checks `timeout` or `max_iterations` is set + - Validates `run_timeout <= timeout` if both provided + +3. 
**Schema Processing** (lines 105-110): + - Calls `map_to_basemodel("in", input_schema)` from `plexe/internal/common/utils/pydantic_utils.py` + - **File**: `pydantic_utils.py` (lines 51-102): + - Converts dict `{"bedrooms": int, ...}` to Pydantic `BaseModel` class + - Creates model named "in" with fields: bedrooms (int), bathrooms (int), square_footage (float) + - Validates types are allowed (int, float, str, bool, List variants) + - Same process for `output_schema` → creates "out" model with `price: float` + - Registers schema locks in registry: + - `object_registry.register(bool, "input_schema_is_locked", True, immutable=True)` + - `object_registry.register(bool, "output_schema_is_locked", True, immutable=True)` + +4. **Callback Setup** (lines 112-122): + - Initializes callbacks list + - Adds `ModelCheckpointCallback` if `enable_checkpointing=True` + - Creates `ChainOfThoughtModelCallback` with `ConsoleEmitter` + - Registers all callbacks in ObjectRegistry + +5. **Dataset Registration** (lines 125-136): + - Converts pandas DataFrame to `DatasetAdapter` via `DatasetAdapter.coerce(df)` + - **File**: `plexe/internal/common/datasets/adapter.py` + - Creates `TabularDataset` wrapper around DataFrame + - Registers as `dataset_0` in ObjectRegistry with `immutable=True` + - Formats schemas using `format_schema()` and registers in registry: + - `object_registry.register(dict, "input_schema", {"bedrooms": "int", ...})` + - `object_registry.register(dict, "output_schema", {"price": "float"})` + +6. **Model Identifier Generation** (line 139): + - Creates unique ID: `model-{timestamp}` + +7. **Callback Notification - Build Start** (lines 141-151): + - Calls `_notify_callbacks()` with event="build_start" + - All registered callbacks receive `BuildStateInfo` with intent, schemas, datasets + +8. **Agent Creation** (lines 154-165): + - Creates `PlexeAgent` instance (this is the multi-agent orchestrator) + - **File**: `plexe/agents/agents.py` (lines 63-194) + +--- + +## Phase 2: Multi-Agent System Initialization + +### Step 2.1: PlexeAgent Initialization +**File**: `plexe/agents/agents.py` (lines 63-194) + +**What Happens**: +The `PlexeAgent.__init__` creates all specialist agents: + +1. **ML Research Agent** (lines 105-110): + - **File**: `plexe/agents/model_planner.py` + - Creates `ModelPlannerAgent` with `ml_researcher_model_id` + - This agent will plan ML solution approaches + +2. **Schema Resolver Agent** (lines 113-117): + - **File**: `plexe/agents/schema_resolver.py` + - Creates `SchemaResolverAgent` with `orchestrator_model_id` + - Will infer schemas if not provided (already provided in our case) + +3. **EDA Agent** (lines 120-124): + - **File**: `plexe/agents/dataset_analyser.py` + - Creates `EdaAgent` with `orchestrator_model_id` + - Will analyze dataset structure and generate insights + +4. **Feature Engineering Agent** (lines 127-131): + - **File**: `plexe/agents/feature_engineer.py` + - Creates `FeatureEngineeringAgent` with `ml_engineer_model_id` + - Will transform raw data into better features + +5. **Dataset Splitter Agent** (lines 134-138): + - **File**: `plexe/agents/dataset_splitter.py` + - Creates `DatasetSplitterAgent` with `orchestrator_model_id` + - Will split data into train/validation/test sets + +6. **Model Trainer Agent** (lines 141-148): + - **File**: `plexe/agents/model_trainer.py` + - Creates `ModelTrainerAgent` with `ml_engineer_model_id` and `tool_model_id` + - Will implement and execute training code + +7. 
**Model Packager Agent** (lines 151-157): + - **File**: `plexe/agents/model_packager.py` + - Creates `ModelPackagerAgent` with `ml_ops_engineer_model_id` + - Will generate inference/prediction code + +8. **Model Tester Agent** (lines 160-164): + - **File**: `plexe/agents/model_tester.py` + - Creates `ModelTesterAgent` with `ml_engineer_model_id` + - Will test and evaluate the final model + +9. **Orchestrator Agent** (lines 167-194): + - Creates `CodeAgent` (from smolagents library) named "Orchestrator" + - Configures with tools: + - `get_select_target_metric()` - selects evaluation metric + - `get_review_finalised_model()` - reviews final model + - `get_latest_datasets` - retrieves datasets from registry + - `get_solution_performances` - gets performance metrics + - `register_best_solution` - marks best solution + - `format_final_orchestrator_agent_response` - formats final response + - Registers all specialist agents as `managed_agents` + - Sets `max_steps=30`, `planning_interval=7` + +**Output**: Fully initialized multi-agent system with 8 specialist agents + 1 orchestrator + +--- + +### Step 2.2: Agent Prompt Generation +**File**: `plexe/model_builder.py` (lines 167-185) + +**What Happens**: +- Calls `prompt_templates.agent_builder_prompt()` with: + - intent: "Given a dataset of house features, predict the house price." + - input_schema: JSON string of schema dict + - output_schema: JSON string of schema dict + - datasets: ["dataset_0"] + - working_dir: "./workdir/run-{timestamp}/" + - max_iterations: 3 +- **File**: `plexe/config/prompt_templates.py` (or similar) +- Generates comprehensive prompt instructing orchestrator on the task +- Creates `additional_args` dict with all context + +--- + +## Phase 3: Orchestrator Agent Execution + +### Step 3.1: Orchestrator Starts Workflow +**File**: `plexe/agents/agents.py` (lines 196-204) + +**What Happens**: +- Calls `agent.run(agent_prompt, additional_args=additional_args)` +- Orchestrator (CodeAgent) begins executing steps +- Uses planning every 7 steps to decide next actions +- Can call tools or delegate to specialist agents + +**Typical Orchestrator Workflow**: +1. Select target metric (e.g., RMSE for regression) +2. Analyze dataset (delegate to EDA agent) +3. Resolve schemas if needed (delegate to SchemaResolver - skipped if provided) +4. Plan solutions (delegate to ML Research agent) +5. Split dataset (delegate to Dataset Splitter agent) +6. Train models (delegate to ML Engineer agent) +7. Package models (delegate to ML Ops agent) +8. Test models (delegate to Model Tester agent) +9. Select best solution + +--- + +## Phase 4: Specialist Agent Execution + +### Step 4.1: Target Metric Selection +**Tool**: `get_select_target_metric()` from `plexe/tools/metrics.py` + +**What Happens**: +- Orchestrator calls this tool +- Tool analyzes intent and output schema +- Determines this is a regression problem (output is float) +- Selects appropriate metric (e.g., RMSE, MAE) +- Registers metric in ObjectRegistry + +--- + +### Step 4.2: EDA Agent Execution +**File**: `plexe/agents/dataset_analyser.py` + +**What Happens**: +1. Orchestrator delegates task: "Analyze dataset_0" +2. EDA Agent receives task +3. Agent calls `get_latest_datasets()` tool: + - **File**: `plexe/tools/datasets.py` (lines 394-451) + - Retrieves `dataset_0` from ObjectRegistry + - Returns dict: `{"raw": "dataset_0"}` +4. 
Agent calls `get_dataset_preview()` tool: + - **File**: `plexe/tools/datasets.py` (lines 224-283) + - Gets dataset from registry: `object_registry.get(TabularConvertible, "dataset_0")` + - Converts to pandas: `dataset.to_pandas()` + - Generates preview with shape, dtypes, sample rows, statistics, missing values + - Returns preview dict +5. Agent analyzes data using LLM (via Provider): + - **File**: `plexe/internal/common/provider.py` + - Makes LLM call with system prompt + data preview + - LLM generates EDA insights +6. Agent calls `register_eda_report()` tool: + - **File**: `plexe/tools/datasets.py` (lines 287-341) + - Creates structured EDA report dict with: + - overview: dataset stats, target variable analysis + - feature_engineering_opportunities: transformation needs + - data_quality_challenges: data issues + - data_preprocessing_requirements: preprocessing steps + - feature_importance: predictive potential + - insights: key findings + - recommendations: actionable steps + - Registers in ObjectRegistry: `object_registry.register(dict, "eda_report_dataset_0", report)` + +**Output**: EDA report registered in ObjectRegistry + +--- + +### Step 4.3: Schema Resolution (Skipped) +**File**: `plexe/agents/schema_resolver.py` + +**What Happens**: +- Orchestrator checks if schemas are locked +- Finds `input_schema_is_locked = True` in registry +- Skips schema resolution (schemas already provided) +- If schemas weren't provided, this agent would: + - Analyze intent and sample data + - Infer input/output schemas + - Register them in ObjectRegistry + +--- + +### Step 4.4: ML Research Agent - Solution Planning +**File**: `plexe/agents/model_planner.py` + +**What Happens**: +1. Orchestrator delegates: "Plan ML solutions for house price prediction" +2. Agent retrieves context: + - Gets EDA report: `object_registry.get(dict, "eda_report_dataset_0")` + - Gets schemas from registry + - Gets datasets: `get_latest_datasets()` +3. Agent uses LLM to generate solution plans: + - Analyzes problem type (regression) + - Considers dataset characteristics from EDA + - Generates multiple solution approaches: + - Solution 1: Linear Regression + - Solution 2: Random Forest + - Solution 3: Gradient Boosting (XGBoost) + - Each solution includes: + - Model type and framework + - Rationale + - Expected performance + - Implementation approach +4. Agent registers solution plans in ObjectRegistry +5. **File**: `plexe/core/entities/solution.py` - Solution objects created + +**Output**: Multiple solution plans registered + +--- + +### Step 4.5: Feature Engineering Agent (Optional) +**File**: `plexe/agents/feature_engineer.py` + +**What Happens** (if orchestrator decides feature engineering is needed): +1. Agent retrieves EDA report and raw dataset +2. Analyzes feature engineering opportunities from EDA +3. Generates transformation code (e.g., normalization, encoding) +4. Applies transformations to create new dataset +5. Registers transformed dataset: `dataset_0_transformed` +6. Registers transformation code in ObjectRegistry +7. Calls `register_feature_engineering_report()`: + - **File**: `plexe/tools/datasets.py` (lines 344-390) + - Creates feature engineering report + - Registers in ObjectRegistry + +**Output**: Transformed dataset and feature engineering code (if applicable) + +--- + +### Step 4.6: Dataset Splitter Agent +**File**: `plexe/agents/dataset_splitter.py` + +**What Happens**: +1. Orchestrator delegates: "Split dataset into train/validation/test" +2. 
Agent calls `get_dataset_for_splitting()`: + - **File**: `plexe/tools/datasets.py` (lines 455-507) + - Finds best dataset (prefers transformed, falls back to raw) + - Returns "dataset_0" (or "dataset_0_transformed" if exists) +3. Agent retrieves dataset from registry +4. Agent generates splitting strategy using LLM: + - Analyzes dataset size and characteristics + - Decides split ratios (e.g., 70/15/15 or 80/10/10) + - Considers time-series vs random split needs +5. Agent generates splitting code (Python) +6. Agent executes splitting code to create splits +7. Agent calls `register_split_datasets()`: + - **File**: `plexe/tools/datasets.py` (lines 27-92) + - Registers three datasets: + - `dataset_0_train` (TabularConvertible) + - `dataset_0_val` (TabularConvertible) + - `dataset_0_test` (TabularConvertible) + - Registers splitting code: `object_registry.register(Code, "dataset_splitting_code", Code(code))` + +**Output**: Train/validation/test datasets registered in ObjectRegistry + +--- + +### Step 4.7: Model Trainer Agent - Training Implementation +**File**: `plexe/agents/model_trainer.py` + +**What Happens** (for each solution plan): +1. Orchestrator delegates: "Train Solution 1: Linear Regression" +2. Agent retrieves: + - Training datasets: `get_training_datasets()` → `{"train": "dataset_0_train", "validation": "dataset_0_val"}` + - Solution plan from registry + - EDA report and feature engineering report (if available) +3. Agent generates training code using LLM: + - **File**: Uses `tool_model_id` for code generation + - Creates Python code that: + - Loads train/val datasets + - Implements model (e.g., sklearn LinearRegression) + - Trains model on training set + - Evaluates on validation set + - Saves model artifacts + - Code is validated and formatted +4. Agent executes training code: + - **File**: `plexe/tools/execution.py` + - Code runs in isolated environment + - Model trains on `dataset_0_train` + - Validates on `dataset_0_val` + - Generates performance metric (e.g., RMSE = 25000.0) + - Saves model artifacts (pickle files, weights, etc.) +5. Agent registers results: + - **File**: `plexe/tools/training.py` + - Creates `Solution` object with: + - training_code: generated Python code string + - performance: Metric object (name="RMSE", value=25000.0) + - model_artifacts: List[Artifact] (file paths to saved models) + - Registers Solution in ObjectRegistry +6. Process repeats for each solution (Solution 2: Random Forest, Solution 3: XGBoost) + +**Output**: Multiple trained solutions with performance metrics in ObjectRegistry + +--- + +### Step 4.8: Model Packager Agent - Inference Code Generation +**File**: `plexe/agents/model_packager.py` + +**What Happens** (for each solution): +1. Orchestrator delegates: "Generate inference code for Solution 1" +2. Agent retrieves: + - Solution object with training code and artifacts + - Input/output schemas from registry + - Feature transformer code (if exists) +3. Agent generates inference code using LLM: + - Creates Python class `PredictorImplementation` that: + - Inherits from `Predictor` interface + - **File**: `plexe/core/interfaces/predictor.py` + - Implements `__init__(self, artifacts: List[Artifact])` + - Implements `predict(self, inputs: dict) -> dict` + - Code loads model artifacts + - Applies feature transformations (if any) + - Makes predictions + - Returns dict matching output schema +4. 
Agent validates inference code: + - **File**: `plexe/tools/validation.py` + - Creates sample inputs using `create_input_sample()` + - **File**: `plexe/tools/datasets.py` (lines 97-151) + - Tests prediction on sample inputs + - Validates output matches schema +5. Agent registers inference code: + - Updates Solution object with `inference_code` attribute + - Code stored as string in Solution + +**Output**: Inference code for each solution registered + +--- + +### Step 4.9: Model Tester Agent - Final Evaluation +**File**: `plexe/agents/model_tester.py` + +**What Happens**: +1. Orchestrator delegates: "Test all solutions on test set" +2. Agent retrieves: + - Test dataset: `get_test_dataset()` → "dataset_0_test" + - All solution objects from registry +3. For each solution: + - Agent generates testing code + - Code loads model and inference code + - Runs predictions on entire test set + - Calculates final performance metrics + - Generates evaluation report (confusion matrix, feature importance, etc.) +4. Agent updates Solution objects with: + - `testing_code`: testing code string + - `model_evaluation_report`: detailed evaluation dict + - Test performance metrics + +**Output**: All solutions tested and evaluated + +--- + +## Phase 5: Solution Selection and Finalization + +### Step 5.1: Best Solution Selection +**Tool**: `get_solution_performances()` from `plexe/tools/evaluation.py` + +**What Happens**: +1. Orchestrator calls tool to get all solution performances +2. Tool retrieves all Solution objects from ObjectRegistry +3. Extracts performance metrics from each +4. Returns comparison of all solutions + +**Tool**: `register_best_solution()` from `plexe/tools/training.py` + +**What Happens**: +1. Orchestrator compares all solutions +2. Selects best based on metric (e.g., lowest RMSE) +3. Registers best solution: `object_registry.register(Solution, "best_performing_solution", best_solution)` + +--- + +### Step 5.2: Final Response Formatting +**Tool**: `format_final_orchestrator_agent_response()` from `plexe/tools/response_formatting.py` + +**What Happens**: +1. Orchestrator calls tool to format final response +2. Tool collects: + - Best solution details + - Performance metrics + - Model metadata +3. Formats into structured response dict +4. Returns to orchestrator + +--- + +### Step 5.3: PlexeAgent.run() Completion +**File**: `plexe/agents/agents.py` (lines 196-305) + +**What Happens**: +1. Orchestrator finishes execution +2. `agent.run()` returns result (AgentText or dict) +3. Code extracts best solution (lines 217-219): + - `best_solution = object_registry.get(Solution, "best_performing_solution")` + - `training_code = best_solution.training_code` + - `inference_code = best_solution.inference_code` +4. Extracts performance metrics (lines 222-245): + - Gets metric from result or solution + - Creates `Metric` object with name, value, comparator +5. Compiles inference code (lines 251-255): + - Creates new module: `types.ModuleType("predictor")` + - Executes inference code string in module namespace + - Gets `PredictorImplementation` class from module + - Instantiates: `predictor = predictor_class(best_solution.model_artifacts)` +6. Extracts additional code (lines 258-291): + - Feature transformer code (if exists) + - Dataset split code (if exists) + - Testing code (if exists) + - Evaluation report (if exists) +7. 
Creates `ModelGenerationResult` (lines 293-305): + - training_source_code: Python code string + - inference_source_code: Python code string + - feature_transformer_source_code: Python code string (or None) + - dataset_split_code: Python code string (or None) + - predictor: Predictor instance (ready to use) + - model_artifacts: List[Artifact] + - performance: Metric object + - test_performance: Metric object + - testing_source_code: Python code string (or None) + - evaluation_report: Dict (or None) + - metadata: Dict with model info + +**Output**: `ModelGenerationResult` object returned to ModelBuilder + +--- + +## Phase 6: ModelBuilder Finalization + +### Step 6.1: Extract Final Schemas +**File**: `plexe/model_builder.py` (lines 189-192) + +**What Happens**: +- Calls `get_solution_schemas("best_performing_solution")` +- **File**: `plexe/tools/schemas.py` +- Retrieves final input/output schemas (may have been refined) +- Converts to Pydantic models using `map_to_basemodel()` + +--- + +### Step 6.2: Build Metadata +**File**: `plexe/model_builder.py` (lines 194-217) + +**What Happens**: +1. Creates metadata dict with provider info +2. Updates with metadata from `ModelGenerationResult` +3. Extracts EDA reports from ObjectRegistry: + - Gets `eda_report_dataset_0` from registry + - Formats as markdown using `format_eda_report_markdown()` + - Adds to metadata + +--- + +### Step 6.3: Create Model Instance +**File**: `plexe/model_builder.py` (lines 219-236) + +**What Happens**: +1. Imports `Model` class: `from plexe.models import Model` +2. Creates Model instance: + ```python + model = Model( + intent=intent, + input_schema=final_input_schema, + output_schema=final_output_schema + ) + ``` +3. **File**: `plexe/models.py` (lines 93-138) + - `__init__` executes: + - Stores intent, schemas + - Initializes `state = ModelState.DRAFT` + - Creates unique identifier + - Creates working directory + - Initializes empty predictor, artifacts, metrics +4. Populates model with results: + - `model.identifier = model_identifier` + - `model.predictor = generated.predictor` (Predictor instance) + - `model.trainer_source = generated.training_source_code` + - `model.predictor_source = generated.inference_source_code` + - `model.feature_transformer_source = generated.feature_transformer_source_code` + - `model.dataset_splitter_source = generated.dataset_split_code` + - `model.testing_source = generated.testing_source_code` + - `model.artifacts = generated.model_artifacts` + - `model.metric = generated.test_performance` + - `model.evaluation_report = generated.evaluation_report` + - `model.metadata.update(metadata)` + - `model.training_data = training_data` (actual Dataset objects) + - `model.state = ModelState.READY` + +--- + +### Step 6.4: Final Callback Notification +**File**: `plexe/model_builder.py` (lines 238-252) + +**What Happens**: +- Calls `_notify_callbacks()` with event="build_end" +- All callbacks receive final BuildStateInfo with: + - Completed model + - Final metrics + - Source code + - Artifacts + +--- + +### Step 6.5: Return Model +**File**: `plexe/model_builder.py` (line 254) + +**What Happens**: +- Returns completed `Model` instance to user +- Model is in `ModelState.READY` state +- All code, artifacts, and metadata populated + +--- + +## Phase 7: User Makes Prediction + +### Step 7.1: User Calls model.predict() +**File**: `plexe/models.py` (lines 227-245) + +**What Happens**: +1. User calls: `model.predict({"bedrooms": 3, "bathrooms": 2, "square_footage": 1500.0})` +2. 
`predict()` method executes: + - Checks `self.state == ModelState.READY` (line 235) + - If `validate_input=True`, validates input against `self.input_schema` (line 239) + - Calls `self.predictor.predict(x)` (line 240) + - **File**: `plexe/core/interfaces/predictor.py` + - This is the `PredictorImplementation` class compiled earlier + - Executes inference code: + - Loads model artifacts + - Applies feature transformations (if any) + - Runs model forward pass + - Returns prediction dict + - If `validate_output=True`, validates output against `self.output_schema` (line 242) + - Returns prediction dict: `{"price": 250000.0}` + +**Output**: Prediction dictionary matching output schema + +--- + +## Complete File Trace Summary + +### User-Facing Files: +1. `plexe/model_builder.py` - Main entry point +2. `plexe/models.py` - Model class definition + +### Core System Files: +3. `plexe/agents/agents.py` - Multi-agent orchestrator +4. `plexe/core/object_registry.py` - Shared state registry +5. `plexe/core/state.py` - Model state definitions +6. `plexe/core/interfaces/predictor.py` - Predictor interface + +### Agent Files: +7. `plexe/agents/model_planner.py` - Solution planning +8. `plexe/agents/dataset_analyser.py` - EDA analysis +9. `plexe/agents/schema_resolver.py` - Schema inference +10. `plexe/agents/feature_engineer.py` - Feature engineering +11. `plexe/agents/dataset_splitter.py` - Dataset splitting +12. `plexe/agents/model_trainer.py` - Model training +13. `plexe/agents/model_packager.py` - Inference code generation +14. `plexe/agents/model_tester.py` - Model testing + +### Tool Files: +15. `plexe/tools/datasets.py` - Dataset operations +16. `plexe/tools/metrics.py` - Metric selection +17. `plexe/tools/training.py` - Training operations +18. `plexe/tools/execution.py` - Code execution +19. `plexe/tools/validation.py` - Code validation +20. `plexe/tools/evaluation.py` - Model evaluation +21. `plexe/tools/schemas.py` - Schema operations +22. `plexe/tools/response_formatting.py` - Response formatting + +### Utility Files: +23. `plexe/internal/common/utils/pydantic_utils.py` - Schema conversion +24. `plexe/internal/common/provider.py` - LLM provider interface +25. `plexe/internal/common/datasets/adapter.py` - Dataset adapters +26. `plexe/core/entities/solution.py` - Solution data structures +27. `plexe/internal/models/entities/artifact.py` - Artifact definitions +28. `plexe/internal/models/entities/metric.py` - Metric definitions + +--- + +## Data Flow Diagram + +``` +User Input + ↓ +ModelBuilder.build() + ↓ +ObjectRegistry (cleared, then populated) + ↓ +PlexeAgent (orchestrator + 8 specialists) + ↓ +Orchestrator Agent (coordinates workflow) + ├─→ EDA Agent → EDA Report → Registry + ├─→ Schema Resolver → Schemas → Registry (if needed) + ├─→ ML Research Agent → Solution Plans → Registry + ├─→ Feature Engineer → Transformed Data → Registry (optional) + ├─→ Dataset Splitter → Train/Val/Test → Registry + ├─→ Model Trainer → Trained Models → Registry + ├─→ Model Packager → Inference Code → Registry + └─→ Model Tester → Evaluation Reports → Registry + ↓ +Best Solution Selected → Registry + ↓ +ModelGenerationResult (extracted from registry) + ↓ +Model Instance (populated with results) + ↓ +User receives Model (ready for predictions) + ↓ +model.predict() → Predictor.predict() → Prediction +``` + +--- + +This walkthrough covers every major file and code path in the system, showing how data flows from user input through the multi-agent system to final predictions. 
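+
+---
+
+As a closing illustration, here is a minimal sketch of the kind of `PredictorImplementation` class the Model Packager Agent produces for this example. It follows the `Predictor` interface described in Step 4.8 (`__init__(self, artifacts)` and `predict(self, inputs) -> dict`); the `artifact.path` attribute, the pickled scikit-learn regressor, and the feature ordering are illustrative assumptions rather than the exact generated code.
+
+```python
+import pickle
+from typing import Dict, List
+
+from plexe.core.interfaces.predictor import Predictor
+from plexe.internal.models.entities.artifact import Artifact
+
+
+class PredictorImplementation(Predictor):
+    """Illustrative sketch: load a pickled regressor artifact and predict a price."""
+
+    def __init__(self, artifacts: List[Artifact]):
+        # Assumption: the first artifact exposes a local file path to the trained model.
+        with open(artifacts[0].path, "rb") as f:
+            self._model = pickle.load(f)
+
+    def predict(self, inputs: Dict) -> Dict:
+        # Feature order must match whatever the generated training code used.
+        features = [[inputs["bedrooms"], inputs["bathrooms"], inputs["square_footage"]]]
+        return {"price": float(self._model.predict(features)[0])}
+```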
+ diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index ce313f05..4978bab5 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -22,8 +22,6 @@ from enum import Enum from functools import total_ordering -from typing import Optional -from weakref import WeakValueDictionary class ComparisonMethod(Enum): @@ -100,140 +98,32 @@ def compare(self, value1: float, value2: float) -> int: raise ValueError("Invalid comparison method.") -# Internal cache for sharing MetricComparator instances across all metrics -# This ensures only one comparator object exists per unique (method, target, epsilon) combination -_comparator_cache: WeakValueDictionary = WeakValueDictionary() - - -def _get_shared_comparator( - comparison_method: ComparisonMethod, target: Optional[float] = None, epsilon: float = 1e-9 -) -> MetricComparator: - """ - Get or create a shared MetricComparator instance. - - This function ensures that identical comparators are reused across all Metric instances, - reducing memory usage and ensuring consistency. - - :param comparison_method: The comparison method. - :param target: Optional target value for TARGET_IS_BETTER. - :param epsilon: Tolerance for floating-point comparisons. - :return: A shared MetricComparator instance. - """ - # Create a cache key from the comparator parameters - cache_key = (comparison_method, target, epsilon) - - # Try to get existing comparator from cache - if cache_key in _comparator_cache: - return _comparator_cache[cache_key] - - # Create new comparator and cache it - comparator = MetricComparator(comparison_method, target, epsilon) - _comparator_cache[cache_key] = comparator - return comparator - - -class _MetricDefinition: - """ - Internal class representing a metric type definition. - - This separates the metric definition (what it is) from the metric value (a measurement). - Metric definitions are immutable and can be shared across multiple metric values. - - This is an internal implementation detail - users should not interact with this class directly. - """ - - def __init__(self, name: str, comparator: MetricComparator): - """ - Initialize a metric definition. - - :param name: The name of the metric. - :param comparator: The shared comparator instance. - """ - self._name = name - self._comparator = comparator - - @property - def name(self) -> str: - """The name of the metric.""" - return self._name - - @property - def comparator(self) -> MetricComparator: - """The shared comparator instance.""" - return self._comparator - - def __eq__(self, other) -> bool: - """Check if two metric definitions are equal.""" - if not isinstance(other, _MetricDefinition): - return False - return ( - self.name == other.name - and self.comparator.comparison_method == other.comparator.comparison_method - and self.comparator.target == other.comparator.target - and self.comparator.epsilon == other.comparator.epsilon - ) - - def __hash__(self) -> int: - """Hash the metric definition.""" - return hash((self.name, self.comparator.comparison_method, self.comparator.target, self.comparator.epsilon)) - - +# todo: this class is a mess as it mixes concerns of a metric and a metric value; needs refactoring @total_ordering class Metric: """ Represents a metric with a name, a value, and a comparator for determining which metric is better. 
- This class internally separates the metric definition (type) from the metric value (measurement), - and automatically shares comparator instances to reduce memory usage. - Attributes: name (str): The name of the metric (e.g., 'accuracy', 'loss'). value (float): The numeric value of the metric. - comparator (MetricComparator): The comparison logic for the metric (shared instance). + comparator (MetricComparator): The comparison logic for the metric. """ def __init__(self, name: str, value: float = None, comparator: MetricComparator = None, is_worst: bool = False): """ Initializes a Metric object. - The comparator instance is automatically shared with other metrics that have the same - comparison method, target, and epsilon values, reducing memory usage. - :param name: The name of the metric. :param value: The numeric value of the metric. :param comparator: An instance of MetricComparator for comparison logic. :param is_worst: Indicates if the metric value is the worst possible value. """ - # Store the metric value (dynamic, instance-specific) + self.name = name self.value = value + self.comparator = comparator self.is_worst = is_worst or value is None - # Get or create a shared comparator instance - if comparator is not None: - # Use the shared comparator cache to ensure we reuse identical comparators - # This is the key optimization: identical comparators are shared across all metrics - shared_comparator = _get_shared_comparator( - comparison_method=comparator.comparison_method, target=comparator.target, epsilon=comparator.epsilon - ) - else: - # If no comparator provided, raise an error as it's required for a valid metric - # This maintains the same behavior as before - raise ValueError("Metric requires a comparator. Provide a MetricComparator instance.") - - # Create internal metric definition (separates type from value) - # This is the key separation: definition (what it is) vs value (measurement) - self._definition = _MetricDefinition(name=name, comparator=shared_comparator) - - @property - def name(self) -> str: - """The name of the metric (for backward compatibility).""" - return self._definition.name - - @property - def comparator(self) -> MetricComparator: - """The shared comparator instance (for backward compatibility).""" - return self._definition.comparator - def __gt__(self, other) -> bool: """ Determine if this metric is better than another metric. 
@@ -245,24 +135,23 @@ def __gt__(self, other) -> bool: if not isinstance(other, Metric): return NotImplemented - if self.is_worst: + if self.is_worst or (self.is_worst and other.is_worst): return False if other.is_worst: return True - # Compare using definitions - this is cleaner and ensures consistency - if self._definition != other._definition: - # Provide detailed error message for backward compatibility - if self.name != other.name: - raise ValueError("Cannot compare metrics with different names.") - if self.comparator.comparison_method != other.comparator.comparison_method: - raise ValueError("Cannot compare metrics with different comparison methods.") - if ( - self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER - and self.comparator.target != other.comparator.target - ): - raise ValueError("Cannot compare 'TARGET_IS_BETTER' metrics with different target values.") + if self.name != other.name: + raise ValueError("Cannot compare metrics with different names.") + + if self.comparator.comparison_method != other.comparator.comparison_method: + raise ValueError("Cannot compare metrics with different comparison methods.") + + if ( + self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER + and self.comparator.target != other.comparator.target + ): + raise ValueError("Cannot compare 'TARGET_IS_BETTER' metrics with different target values.") return self.comparator.compare(self.value, other.value) < 0 @@ -282,8 +171,11 @@ def __eq__(self, other) -> bool: if self.is_worst or other.is_worst: return False - # Use definition equality for cleaner comparison - return self._definition == other._definition and self.comparator.compare(self.value, other.value) == 0 + return ( + self.name == other.name + and self.comparator.comparison_method == other.comparator.comparison_method + and self.comparator.compare(self.value, other.value) == 0 + ) def __repr__(self) -> str: """ @@ -319,4 +211,4 @@ def is_valid(self) -> bool: :return: True if the metric value is valid, False otherwise. 
""" - return self.value is not None and not (self.value != self.value) # NaN check + return self.value is not None and not (self.value != self.value) # NaN check \ No newline at end of file diff --git a/plexe/tools/datasets.py b/plexe/tools/datasets.py index 7f67487c..b72f9df6 100644 --- a/plexe/tools/datasets.py +++ b/plexe/tools/datasets.py @@ -8,15 +8,18 @@ """ import logging +import json from datetime import datetime from typing import Dict, List, Any import numpy as np import pandas as pd +from pydantic import BaseModel, ValidationError from smolagents import tool from plexe.internal.common.datasets.adapter import DatasetAdapter from plexe.internal.common.datasets.interface import TabularConvertible +from plexe.internal.common.provider import Provider from plexe.core.object_registry import ObjectRegistry from plexe.internal.models.entities.code import Code @@ -79,7 +82,7 @@ def register_split_datasets( dataset_sizes["validation"].append(len(val_ds)) dataset_sizes["test"].append(len(test_ds)) - logger.debug( + logger.debug( f"✅ Registered custom split of dataset {dataset_name} into train/validation/test with sizes " f"{len(train_ds)}/{len(val_ds)}/{len(test_ds)}" ) @@ -109,35 +112,78 @@ def create_input_sample(n_samples: int = 5) -> bool: input_schema = object_registry.get(dict, "input_schema") try: - # Create synthetic sample data that matches the schema + # Create a Pydantic model dynamically from the schema for validation + schema_fields = {} + for field_name, field_type in input_schema.items(): + # Map string type names to Python types + type_mapping = { + "int": int, + "float": float, + "bool": bool, + "str": str, + "List[int]": List[int], + "List[float]": List[float], + "List[bool]": List[bool], + "List[str]": List[str], + } + python_type = type_mapping.get(field_type, Any) + schema_fields[field_name] = (python_type, ...) + + SampleModel = type("SampleModel", (BaseModel,), schema_fields) + + # Use LLM to generate sensible sample values based on field names and types + provider = Provider("openai/gpt-4o-mini") input_sample_dicts = [] - # Generate synthetic examples + # Generate samples using LLM for i in range(n_samples): - sample = {} - for field_name, field_type in input_schema.items(): - # Generate appropriate sample values based on type - if field_type == "int": - sample[field_name] = i * 10 - elif field_type == "float": - sample[field_name] = i * 10.5 - elif field_type == "bool": - sample[field_name] = i % 2 == 0 - elif field_type == "str": - sample[field_name] = f"sample_{field_name}_{i}" - elif field_type == "List[int]": - sample[field_name] = [i * 10, i * 20, i * 30] - elif field_type == "List[float]": - sample[field_name] = [i * 10.5, i * 20.5, i * 30.5] - elif field_type == "List[bool]": - sample[field_name] = [True, False, i % 2 == 0] - elif field_type == "List[str]": - sample[field_name] = [f"item_{i}_1", f"item_{i}_2", f"item_{i}_3"] - else: - sample[field_name] = None - input_sample_dicts.append(sample) - - # TODO: we should use an LLM call to generate sensible values; then validate using pydantic + # Create prompt for LLM to generate realistic sample + schema_description = ", ".join([f"{name}: {type_str}" for name, type_str in input_schema.items()]) + user_prompt = f"""Generate a realistic sample data point for a machine learning model with the following schema: +{schema_description} + +Generate realistic, sensible values that make sense for each field name. 
For example: +- If field is "age", use a realistic age like 25 or 45, not 1000 or -5 +- If field is "email", use a realistic email format like "user@example.com" +- If field is "price", use a realistic price value +- Make values diverse across different samples""" + + try: + # Get LLM response with Pydantic validation + response_str = provider.query( + system_message="You are a data generation assistant. Generate realistic sample data that matches the given schema.", + user_message=user_prompt, + response_format=SampleModel, + ) + + # Parse and validate the response using Pydantic + response_dict = json.loads(response_str) if isinstance(response_str, str) else response_str + validated_sample = SampleModel(**response_dict) + input_sample_dicts.append(validated_sample.model_dump()) + except (json.JSONDecodeError, ValidationError, Exception) as e: + # Fallback to simple synthetic generation if LLM fails + logger.warning(f"LLM generation failed for sample {i}, using fallback: {str(e)}") + sample = {} + for field_name, field_type in input_schema.items(): + if field_type == "int": + sample[field_name] = i * 10 + elif field_type == "float": + sample[field_name] = i * 10.5 + elif field_type == "bool": + sample[field_name] = i % 2 == 0 + elif field_type == "str": + sample[field_name] = f"sample_{field_name}_{i}" + elif field_type == "List[int]": + sample[field_name] = [i * 10, i * 20, i * 30] + elif field_type == "List[float]": + sample[field_name] = [i * 10.5, i * 20.5, i * 30.5] + elif field_type == "List[bool]": + sample[field_name] = [True, False, i % 2 == 0] + elif field_type == "List[str]": + sample[field_name] = [f"item_{i}_1", f"item_{i}_2", f"item_{i}_3"] + else: + sample[field_name] = None + input_sample_dicts.append(sample) # Register the input sample in the registry for validation tool to use object_registry.register(list, "predictor_input_sample", input_sample_dicts, overwrite=True, immutable=True) From cbcee7c9559d2434ed59cdb5d8fea69f109cf9ce Mon Sep 17 00:00:00 2001 From: aumpatel Date: Tue, 2 Dec 2025 17:19:56 -0500 Subject: [PATCH 5/5] style: format metric.py and datasets.py with black - Fix black formatting issues for CI compliance - Format both files to match project style standards --- plexe/internal/models/entities/metric.py | 2 +- plexe/tools/datasets.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index 4978bab5..f5355984 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -211,4 +211,4 @@ def is_valid(self) -> bool: :return: True if the metric value is valid, False otherwise. """ - return self.value is not None and not (self.value != self.value) # NaN check \ No newline at end of file + return self.value is not None and not (self.value != self.value) # NaN check diff --git a/plexe/tools/datasets.py b/plexe/tools/datasets.py index b72f9df6..5e906062 100644 --- a/plexe/tools/datasets.py +++ b/plexe/tools/datasets.py @@ -82,7 +82,7 @@ def register_split_datasets( dataset_sizes["validation"].append(len(val_ds)) dataset_sizes["test"].append(len(test_ds)) - logger.debug( + logger.debug( f"✅ Registered custom split of dataset {dataset_name} into train/validation/test with sizes " f"{len(train_ds)}/{len(val_ds)}/{len(test_ds)}" )
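
Note on the `create_input_sample` change above: the pattern it relies on, deriving a Pydantic model from the registered string-typed schema and validating each generated sample against it, can be sketched in isolation as follows. This is a standalone illustration only, not the repository code: the schema and `_DEFAULTS` values are hypothetical, Pydantic v2 is assumed (as the patch's `model_dump()` call implies), and it uses `pydantic.create_model`, Pydantic's documented helper for dynamic model creation, rather than the `type()`-based construction in the diff.

```python
from typing import Any, Dict, List

from pydantic import ValidationError, create_model

# String-typed schema of the kind stored in the ObjectRegistry under "input_schema".
input_schema: Dict[str, str] = {"bedrooms": "int", "bathrooms": "int", "square_footage": "float"}

# Map the schema's type names onto Python types (mirrors the patch's type_mapping).
TYPE_MAPPING: Dict[str, Any] = {
    "int": int,
    "float": float,
    "bool": bool,
    "str": str,
    "List[int]": List[int],
    "List[float]": List[float],
    "List[bool]": List[bool],
    "List[str]": List[str],
}

# Deterministic defaults for the fallback path (scalar fields only, for brevity).
_DEFAULTS: Dict[str, Any] = {"int": 0, "float": 0.0, "bool": False, "str": ""}

# Build a Pydantic model dynamically from the schema; each field is required (...).
SampleModel = create_model(
    "SampleModel",
    **{name: (TYPE_MAPPING.get(type_str, Any), ...) for name, type_str in input_schema.items()},
)


def validate_sample(candidate: Dict[str, Any]) -> Dict[str, Any]:
    """Validate a generated sample against the schema, falling back to synthetic values."""
    try:
        return SampleModel(**candidate).model_dump()
    except ValidationError:
        # Mirrors the spirit of the patch's except branch: synthesize simple values.
        return {name: _DEFAULTS.get(type_str) for name, type_str in input_schema.items()}


print(validate_sample({"bedrooms": 3, "bathrooms": 2, "square_footage": 1500.0}))
```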