From 7529e83a8a09b7bed579bd14ade6de92a6a0bab5 Mon Sep 17 00:00:00 2001 From: aumpatel Date: Fri, 28 Nov 2025 18:45:13 -0500 Subject: [PATCH 1/5] refactor: separate metric definition from metric value and share comparator instances - Add _MetricDefinition class to separate metric type (definition) from metric value (measurement) - Implement comparator caching via _get_shared_comparator() to ensure only one comparator object exists per unique (method, target, epsilon) combination - All solutions using the same comparison method now share the same comparator instance, reducing memory usage - Maintain full backward compatibility - no changes to Metric class API - Remove TODO comment as the refactoring addresses the concern about mixing metric and metric value concerns --- plexe/internal/models/entities/metric.py | 144 ++++++++++++++++++++--- 1 file changed, 127 insertions(+), 17 deletions(-) diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index f5355984..078954e9 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -22,6 +22,8 @@ from enum import Enum from functools import total_ordering +from typing import Optional +from weakref import WeakValueDictionary class ComparisonMethod(Enum): @@ -98,31 +100,138 @@ def compare(self, value1: float, value2: float) -> int: raise ValueError("Invalid comparison method.") -# todo: this class is a mess as it mixes concerns of a metric and a metric value; needs refactoring +# Internal cache for sharing MetricComparator instances across all metrics +# This ensures only one comparator object exists per unique (method, target, epsilon) combination +_comparator_cache: WeakValueDictionary = WeakValueDictionary() + + +def _get_shared_comparator(comparison_method: ComparisonMethod, target: Optional[float] = None, epsilon: float = 1e-9) -> MetricComparator: + """ + Get or create a shared MetricComparator instance. + + This function ensures that identical comparators are reused across all Metric instances, + reducing memory usage and ensuring consistency. + + :param comparison_method: The comparison method. + :param target: Optional target value for TARGET_IS_BETTER. + :param epsilon: Tolerance for floating-point comparisons. + :return: A shared MetricComparator instance. + """ + # Create a cache key from the comparator parameters + cache_key = (comparison_method, target, epsilon) + + # Try to get existing comparator from cache + if cache_key in _comparator_cache: + return _comparator_cache[cache_key] + + # Create new comparator and cache it + comparator = MetricComparator(comparison_method, target, epsilon) + _comparator_cache[cache_key] = comparator + return comparator + + +class _MetricDefinition: + """ + Internal class representing a metric type definition. + + This separates the metric definition (what it is) from the metric value (a measurement). + Metric definitions are immutable and can be shared across multiple metric values. + + This is an internal implementation detail - users should not interact with this class directly. + """ + + def __init__(self, name: str, comparator: MetricComparator): + """ + Initialize a metric definition. + + :param name: The name of the metric. + :param comparator: The shared comparator instance. 
+ """ + self._name = name + self._comparator = comparator + + @property + def name(self) -> str: + """The name of the metric.""" + return self._name + + @property + def comparator(self) -> MetricComparator: + """The shared comparator instance.""" + return self._comparator + + def __eq__(self, other) -> bool: + """Check if two metric definitions are equal.""" + if not isinstance(other, _MetricDefinition): + return False + return ( + self.name == other.name + and self.comparator.comparison_method == other.comparator.comparison_method + and self.comparator.target == other.comparator.target + ) + + def __hash__(self) -> int: + """Hash the metric definition.""" + return hash((self.name, self.comparator.comparison_method, self.comparator.target)) + + @total_ordering class Metric: """ Represents a metric with a name, a value, and a comparator for determining which metric is better. + This class internally separates the metric definition (type) from the metric value (measurement), + and automatically shares comparator instances to reduce memory usage. + Attributes: name (str): The name of the metric (e.g., 'accuracy', 'loss'). value (float): The numeric value of the metric. - comparator (MetricComparator): The comparison logic for the metric. + comparator (MetricComparator): The comparison logic for the metric (shared instance). """ def __init__(self, name: str, value: float = None, comparator: MetricComparator = None, is_worst: bool = False): """ Initializes a Metric object. + The comparator instance is automatically shared with other metrics that have the same + comparison method, target, and epsilon values, reducing memory usage. + :param name: The name of the metric. :param value: The numeric value of the metric. :param comparator: An instance of MetricComparator for comparison logic. :param is_worst: Indicates if the metric value is the worst possible value. """ - self.name = name + # Store the metric value (dynamic, instance-specific) self.value = value - self.comparator = comparator self.is_worst = is_worst or value is None + + # Get or create a shared comparator instance + if comparator is not None: + # Use the shared comparator cache to ensure we reuse identical comparators + # This is the key optimization: identical comparators are shared across all metrics + shared_comparator = _get_shared_comparator( + comparison_method=comparator.comparison_method, + target=comparator.target, + epsilon=comparator.epsilon + ) + else: + # If no comparator provided, raise an error as it's required for a valid metric + # This maintains the same behavior as before + raise ValueError("Metric requires a comparator. 
Provide a MetricComparator instance.") + + # Create internal metric definition (separates type from value) + # This is the key separation: definition (what it is) vs value (measurement) + self._definition = _MetricDefinition(name=name, comparator=shared_comparator) + + @property + def name(self) -> str: + """The name of the metric (for backward compatibility).""" + return self._definition.name + + @property + def comparator(self) -> MetricComparator: + """The shared comparator instance (for backward compatibility).""" + return self._definition.comparator def __gt__(self, other) -> bool: """ @@ -141,17 +250,18 @@ def __gt__(self, other) -> bool: if other.is_worst: return True - if self.name != other.name: - raise ValueError("Cannot compare metrics with different names.") - - if self.comparator.comparison_method != other.comparator.comparison_method: - raise ValueError("Cannot compare metrics with different comparison methods.") - - if ( - self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER - and self.comparator.target != other.comparator.target - ): - raise ValueError("Cannot compare 'TARGET_IS_BETTER' metrics with different target values.") + # Compare using definitions - this is cleaner and ensures consistency + if self._definition != other._definition: + # Provide detailed error message for backward compatibility + if self.name != other.name: + raise ValueError("Cannot compare metrics with different names.") + if self.comparator.comparison_method != other.comparator.comparison_method: + raise ValueError("Cannot compare metrics with different comparison methods.") + if ( + self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER + and self.comparator.target != other.comparator.target + ): + raise ValueError("Cannot compare 'TARGET_IS_BETTER' metrics with different target values.") return self.comparator.compare(self.value, other.value) < 0 @@ -171,9 +281,9 @@ def __eq__(self, other) -> bool: if self.is_worst or other.is_worst: return False + # Use definition equality for cleaner comparison return ( - self.name == other.name - and self.comparator.comparison_method == other.comparator.comparison_method + self._definition == other._definition and self.comparator.compare(self.value, other.value) == 0 ) From f413dd3405c609c3553bc6ba3db913f00dd9ba7b Mon Sep 17 00:00:00 2001 From: aumpatel Date: Fri, 28 Nov 2025 18:55:00 -0500 Subject: [PATCH 2/5] fix: include epsilon in MetricDefinition equality and hash methods - Add epsilon to __eq__ method to ensure metrics with different epsilon values are correctly differentiated - Add epsilon to __hash__ method to maintain hash contract (must include all fields used in __eq__) - Fix redundant condition in Metric.__gt__ method This fixes critical bugs identified in code review that could cause incorrect metric comparisons. 
--- plexe/internal/models/entities/metric.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index 078954e9..0478b6db 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -168,11 +168,12 @@ def __eq__(self, other) -> bool: self.name == other.name and self.comparator.comparison_method == other.comparator.comparison_method and self.comparator.target == other.comparator.target + and self.comparator.epsilon == other.comparator.epsilon ) def __hash__(self) -> int: """Hash the metric definition.""" - return hash((self.name, self.comparator.comparison_method, self.comparator.target)) + return hash((self.name, self.comparator.comparison_method, self.comparator.target, self.comparator.epsilon)) @total_ordering @@ -244,7 +245,7 @@ def __gt__(self, other) -> bool: if not isinstance(other, Metric): return NotImplemented - if self.is_worst or (self.is_worst and other.is_worst): + if self.is_worst: return False if other.is_worst: From ac5284323911b43066ef2177e2f112aaf5ecf429 Mon Sep 17 00:00:00 2001 From: aumpatel Date: Fri, 28 Nov 2025 19:03:11 -0500 Subject: [PATCH 3/5] style: format metric.py with black - Reformat code to comply with black formatting standards - Fixes CI formatting check failure --- plexe/internal/models/entities/metric.py | 45 +++++++++++------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index 0478b6db..ce313f05 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -105,13 +105,15 @@ def compare(self, value1: float, value2: float) -> int: _comparator_cache: WeakValueDictionary = WeakValueDictionary() -def _get_shared_comparator(comparison_method: ComparisonMethod, target: Optional[float] = None, epsilon: float = 1e-9) -> MetricComparator: +def _get_shared_comparator( + comparison_method: ComparisonMethod, target: Optional[float] = None, epsilon: float = 1e-9 +) -> MetricComparator: """ Get or create a shared MetricComparator instance. - + This function ensures that identical comparators are reused across all Metric instances, reducing memory usage and ensuring consistency. - + :param comparison_method: The comparison method. :param target: Optional target value for TARGET_IS_BETTER. :param epsilon: Tolerance for floating-point comparisons. @@ -119,11 +121,11 @@ def _get_shared_comparator(comparison_method: ComparisonMethod, target: Optional """ # Create a cache key from the comparator parameters cache_key = (comparison_method, target, epsilon) - + # Try to get existing comparator from cache if cache_key in _comparator_cache: return _comparator_cache[cache_key] - + # Create new comparator and cache it comparator = MetricComparator(comparison_method, target, epsilon) _comparator_cache[cache_key] = comparator @@ -133,33 +135,33 @@ def _get_shared_comparator(comparison_method: ComparisonMethod, target: Optional class _MetricDefinition: """ Internal class representing a metric type definition. - + This separates the metric definition (what it is) from the metric value (a measurement). Metric definitions are immutable and can be shared across multiple metric values. - + This is an internal implementation detail - users should not interact with this class directly. """ - + def __init__(self, name: str, comparator: MetricComparator): """ Initialize a metric definition. 
- + :param name: The name of the metric. :param comparator: The shared comparator instance. """ self._name = name self._comparator = comparator - + @property def name(self) -> str: """The name of the metric.""" return self._name - + @property def comparator(self) -> MetricComparator: """The shared comparator instance.""" return self._comparator - + def __eq__(self, other) -> bool: """Check if two metric definitions are equal.""" if not isinstance(other, _MetricDefinition): @@ -170,7 +172,7 @@ def __eq__(self, other) -> bool: and self.comparator.target == other.comparator.target and self.comparator.epsilon == other.comparator.epsilon ) - + def __hash__(self) -> int: """Hash the metric definition.""" return hash((self.name, self.comparator.comparison_method, self.comparator.target, self.comparator.epsilon)) @@ -205,30 +207,28 @@ def __init__(self, name: str, value: float = None, comparator: MetricComparator # Store the metric value (dynamic, instance-specific) self.value = value self.is_worst = is_worst or value is None - + # Get or create a shared comparator instance if comparator is not None: # Use the shared comparator cache to ensure we reuse identical comparators # This is the key optimization: identical comparators are shared across all metrics shared_comparator = _get_shared_comparator( - comparison_method=comparator.comparison_method, - target=comparator.target, - epsilon=comparator.epsilon + comparison_method=comparator.comparison_method, target=comparator.target, epsilon=comparator.epsilon ) else: # If no comparator provided, raise an error as it's required for a valid metric # This maintains the same behavior as before raise ValueError("Metric requires a comparator. Provide a MetricComparator instance.") - + # Create internal metric definition (separates type from value) # This is the key separation: definition (what it is) vs value (measurement) self._definition = _MetricDefinition(name=name, comparator=shared_comparator) - + @property def name(self) -> str: """The name of the metric (for backward compatibility).""" return self._definition.name - + @property def comparator(self) -> MetricComparator: """The shared comparator instance (for backward compatibility).""" @@ -283,10 +283,7 @@ def __eq__(self, other) -> bool: return False # Use definition equality for cleaner comparison - return ( - self._definition == other._definition - and self.comparator.compare(self.value, other.value) == 0 - ) + return self._definition == other._definition and self.comparator.compare(self.value, other.value) == 0 def __repr__(self) -> str: """ From 05e63dbaa44e247f6cbf8337f1a745666f723d50 Mon Sep 17 00:00:00 2001 From: aumpatel Date: Tue, 2 Dec 2025 17:10:25 -0500 Subject: [PATCH 4/5] Generate Sample Data using LLM --- CODEBASE_STRUCTURE.md | 446 ++++++++++++++ COMPLETE_MODEL_BUILDING_WALKTHROUGH.md | 703 +++++++++++++++++++++++ plexe/internal/models/entities/metric.py | 152 +---- plexe/tools/datasets.py | 100 +++- 4 files changed, 1244 insertions(+), 157 deletions(-) create mode 100644 CODEBASE_STRUCTURE.md create mode 100644 COMPLETE_MODEL_BUILDING_WALKTHROUGH.md diff --git a/CODEBASE_STRUCTURE.md b/CODEBASE_STRUCTURE.md new file mode 100644 index 00000000..1c959314 --- /dev/null +++ b/CODEBASE_STRUCTURE.md @@ -0,0 +1,446 @@ +# Plexe Codebase Structure - Complete A-Z Documentation + +## Overview +Plexe is an agentic framework for building ML models from natural language. 
It uses a multi-agent system where specialized AI agents collaborate to analyze data, plan solutions, train models, and generate production-ready inference code. + +## Architecture Summary +- **Entry Points**: `main.py` (CLI), `server.py` (Web UI), `model_builder.py` (Core API) +- **Core System**: Object registry, state management, storage, interfaces +- **Agents**: 8 specialized agents for different ML tasks +- **Tools**: 14 tool modules providing agent capabilities +- **Internal**: Common utilities, models, datasets, schemas +- **Testing**: Unit, integration, and benchmark tests + +--- + +## A-Z File Structure and Dependencies + +### **A - Agents** (`plexe/agents/`) +**Purpose**: Multi-agent system for ML model building + +#### `agents.py` - Main Orchestrator +- **What it does**: Coordinates 8 specialized agents in ML workflow +- **Key classes**: `PlexeAgent`, `ModelGenerationResult` +- **Dependencies**: All other agent modules, tools, core modules +- **Flow**: Creates agents → Runs orchestrator → Extracts results + +#### `conversational.py` - User Interface Agent +- **What it does**: Guides users through ML requirements via chat +- **Key classes**: `ConversationalAgent` +- **Dependencies**: `tools.conversation`, `tools.datasets` +- **Flow**: User input → Validation → Model build initiation + +#### `dataset_analyser.py` - EDA Agent +- **What it does**: Performs exploratory data analysis on datasets +- **Key classes**: `EdaAgent` +- **Dependencies**: `tools.datasets`, `tools.schemas` +- **Flow**: Dataset → Analysis → EDA report registration + +#### `dataset_splitter.py` - Data Splitting Agent +- **What it does**: Intelligently splits datasets into train/val/test +- **Key classes**: `DatasetSplitterAgent` +- **Dependencies**: `tools.datasets` +- **Flow**: Dataset → Split strategy → Registered splits + +#### `feature_engineer.py` - Feature Engineering Agent +- **What it does**: Transforms raw data into optimized features +- **Key classes**: `FeatureEngineeringAgent` +- **Dependencies**: `tools.datasets`, `tools.execution`, `tools.validation` +- **Flow**: Raw data → Transformations → Enhanced datasets + +#### `model_packager.py` - MLOps Agent +- **What it does**: Creates production-ready inference code +- **Key classes**: `ModelPackagerAgent` +- **Dependencies**: `tools.context`, `tools.validation`, `tools.solutions` +- **Flow**: Training code → Inference code → Production package + +#### `model_planner.py` - ML Research Agent +- **What it does**: Plans ML approaches and solution strategies +- **Key classes**: `ModelPlannerAgent` +- **Dependencies**: `tools.datasets`, `tools.schemas`, `tools.solutions` +- **Flow**: Requirements → Analysis → Solution plans + +#### `model_trainer.py` - ML Engineer Agent +- **What it does**: Implements and executes training code +- **Key classes**: `ModelTrainerAgent` +- **Dependencies**: `tools.execution`, `tools.training`, `tools.validation` +- **Flow**: Plan → Code generation → Training → Validation + +#### `model_tester.py` - Model Testing Agent +- **What it does**: Tests and evaluates finalized models +- **Key classes**: `ModelTesterAgent` +- **Dependencies**: `tools.testing`, `tools.datasets`, `tools.schemas` +- **Flow**: Model → Testing → Evaluation report + +#### `schema_resolver.py` - Schema Resolution Agent +- **What it does**: Resolves and validates input/output schemas +- **Key classes**: `SchemaResolverAgent` +- **Dependencies**: `tools.schemas` +- **Flow**: Data analysis → Schema inference → Validation + +--- + +### **B - Core System** 
(`plexe/core/`) +**Purpose**: Fundamental types and functionality used across Plexe + +#### `state.py` - Model State Management +- **What it does**: Defines model lifecycle states +- **Key classes**: `ModelState` (Enum) +- **States**: DRAFT, BUILDING, READY, ERROR +- **Dependencies**: None (base enum) + +#### `storage.py` - Model Persistence +- **What it does**: Core implementation for saving/loading models and checkpoints +- **Key functions**: `_save_model_to_tar()`, `_load_model_data_from_tar()`, `_save_checkpoint_to_tar()` +- **Dependencies**: `core.state`, `config`, `internal.common.utils.pydantic_utils` +- **Flow**: Model data → Tar archive → File system + +#### `object_registry.py` - Global Object Registry +- **What it does**: Singleton registry for storing/retrieving objects by type and name +- **Key classes**: `ObjectRegistry`, `Item` +- **Dependencies**: None (core functionality) +- **Flow**: Register objects → Retrieve by type/name → Manage lifecycle + +#### `entities/solution.py` - Solution Container +- **What it does**: Represents complete ML solutions from planning to deployment +- **Key classes**: `Solution` +- **Dependencies**: `internal.models.entities.artifact`, `internal.models.entities.metric` +- **Flow**: Plan → Code → Execution → Results → Artifacts + +#### `interfaces/` - Core Interfaces +- **`predictor.py`**: Abstract base for inference code (`Predictor`) +- **`feature_transformer.py`**: Abstract base for feature transformers (`FeatureTransformer`) +- **Dependencies**: `internal.models.entities.artifact`, `pandas` +- **Purpose**: Define contracts for generated code + +--- + +### **C - Configuration** (`plexe/config.py`) +**Purpose**: Central configuration management + +- **What it does**: Loads configuration from YAML files and environment variables +- **Key components**: `config` object, `prompt_templates` object +- **Dependencies**: `yaml`, `pathlib` +- **Flow**: YAML files → Configuration object → Application settings + +--- + +### **D - Datasets** (`plexe/datasets.py`) +**Purpose**: Dataset generation and management + +- **What it does**: Provides `DatasetGenerator` class for synthetic data creation +- **Key classes**: `DatasetGenerator` +- **Dependencies**: `pandas`, `numpy`, `internal.common.datasets.interface` +- **Flow**: Schema → Synthetic data → TabularConvertible dataset + +--- + +### **E - Entry Points** +**Purpose**: Application entry points and main interfaces + +#### `main.py` - CLI Entry Point +- **What it does**: Launches Plexe assistant with web UI +- **Dependencies**: `server`, `uvicorn`, `webbrowser` +- **Flow**: Start server → Open browser → Keep running + +#### `server.py` - Web Server +- **What it does**: FastAPI server for conversational agent +- **Dependencies**: `agents.conversational`, `fastapi`, `websockets` +- **Flow**: WebSocket connection → Agent processing → Response + +#### `model_builder.py` - Core API +- **What it does**: Main API for building ML models programmatically +- **Key classes**: `ModelBuilder` +- **Dependencies**: All agents, core modules, tools, callbacks +- **Flow**: Intent + Data → Agent orchestration → Complete model + +--- + +### **F - File I/O** (`plexe/fileio.py`) +**Purpose**: High-level file operations for models and checkpoints + +- **What it does**: Provides user-friendly functions for model persistence +- **Key functions**: `save_model()`, `load_model()`, `save_checkpoint()`, `load_checkpoint()` +- **Dependencies**: `core.storage`, `models` +- **Flow**: Model object → Storage functions → File system + +--- + 
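+As a quick illustration of the registry pattern referenced throughout this document, the sketch below mirrors the `ObjectRegistry` calls that appear later in `COMPLETE_MODEL_BUILDING_WALKTHROUGH.md`. It is a minimal usage sketch, not code copied from the repository; the import path and exact signatures should be checked against `plexe/core/object_registry.py`.
+
+```python
+from plexe.core.object_registry import ObjectRegistry
+
+registry = ObjectRegistry()   # singleton: every call returns the same instance
+registry.clear()              # typically done at the start of a build
+
+# Objects are stored and retrieved by (type, name) pairs;
+# immutable=True marks entries that are not meant to be replaced later.
+registry.register(dict, "input_schema", {"bedrooms": "int", "bathrooms": "int"})
+registry.register(bool, "input_schema_is_locked", True, immutable=True)
+
+schema = registry.get(dict, "input_schema")
+locked = registry.get(bool, "input_schema_is_locked")
+```
+
+---
+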
+### **G - Internal Common** (`plexe/internal/common/`) +**Purpose**: Shared utilities and common functionality + +#### `provider.py` - LLM Provider Management +- **What it does**: Manages different LLM providers with retry logic +- **Key classes**: `ProviderConfig`, `Provider` +- **Dependencies**: `litellm`, `tenacity`, `pydantic` +- **Flow**: Provider config → API calls → Response with retries + +#### `datasets/` - Dataset Interfaces and Adapters +- **`interface.py`**: Core dataset interfaces (`Dataset`, `TabularConvertible`, etc.) +- **`adapter.py`**: Adapter for converting between dataset types +- **`tabular.py`**: Tabular dataset implementation +- **Dependencies**: `pandas`, `numpy`, `abc` +- **Flow**: Data → Interface implementation → Convertible formats + +#### `utils/` - Utility Functions +- **`agents.py`**: Agent utility functions +- **`chain_of_thought/`**: Chain of thought logging utilities +- **`dataset_storage.py`**: Dataset storage utilities +- **`dependency_utils.py`**: Dependency management +- **`markdown_utils.py`**: Markdown formatting +- **`model_state.py`**: Model state utilities +- **`model_utils.py`**: Model utility functions +- **`pandas_utils.py`**: Pandas utilities +- **`prompt_utils.py`**: Prompt template utilities +- **`pydantic_utils.py`**: Pydantic utilities +- **`response.py`**: Response formatting utilities + +--- + +### **H - Internal Models** (`plexe/internal/models/`) +**Purpose**: Internal model entities and execution logic + +#### `entities/` - Model Entities +- **`artifact.py`**: Represents model artifacts (files, data, handles) +- **`code.py`**: Represents code objects with performance metrics +- **`description.py`**: Model description entities +- **`metric.py`**: Metric comparison and evaluation logic +- **Dependencies**: `pathlib`, `io`, `dataclasses` + +#### `execution/` - Code Execution +- **`executor.py`**: Base executor interface +- **`process_executor.py`**: Process-based execution +- **`docker_executor.py`**: Docker-based execution +- **`ray_executor.py`**: Ray-based distributed execution +- **Dependencies**: `subprocess`, `docker`, `ray` + +#### `generation/` - Code Generation +- **`planning.py`**: Solution planning logic +- **`review.py`**: Code review functionality +- **`training.py`**: Training code generation +- **Dependencies**: `internal.common.provider`, `internal.common.utils` + +#### `validation/` - Code Validation +- **`validator.py`**: Main validation orchestrator +- **`composite.py`**: Composite validators +- **`primitives/`**: Basic validation primitives +- **Dependencies**: `ast`, `importlib`, `subprocess` + +#### `callbacks/` - Execution Callbacks +- **`chain_of_thought.py`**: Chain of thought logging +- **`checkpoint.py`**: Model checkpointing +- **`mlflow.py`**: MLflow integration +- **Dependencies**: `mlflow`, `plexe.core.object_registry` + +--- + +### **I - Internal Datasets** (`plexe/internal/datasets/`) +**Purpose**: Dataset generation and validation + +#### `core/` - Core Dataset Logic +- **`generation/`**: Dataset generation utilities +- **`validation/`**: Dataset validation logic +- **Dependencies**: `pandas`, `numpy`, `sklearn` + +#### `generator.py` - Dataset Generator +- **What it does**: Main dataset generation interface +- **Dependencies**: `core.generation`, `core.validation` + +#### `config.py` - Dataset Configuration +- **What it does**: Dataset generation configuration +- **Dependencies**: `pydantic` + +--- + +### **J - Internal Schemas** (`plexe/internal/schemas/`) +**Purpose**: Schema resolution and 
management + +#### `resolver.py` - Schema Resolver +- **What it does**: Resolves and validates schemas from data +- **Dependencies**: `pandas`, `pydantic`, `internal.common.utils` + +--- + +### **K - Models** (`plexe/models.py`) +**Purpose**: Main Model class and related functionality + +- **What it does**: Defines the main `Model` class representing a complete ML model +- **Key classes**: `Model` +- **Dependencies**: `core.state`, `core.interfaces`, `internal.models.entities` +- **Flow**: Model creation → Training → Inference → Persistence + +--- + +### **L - Tools** (`plexe/tools/`) +**Purpose**: Tool functions used by agents + +#### `code_analysis.py` - Code Analysis Tools +- **What it does**: Analyzes and extracts code from solutions +- **Dependencies**: `core.object_registry`, `internal.models.entities.code` + +#### `context.py` - Context Tools +- **What it does**: Provides context for code generation +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `conversation.py` - Conversation Tools +- **What it does**: Tools for conversational agent +- **Dependencies**: `core.object_registry`, `model_builder` + +#### `datasets.py` - Dataset Tools +- **What it does**: Dataset manipulation and registration tools +- **Dependencies**: `core.object_registry`, `internal.common.datasets` + +#### `evaluation.py` - Evaluation Tools +- **What it does**: Model evaluation and review tools +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `execution.py` - Execution Tools +- **What it does**: Code execution and training tools +- **Dependencies**: `core.object_registry`, `internal.models.execution` + +#### `metrics.py` - Metrics Tools +- **What it does**: Metric selection and comparison tools +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `response_formatting.py` - Response Formatting +- **What it does**: Formats agent responses +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `schemas.py` - Schema Tools +- **What it does**: Schema inference and validation tools +- **Dependencies**: `core.object_registry`, `internal.common.utils` + +#### `solutions.py` - Solution Management +- **What it does**: Solution creation and management tools +- **Dependencies**: `core.object_registry`, `core.entities.solution` + +#### `testing.py` - Testing Tools +- **What it does**: Model testing and evaluation tools +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `training.py` - Training Tools +- **What it does**: Training code generation and management +- **Dependencies**: `core.object_registry`, `internal.common.provider` + +#### `validation.py` - Validation Tools +- **What it does**: Code and model validation tools +- **Dependencies**: `core.object_registry`, `internal.models.validation` + +--- + +### **M - Templates** (`plexe/templates/`) +**Purpose**: Jinja2 templates for prompts and code generation + +#### `models/` - Model Templates +- **`feature_transformer.tmpl.py`**: Feature transformer code template +- **`predictor.tmpl.py`**: Predictor code template + +#### `prompts/` - Prompt Templates +- **`agent/`**: Agent-specific prompt templates +- **`planning/`**: Planning prompt templates +- **`review/`**: Review prompt templates +- **`schemas/`**: Schema prompt templates +- **`training/`**: Training prompt templates +- **`utils/`**: Utility prompt templates + +--- + +### **N - UI** (`plexe/ui/`) +**Purpose**: Web user interface + +#### `index.html` - Web Interface +- **What 
it does**: HTML interface for conversational agent +- **Dependencies**: WebSocket connection to server +- **Flow**: User input → WebSocket → Agent → Response + +--- + +### **O - Callbacks** (`plexe/callbacks.py`) +**Purpose**: Callback system for model building lifecycle + +- **What it does**: Defines callback interface and implementations +- **Key classes**: `Callback`, `MLFlowCallback`, `ModelCheckpointCallback` +- **Dependencies**: `mlflow`, `core.object_registry` +- **Flow**: Model events → Callback execution → External systems + +--- + +### **P - Tests** (`tests/`) +**Purpose**: Comprehensive test suite + +#### `unit/` - Unit Tests +- **What it does**: Tests individual components in isolation +- **Coverage**: Core modules, internal utilities, models +- **Dependencies**: `pytest`, individual modules + +#### `integration/` - Integration Tests +- **What it does**: End-to-end tests for complete ML workflows +- **Coverage**: Binary classification, regression, time series, etc. +- **Dependencies**: `pytest`, full plexe system + +#### `benchmark/` - Benchmark Tests +- **What it does**: Performance benchmarks using MLE-Bench +- **Coverage**: Kaggle competitions, model performance +- **Dependencies**: `mle-bench`, `kaggle` + +#### `fixtures/` - Test Fixtures +- **What it does**: Test data and model artifacts +- **Contents**: Legacy model files for compatibility testing + +--- + +## Dependency Flow Diagram + +``` +Entry Points (main.py, server.py, model_builder.py) + ↓ +Core System (state.py, storage.py, object_registry.py) + ↓ +Agents (8 specialized agents) + ↓ +Tools (14 tool modules) + ↓ +Internal Modules (common, models, datasets, schemas) + ↓ +External Dependencies (pandas, sklearn, litellm, etc.) +``` + +## Key Dependencies + +### External Dependencies +- **pandas**: Data manipulation +- **scikit-learn**: Machine learning algorithms +- **litellm**: LLM provider abstraction +- **smolagents**: Agent framework +- **pydantic**: Data validation +- **fastapi**: Web server +- **mlflow**: Experiment tracking + +### Internal Dependencies +- **ObjectRegistry**: Central object storage +- **Provider**: LLM provider management +- **Solution**: ML solution container +- **Artifact**: Model artifact representation +- **Metric**: Performance metric handling + +## Data Flow + +1. **Input**: User intent + datasets +2. **Analysis**: EDA agent analyzes data +3. **Planning**: Research agent creates solution plans +4. **Feature Engineering**: Feature agent transforms data +5. **Splitting**: Splitter agent creates train/val/test sets +6. **Training**: Engineer agent implements and trains models +7. **Packaging**: Ops agent creates inference code +8. **Testing**: Tester agent evaluates final model +9. 
**Output**: Complete model with artifacts and code + +## Architecture Patterns + +- **Multi-Agent System**: Specialized agents for different tasks +- **Object Registry**: Centralized object storage and retrieval +- **Tool-Based Architecture**: Agents use tools for capabilities +- **Template System**: Jinja2 templates for code generation +- **Callback System**: Lifecycle event handling +- **Interface Segregation**: Clear contracts between components +- **Dependency Injection**: Loose coupling between modules diff --git a/COMPLETE_MODEL_BUILDING_WALKTHROUGH.md b/COMPLETE_MODEL_BUILDING_WALKTHROUGH.md new file mode 100644 index 00000000..88a402b1 --- /dev/null +++ b/COMPLETE_MODEL_BUILDING_WALKTHROUGH.md @@ -0,0 +1,703 @@ +# Complete Model Building Walkthrough: House Price Prediction Example + +This document provides a step-by-step walkthrough of how the Plexe system builds a machine learning model, tracing every file and code path from user input to final prediction. + +## Example Scenario +**User Goal**: Build a model to predict house prices based on features like bedrooms, bathrooms, and square footage. + +**User Code**: +```python +from plexe import ModelBuilder +import pandas as pd + +# Load dataset +df = pd.read_csv("houses.csv") # Contains: bedrooms, bathrooms, square_footage, price + +# Build model +builder = ModelBuilder(provider="openai/gpt-4o-mini", verbose=True) +model = builder.build( + intent="Given a dataset of house features, predict the house price.", + datasets=[df], + input_schema={"bedrooms": int, "bathrooms": int, "square_footage": float}, + output_schema={"price": float}, + max_iterations=3 +) + +# Make prediction +prediction = model.predict({"bedrooms": 3, "bathrooms": 2, "square_footage": 1500.0}) +print(prediction) # {"price": 250000.0} +``` + +--- + +## Phase 1: User Initialization + +### Step 1.1: User Creates ModelBuilder +**File**: `plexe/model_builder.py` (lines 37-56) + +**What Happens**: +- User calls `ModelBuilder(provider="openai/gpt-4o-mini", verbose=True)` +- `__init__` method executes: + - Creates `ProviderConfig` from provider string (line 53) + - Sets `self.verbose = True` + - Sets `self.distributed = False` + - Calls `_create_working_dir()` which creates `./workdir/run-{timestamp}/` directory + - Working directory path stored in `self.working_dir` + +**Output**: ModelBuilder instance ready with provider config and working directory + +--- + +### Step 1.2: User Calls build() Method +**File**: `plexe/model_builder.py` (lines 66-254) + +**What Happens**: +1. **Registry Initialization** (lines 95-97): + - Gets singleton `ObjectRegistry()` instance + - Clears registry: `object_registry.clear()` + - Registry is now empty and ready + +2. **Parameter Validation** (lines 99-103): + - Checks `timeout` or `max_iterations` is set + - Validates `run_timeout <= timeout` if both provided + +3. 
**Schema Processing** (lines 105-110): + - Calls `map_to_basemodel("in", input_schema)` from `plexe/internal/common/utils/pydantic_utils.py` + - **File**: `pydantic_utils.py` (lines 51-102): + - Converts dict `{"bedrooms": int, ...}` to Pydantic `BaseModel` class + - Creates model named "in" with fields: bedrooms (int), bathrooms (int), square_footage (float) + - Validates types are allowed (int, float, str, bool, List variants) + - Same process for `output_schema` → creates "out" model with `price: float` + - Registers schema locks in registry: + - `object_registry.register(bool, "input_schema_is_locked", True, immutable=True)` + - `object_registry.register(bool, "output_schema_is_locked", True, immutable=True)` + +4. **Callback Setup** (lines 112-122): + - Initializes callbacks list + - Adds `ModelCheckpointCallback` if `enable_checkpointing=True` + - Creates `ChainOfThoughtModelCallback` with `ConsoleEmitter` + - Registers all callbacks in ObjectRegistry + +5. **Dataset Registration** (lines 125-136): + - Converts pandas DataFrame to `DatasetAdapter` via `DatasetAdapter.coerce(df)` + - **File**: `plexe/internal/common/datasets/adapter.py` + - Creates `TabularDataset` wrapper around DataFrame + - Registers as `dataset_0` in ObjectRegistry with `immutable=True` + - Formats schemas using `format_schema()` and registers in registry: + - `object_registry.register(dict, "input_schema", {"bedrooms": "int", ...})` + - `object_registry.register(dict, "output_schema", {"price": "float"})` + +6. **Model Identifier Generation** (line 139): + - Creates unique ID: `model-{timestamp}` + +7. **Callback Notification - Build Start** (lines 141-151): + - Calls `_notify_callbacks()` with event="build_start" + - All registered callbacks receive `BuildStateInfo` with intent, schemas, datasets + +8. **Agent Creation** (lines 154-165): + - Creates `PlexeAgent` instance (this is the multi-agent orchestrator) + - **File**: `plexe/agents/agents.py` (lines 63-194) + +--- + +## Phase 2: Multi-Agent System Initialization + +### Step 2.1: PlexeAgent Initialization +**File**: `plexe/agents/agents.py` (lines 63-194) + +**What Happens**: +The `PlexeAgent.__init__` creates all specialist agents: + +1. **ML Research Agent** (lines 105-110): + - **File**: `plexe/agents/model_planner.py` + - Creates `ModelPlannerAgent` with `ml_researcher_model_id` + - This agent will plan ML solution approaches + +2. **Schema Resolver Agent** (lines 113-117): + - **File**: `plexe/agents/schema_resolver.py` + - Creates `SchemaResolverAgent` with `orchestrator_model_id` + - Will infer schemas if not provided (already provided in our case) + +3. **EDA Agent** (lines 120-124): + - **File**: `plexe/agents/dataset_analyser.py` + - Creates `EdaAgent` with `orchestrator_model_id` + - Will analyze dataset structure and generate insights + +4. **Feature Engineering Agent** (lines 127-131): + - **File**: `plexe/agents/feature_engineer.py` + - Creates `FeatureEngineeringAgent` with `ml_engineer_model_id` + - Will transform raw data into better features + +5. **Dataset Splitter Agent** (lines 134-138): + - **File**: `plexe/agents/dataset_splitter.py` + - Creates `DatasetSplitterAgent` with `orchestrator_model_id` + - Will split data into train/validation/test sets + +6. **Model Trainer Agent** (lines 141-148): + - **File**: `plexe/agents/model_trainer.py` + - Creates `ModelTrainerAgent` with `ml_engineer_model_id` and `tool_model_id` + - Will implement and execute training code + +7. 
**Model Packager Agent** (lines 151-157): + - **File**: `plexe/agents/model_packager.py` + - Creates `ModelPackagerAgent` with `ml_ops_engineer_model_id` + - Will generate inference/prediction code + +8. **Model Tester Agent** (lines 160-164): + - **File**: `plexe/agents/model_tester.py` + - Creates `ModelTesterAgent` with `ml_engineer_model_id` + - Will test and evaluate the final model + +9. **Orchestrator Agent** (lines 167-194): + - Creates `CodeAgent` (from smolagents library) named "Orchestrator" + - Configures with tools: + - `get_select_target_metric()` - selects evaluation metric + - `get_review_finalised_model()` - reviews final model + - `get_latest_datasets` - retrieves datasets from registry + - `get_solution_performances` - gets performance metrics + - `register_best_solution` - marks best solution + - `format_final_orchestrator_agent_response` - formats final response + - Registers all specialist agents as `managed_agents` + - Sets `max_steps=30`, `planning_interval=7` + +**Output**: Fully initialized multi-agent system with 8 specialist agents + 1 orchestrator + +--- + +### Step 2.2: Agent Prompt Generation +**File**: `plexe/model_builder.py` (lines 167-185) + +**What Happens**: +- Calls `prompt_templates.agent_builder_prompt()` with: + - intent: "Given a dataset of house features, predict the house price." + - input_schema: JSON string of schema dict + - output_schema: JSON string of schema dict + - datasets: ["dataset_0"] + - working_dir: "./workdir/run-{timestamp}/" + - max_iterations: 3 +- **File**: `plexe/config/prompt_templates.py` (or similar) +- Generates comprehensive prompt instructing orchestrator on the task +- Creates `additional_args` dict with all context + +--- + +## Phase 3: Orchestrator Agent Execution + +### Step 3.1: Orchestrator Starts Workflow +**File**: `plexe/agents/agents.py` (lines 196-204) + +**What Happens**: +- Calls `agent.run(agent_prompt, additional_args=additional_args)` +- Orchestrator (CodeAgent) begins executing steps +- Uses planning every 7 steps to decide next actions +- Can call tools or delegate to specialist agents + +**Typical Orchestrator Workflow**: +1. Select target metric (e.g., RMSE for regression) +2. Analyze dataset (delegate to EDA agent) +3. Resolve schemas if needed (delegate to SchemaResolver - skipped if provided) +4. Plan solutions (delegate to ML Research agent) +5. Split dataset (delegate to Dataset Splitter agent) +6. Train models (delegate to ML Engineer agent) +7. Package models (delegate to ML Ops agent) +8. Test models (delegate to Model Tester agent) +9. Select best solution + +--- + +## Phase 4: Specialist Agent Execution + +### Step 4.1: Target Metric Selection +**Tool**: `get_select_target_metric()` from `plexe/tools/metrics.py` + +**What Happens**: +- Orchestrator calls this tool +- Tool analyzes intent and output schema +- Determines this is a regression problem (output is float) +- Selects appropriate metric (e.g., RMSE, MAE) +- Registers metric in ObjectRegistry + +--- + +### Step 4.2: EDA Agent Execution +**File**: `plexe/agents/dataset_analyser.py` + +**What Happens**: +1. Orchestrator delegates task: "Analyze dataset_0" +2. EDA Agent receives task +3. Agent calls `get_latest_datasets()` tool: + - **File**: `plexe/tools/datasets.py` (lines 394-451) + - Retrieves `dataset_0` from ObjectRegistry + - Returns dict: `{"raw": "dataset_0"}` +4. 
Agent calls `get_dataset_preview()` tool: + - **File**: `plexe/tools/datasets.py` (lines 224-283) + - Gets dataset from registry: `object_registry.get(TabularConvertible, "dataset_0")` + - Converts to pandas: `dataset.to_pandas()` + - Generates preview with shape, dtypes, sample rows, statistics, missing values + - Returns preview dict +5. Agent analyzes data using LLM (via Provider): + - **File**: `plexe/internal/common/provider.py` + - Makes LLM call with system prompt + data preview + - LLM generates EDA insights +6. Agent calls `register_eda_report()` tool: + - **File**: `plexe/tools/datasets.py` (lines 287-341) + - Creates structured EDA report dict with: + - overview: dataset stats, target variable analysis + - feature_engineering_opportunities: transformation needs + - data_quality_challenges: data issues + - data_preprocessing_requirements: preprocessing steps + - feature_importance: predictive potential + - insights: key findings + - recommendations: actionable steps + - Registers in ObjectRegistry: `object_registry.register(dict, "eda_report_dataset_0", report)` + +**Output**: EDA report registered in ObjectRegistry + +--- + +### Step 4.3: Schema Resolution (Skipped) +**File**: `plexe/agents/schema_resolver.py` + +**What Happens**: +- Orchestrator checks if schemas are locked +- Finds `input_schema_is_locked = True` in registry +- Skips schema resolution (schemas already provided) +- If schemas weren't provided, this agent would: + - Analyze intent and sample data + - Infer input/output schemas + - Register them in ObjectRegistry + +--- + +### Step 4.4: ML Research Agent - Solution Planning +**File**: `plexe/agents/model_planner.py` + +**What Happens**: +1. Orchestrator delegates: "Plan ML solutions for house price prediction" +2. Agent retrieves context: + - Gets EDA report: `object_registry.get(dict, "eda_report_dataset_0")` + - Gets schemas from registry + - Gets datasets: `get_latest_datasets()` +3. Agent uses LLM to generate solution plans: + - Analyzes problem type (regression) + - Considers dataset characteristics from EDA + - Generates multiple solution approaches: + - Solution 1: Linear Regression + - Solution 2: Random Forest + - Solution 3: Gradient Boosting (XGBoost) + - Each solution includes: + - Model type and framework + - Rationale + - Expected performance + - Implementation approach +4. Agent registers solution plans in ObjectRegistry +5. **File**: `plexe/core/entities/solution.py` - Solution objects created + +**Output**: Multiple solution plans registered + +--- + +### Step 4.5: Feature Engineering Agent (Optional) +**File**: `plexe/agents/feature_engineer.py` + +**What Happens** (if orchestrator decides feature engineering is needed): +1. Agent retrieves EDA report and raw dataset +2. Analyzes feature engineering opportunities from EDA +3. Generates transformation code (e.g., normalization, encoding) +4. Applies transformations to create new dataset +5. Registers transformed dataset: `dataset_0_transformed` +6. Registers transformation code in ObjectRegistry +7. Calls `register_feature_engineering_report()`: + - **File**: `plexe/tools/datasets.py` (lines 344-390) + - Creates feature engineering report + - Registers in ObjectRegistry + +**Output**: Transformed dataset and feature engineering code (if applicable) + +--- + +### Step 4.6: Dataset Splitter Agent +**File**: `plexe/agents/dataset_splitter.py` + +**What Happens**: +1. Orchestrator delegates: "Split dataset into train/validation/test" +2. 
Agent calls `get_dataset_for_splitting()`: + - **File**: `plexe/tools/datasets.py` (lines 455-507) + - Finds best dataset (prefers transformed, falls back to raw) + - Returns "dataset_0" (or "dataset_0_transformed" if exists) +3. Agent retrieves dataset from registry +4. Agent generates splitting strategy using LLM: + - Analyzes dataset size and characteristics + - Decides split ratios (e.g., 70/15/15 or 80/10/10) + - Considers time-series vs random split needs +5. Agent generates splitting code (Python) +6. Agent executes splitting code to create splits +7. Agent calls `register_split_datasets()`: + - **File**: `plexe/tools/datasets.py` (lines 27-92) + - Registers three datasets: + - `dataset_0_train` (TabularConvertible) + - `dataset_0_val` (TabularConvertible) + - `dataset_0_test` (TabularConvertible) + - Registers splitting code: `object_registry.register(Code, "dataset_splitting_code", Code(code))` + +**Output**: Train/validation/test datasets registered in ObjectRegistry + +--- + +### Step 4.7: Model Trainer Agent - Training Implementation +**File**: `plexe/agents/model_trainer.py` + +**What Happens** (for each solution plan): +1. Orchestrator delegates: "Train Solution 1: Linear Regression" +2. Agent retrieves: + - Training datasets: `get_training_datasets()` → `{"train": "dataset_0_train", "validation": "dataset_0_val"}` + - Solution plan from registry + - EDA report and feature engineering report (if available) +3. Agent generates training code using LLM: + - **File**: Uses `tool_model_id` for code generation + - Creates Python code that: + - Loads train/val datasets + - Implements model (e.g., sklearn LinearRegression) + - Trains model on training set + - Evaluates on validation set + - Saves model artifacts + - Code is validated and formatted +4. Agent executes training code: + - **File**: `plexe/tools/execution.py` + - Code runs in isolated environment + - Model trains on `dataset_0_train` + - Validates on `dataset_0_val` + - Generates performance metric (e.g., RMSE = 25000.0) + - Saves model artifacts (pickle files, weights, etc.) +5. Agent registers results: + - **File**: `plexe/tools/training.py` + - Creates `Solution` object with: + - training_code: generated Python code string + - performance: Metric object (name="RMSE", value=25000.0) + - model_artifacts: List[Artifact] (file paths to saved models) + - Registers Solution in ObjectRegistry +6. Process repeats for each solution (Solution 2: Random Forest, Solution 3: XGBoost) + +**Output**: Multiple trained solutions with performance metrics in ObjectRegistry + +--- + +### Step 4.8: Model Packager Agent - Inference Code Generation +**File**: `plexe/agents/model_packager.py` + +**What Happens** (for each solution): +1. Orchestrator delegates: "Generate inference code for Solution 1" +2. Agent retrieves: + - Solution object with training code and artifacts + - Input/output schemas from registry + - Feature transformer code (if exists) +3. Agent generates inference code using LLM: + - Creates Python class `PredictorImplementation` that: + - Inherits from `Predictor` interface + - **File**: `plexe/core/interfaces/predictor.py` + - Implements `__init__(self, artifacts: List[Artifact])` + - Implements `predict(self, inputs: dict) -> dict` + - Code loads model artifacts + - Applies feature transformations (if any) + - Makes predictions + - Returns dict matching output schema +4. 
Agent validates inference code: + - **File**: `plexe/tools/validation.py` + - Creates sample inputs using `create_input_sample()` + - **File**: `plexe/tools/datasets.py` (lines 97-151) + - Tests prediction on sample inputs + - Validates output matches schema +5. Agent registers inference code: + - Updates Solution object with `inference_code` attribute + - Code stored as string in Solution + +**Output**: Inference code for each solution registered + +--- + +### Step 4.9: Model Tester Agent - Final Evaluation +**File**: `plexe/agents/model_tester.py` + +**What Happens**: +1. Orchestrator delegates: "Test all solutions on test set" +2. Agent retrieves: + - Test dataset: `get_test_dataset()` → "dataset_0_test" + - All solution objects from registry +3. For each solution: + - Agent generates testing code + - Code loads model and inference code + - Runs predictions on entire test set + - Calculates final performance metrics + - Generates evaluation report (confusion matrix, feature importance, etc.) +4. Agent updates Solution objects with: + - `testing_code`: testing code string + - `model_evaluation_report`: detailed evaluation dict + - Test performance metrics + +**Output**: All solutions tested and evaluated + +--- + +## Phase 5: Solution Selection and Finalization + +### Step 5.1: Best Solution Selection +**Tool**: `get_solution_performances()` from `plexe/tools/evaluation.py` + +**What Happens**: +1. Orchestrator calls tool to get all solution performances +2. Tool retrieves all Solution objects from ObjectRegistry +3. Extracts performance metrics from each +4. Returns comparison of all solutions + +**Tool**: `register_best_solution()` from `plexe/tools/training.py` + +**What Happens**: +1. Orchestrator compares all solutions +2. Selects best based on metric (e.g., lowest RMSE) +3. Registers best solution: `object_registry.register(Solution, "best_performing_solution", best_solution)` + +--- + +### Step 5.2: Final Response Formatting +**Tool**: `format_final_orchestrator_agent_response()` from `plexe/tools/response_formatting.py` + +**What Happens**: +1. Orchestrator calls tool to format final response +2. Tool collects: + - Best solution details + - Performance metrics + - Model metadata +3. Formats into structured response dict +4. Returns to orchestrator + +--- + +### Step 5.3: PlexeAgent.run() Completion +**File**: `plexe/agents/agents.py` (lines 196-305) + +**What Happens**: +1. Orchestrator finishes execution +2. `agent.run()` returns result (AgentText or dict) +3. Code extracts best solution (lines 217-219): + - `best_solution = object_registry.get(Solution, "best_performing_solution")` + - `training_code = best_solution.training_code` + - `inference_code = best_solution.inference_code` +4. Extracts performance metrics (lines 222-245): + - Gets metric from result or solution + - Creates `Metric` object with name, value, comparator +5. Compiles inference code (lines 251-255): + - Creates new module: `types.ModuleType("predictor")` + - Executes inference code string in module namespace + - Gets `PredictorImplementation` class from module + - Instantiates: `predictor = predictor_class(best_solution.model_artifacts)` +6. Extracts additional code (lines 258-291): + - Feature transformer code (if exists) + - Dataset split code (if exists) + - Testing code (if exists) + - Evaluation report (if exists) +7. 
Creates `ModelGenerationResult` (lines 293-305): + - training_source_code: Python code string + - inference_source_code: Python code string + - feature_transformer_source_code: Python code string (or None) + - dataset_split_code: Python code string (or None) + - predictor: Predictor instance (ready to use) + - model_artifacts: List[Artifact] + - performance: Metric object + - test_performance: Metric object + - testing_source_code: Python code string (or None) + - evaluation_report: Dict (or None) + - metadata: Dict with model info + +**Output**: `ModelGenerationResult` object returned to ModelBuilder + +--- + +## Phase 6: ModelBuilder Finalization + +### Step 6.1: Extract Final Schemas +**File**: `plexe/model_builder.py` (lines 189-192) + +**What Happens**: +- Calls `get_solution_schemas("best_performing_solution")` +- **File**: `plexe/tools/schemas.py` +- Retrieves final input/output schemas (may have been refined) +- Converts to Pydantic models using `map_to_basemodel()` + +--- + +### Step 6.2: Build Metadata +**File**: `plexe/model_builder.py` (lines 194-217) + +**What Happens**: +1. Creates metadata dict with provider info +2. Updates with metadata from `ModelGenerationResult` +3. Extracts EDA reports from ObjectRegistry: + - Gets `eda_report_dataset_0` from registry + - Formats as markdown using `format_eda_report_markdown()` + - Adds to metadata + +--- + +### Step 6.3: Create Model Instance +**File**: `plexe/model_builder.py` (lines 219-236) + +**What Happens**: +1. Imports `Model` class: `from plexe.models import Model` +2. Creates Model instance: + ```python + model = Model( + intent=intent, + input_schema=final_input_schema, + output_schema=final_output_schema + ) + ``` +3. **File**: `plexe/models.py` (lines 93-138) + - `__init__` executes: + - Stores intent, schemas + - Initializes `state = ModelState.DRAFT` + - Creates unique identifier + - Creates working directory + - Initializes empty predictor, artifacts, metrics +4. Populates model with results: + - `model.identifier = model_identifier` + - `model.predictor = generated.predictor` (Predictor instance) + - `model.trainer_source = generated.training_source_code` + - `model.predictor_source = generated.inference_source_code` + - `model.feature_transformer_source = generated.feature_transformer_source_code` + - `model.dataset_splitter_source = generated.dataset_split_code` + - `model.testing_source = generated.testing_source_code` + - `model.artifacts = generated.model_artifacts` + - `model.metric = generated.test_performance` + - `model.evaluation_report = generated.evaluation_report` + - `model.metadata.update(metadata)` + - `model.training_data = training_data` (actual Dataset objects) + - `model.state = ModelState.READY` + +--- + +### Step 6.4: Final Callback Notification +**File**: `plexe/model_builder.py` (lines 238-252) + +**What Happens**: +- Calls `_notify_callbacks()` with event="build_end" +- All callbacks receive final BuildStateInfo with: + - Completed model + - Final metrics + - Source code + - Artifacts + +--- + +### Step 6.5: Return Model +**File**: `plexe/model_builder.py` (line 254) + +**What Happens**: +- Returns completed `Model` instance to user +- Model is in `ModelState.READY` state +- All code, artifacts, and metadata populated + +--- + +## Phase 7: User Makes Prediction + +### Step 7.1: User Calls model.predict() +**File**: `plexe/models.py` (lines 227-245) + +**What Happens**: +1. User calls: `model.predict({"bedrooms": 3, "bathrooms": 2, "square_footage": 1500.0})` +2. 
`predict()` method executes: + - Checks `self.state == ModelState.READY` (line 235) + - If `validate_input=True`, validates input against `self.input_schema` (line 239) + - Calls `self.predictor.predict(x)` (line 240) + - **File**: `plexe/core/interfaces/predictor.py` + - This is the `PredictorImplementation` class compiled earlier + - Executes inference code: + - Loads model artifacts + - Applies feature transformations (if any) + - Runs model forward pass + - Returns prediction dict + - If `validate_output=True`, validates output against `self.output_schema` (line 242) + - Returns prediction dict: `{"price": 250000.0}` + +**Output**: Prediction dictionary matching output schema + +--- + +## Complete File Trace Summary + +### User-Facing Files: +1. `plexe/model_builder.py` - Main entry point +2. `plexe/models.py` - Model class definition + +### Core System Files: +3. `plexe/agents/agents.py` - Multi-agent orchestrator +4. `plexe/core/object_registry.py` - Shared state registry +5. `plexe/core/state.py` - Model state definitions +6. `plexe/core/interfaces/predictor.py` - Predictor interface + +### Agent Files: +7. `plexe/agents/model_planner.py` - Solution planning +8. `plexe/agents/dataset_analyser.py` - EDA analysis +9. `plexe/agents/schema_resolver.py` - Schema inference +10. `plexe/agents/feature_engineer.py` - Feature engineering +11. `plexe/agents/dataset_splitter.py` - Dataset splitting +12. `plexe/agents/model_trainer.py` - Model training +13. `plexe/agents/model_packager.py` - Inference code generation +14. `plexe/agents/model_tester.py` - Model testing + +### Tool Files: +15. `plexe/tools/datasets.py` - Dataset operations +16. `plexe/tools/metrics.py` - Metric selection +17. `plexe/tools/training.py` - Training operations +18. `plexe/tools/execution.py` - Code execution +19. `plexe/tools/validation.py` - Code validation +20. `plexe/tools/evaluation.py` - Model evaluation +21. `plexe/tools/schemas.py` - Schema operations +22. `plexe/tools/response_formatting.py` - Response formatting + +### Utility Files: +23. `plexe/internal/common/utils/pydantic_utils.py` - Schema conversion +24. `plexe/internal/common/provider.py` - LLM provider interface +25. `plexe/internal/common/datasets/adapter.py` - Dataset adapters +26. `plexe/core/entities/solution.py` - Solution data structures +27. `plexe/internal/models/entities/artifact.py` - Artifact definitions +28. `plexe/internal/models/entities/metric.py` - Metric definitions + +--- + +## Data Flow Diagram + +``` +User Input + ↓ +ModelBuilder.build() + ↓ +ObjectRegistry (cleared, then populated) + ↓ +PlexeAgent (orchestrator + 8 specialists) + ↓ +Orchestrator Agent (coordinates workflow) + ├─→ EDA Agent → EDA Report → Registry + ├─→ Schema Resolver → Schemas → Registry (if needed) + ├─→ ML Research Agent → Solution Plans → Registry + ├─→ Feature Engineer → Transformed Data → Registry (optional) + ├─→ Dataset Splitter → Train/Val/Test → Registry + ├─→ Model Trainer → Trained Models → Registry + ├─→ Model Packager → Inference Code → Registry + └─→ Model Tester → Evaluation Reports → Registry + ↓ +Best Solution Selected → Registry + ↓ +ModelGenerationResult (extracted from registry) + ↓ +Model Instance (populated with results) + ↓ +User receives Model (ready for predictions) + ↓ +model.predict() → Predictor.predict() → Prediction +``` + +--- + +This walkthrough covers every major file and code path in the system, showing how data flows from user input through the multi-agent system to final predictions. 
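+
+---
+
+As a closing illustration, here is a minimal sketch of the kind of `PredictorImplementation` class the Model Packager Agent produces for this example. It follows the `Predictor` interface described in Step 4.8 (`__init__(self, artifacts)` and `predict(self, inputs) -> dict`); the `artifact.path` attribute, the pickled scikit-learn regressor, and the feature ordering are illustrative assumptions rather than the exact generated code.
+
+```python
+import pickle
+from typing import Dict, List
+
+from plexe.core.interfaces.predictor import Predictor
+from plexe.internal.models.entities.artifact import Artifact
+
+
+class PredictorImplementation(Predictor):
+    """Illustrative sketch: load a pickled regressor artifact and predict a price."""
+
+    def __init__(self, artifacts: List[Artifact]):
+        # Assumption: the first artifact exposes a local file path to the trained model.
+        with open(artifacts[0].path, "rb") as f:
+            self._model = pickle.load(f)
+
+    def predict(self, inputs: Dict) -> Dict:
+        # Feature order must match whatever the generated training code used.
+        features = [[inputs["bedrooms"], inputs["bathrooms"], inputs["square_footage"]]]
+        return {"price": float(self._model.predict(features)[0])}
+```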
+ diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index ce313f05..4978bab5 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -22,8 +22,6 @@ from enum import Enum from functools import total_ordering -from typing import Optional -from weakref import WeakValueDictionary class ComparisonMethod(Enum): @@ -100,140 +98,32 @@ def compare(self, value1: float, value2: float) -> int: raise ValueError("Invalid comparison method.") -# Internal cache for sharing MetricComparator instances across all metrics -# This ensures only one comparator object exists per unique (method, target, epsilon) combination -_comparator_cache: WeakValueDictionary = WeakValueDictionary() - - -def _get_shared_comparator( - comparison_method: ComparisonMethod, target: Optional[float] = None, epsilon: float = 1e-9 -) -> MetricComparator: - """ - Get or create a shared MetricComparator instance. - - This function ensures that identical comparators are reused across all Metric instances, - reducing memory usage and ensuring consistency. - - :param comparison_method: The comparison method. - :param target: Optional target value for TARGET_IS_BETTER. - :param epsilon: Tolerance for floating-point comparisons. - :return: A shared MetricComparator instance. - """ - # Create a cache key from the comparator parameters - cache_key = (comparison_method, target, epsilon) - - # Try to get existing comparator from cache - if cache_key in _comparator_cache: - return _comparator_cache[cache_key] - - # Create new comparator and cache it - comparator = MetricComparator(comparison_method, target, epsilon) - _comparator_cache[cache_key] = comparator - return comparator - - -class _MetricDefinition: - """ - Internal class representing a metric type definition. - - This separates the metric definition (what it is) from the metric value (a measurement). - Metric definitions are immutable and can be shared across multiple metric values. - - This is an internal implementation detail - users should not interact with this class directly. - """ - - def __init__(self, name: str, comparator: MetricComparator): - """ - Initialize a metric definition. - - :param name: The name of the metric. - :param comparator: The shared comparator instance. - """ - self._name = name - self._comparator = comparator - - @property - def name(self) -> str: - """The name of the metric.""" - return self._name - - @property - def comparator(self) -> MetricComparator: - """The shared comparator instance.""" - return self._comparator - - def __eq__(self, other) -> bool: - """Check if two metric definitions are equal.""" - if not isinstance(other, _MetricDefinition): - return False - return ( - self.name == other.name - and self.comparator.comparison_method == other.comparator.comparison_method - and self.comparator.target == other.comparator.target - and self.comparator.epsilon == other.comparator.epsilon - ) - - def __hash__(self) -> int: - """Hash the metric definition.""" - return hash((self.name, self.comparator.comparison_method, self.comparator.target, self.comparator.epsilon)) - - +# todo: this class is a mess as it mixes concerns of a metric and a metric value; needs refactoring @total_ordering class Metric: """ Represents a metric with a name, a value, and a comparator for determining which metric is better. 
- This class internally separates the metric definition (type) from the metric value (measurement), - and automatically shares comparator instances to reduce memory usage. - Attributes: name (str): The name of the metric (e.g., 'accuracy', 'loss'). value (float): The numeric value of the metric. - comparator (MetricComparator): The comparison logic for the metric (shared instance). + comparator (MetricComparator): The comparison logic for the metric. """ def __init__(self, name: str, value: float = None, comparator: MetricComparator = None, is_worst: bool = False): """ Initializes a Metric object. - The comparator instance is automatically shared with other metrics that have the same - comparison method, target, and epsilon values, reducing memory usage. - :param name: The name of the metric. :param value: The numeric value of the metric. :param comparator: An instance of MetricComparator for comparison logic. :param is_worst: Indicates if the metric value is the worst possible value. """ - # Store the metric value (dynamic, instance-specific) + self.name = name self.value = value + self.comparator = comparator self.is_worst = is_worst or value is None - # Get or create a shared comparator instance - if comparator is not None: - # Use the shared comparator cache to ensure we reuse identical comparators - # This is the key optimization: identical comparators are shared across all metrics - shared_comparator = _get_shared_comparator( - comparison_method=comparator.comparison_method, target=comparator.target, epsilon=comparator.epsilon - ) - else: - # If no comparator provided, raise an error as it's required for a valid metric - # This maintains the same behavior as before - raise ValueError("Metric requires a comparator. Provide a MetricComparator instance.") - - # Create internal metric definition (separates type from value) - # This is the key separation: definition (what it is) vs value (measurement) - self._definition = _MetricDefinition(name=name, comparator=shared_comparator) - - @property - def name(self) -> str: - """The name of the metric (for backward compatibility).""" - return self._definition.name - - @property - def comparator(self) -> MetricComparator: - """The shared comparator instance (for backward compatibility).""" - return self._definition.comparator - def __gt__(self, other) -> bool: """ Determine if this metric is better than another metric. 
@@ -245,24 +135,23 @@ def __gt__(self, other) -> bool: if not isinstance(other, Metric): return NotImplemented - if self.is_worst: + if self.is_worst or (self.is_worst and other.is_worst): return False if other.is_worst: return True - # Compare using definitions - this is cleaner and ensures consistency - if self._definition != other._definition: - # Provide detailed error message for backward compatibility - if self.name != other.name: - raise ValueError("Cannot compare metrics with different names.") - if self.comparator.comparison_method != other.comparator.comparison_method: - raise ValueError("Cannot compare metrics with different comparison methods.") - if ( - self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER - and self.comparator.target != other.comparator.target - ): - raise ValueError("Cannot compare 'TARGET_IS_BETTER' metrics with different target values.") + if self.name != other.name: + raise ValueError("Cannot compare metrics with different names.") + + if self.comparator.comparison_method != other.comparator.comparison_method: + raise ValueError("Cannot compare metrics with different comparison methods.") + + if ( + self.comparator.comparison_method == ComparisonMethod.TARGET_IS_BETTER + and self.comparator.target != other.comparator.target + ): + raise ValueError("Cannot compare 'TARGET_IS_BETTER' metrics with different target values.") return self.comparator.compare(self.value, other.value) < 0 @@ -282,8 +171,11 @@ def __eq__(self, other) -> bool: if self.is_worst or other.is_worst: return False - # Use definition equality for cleaner comparison - return self._definition == other._definition and self.comparator.compare(self.value, other.value) == 0 + return ( + self.name == other.name + and self.comparator.comparison_method == other.comparator.comparison_method + and self.comparator.compare(self.value, other.value) == 0 + ) def __repr__(self) -> str: """ @@ -319,4 +211,4 @@ def is_valid(self) -> bool: :return: True if the metric value is valid, False otherwise. 
""" - return self.value is not None and not (self.value != self.value) # NaN check + return self.value is not None and not (self.value != self.value) # NaN check \ No newline at end of file diff --git a/plexe/tools/datasets.py b/plexe/tools/datasets.py index 7f67487c..b72f9df6 100644 --- a/plexe/tools/datasets.py +++ b/plexe/tools/datasets.py @@ -8,15 +8,18 @@ """ import logging +import json from datetime import datetime from typing import Dict, List, Any import numpy as np import pandas as pd +from pydantic import BaseModel, ValidationError from smolagents import tool from plexe.internal.common.datasets.adapter import DatasetAdapter from plexe.internal.common.datasets.interface import TabularConvertible +from plexe.internal.common.provider import Provider from plexe.core.object_registry import ObjectRegistry from plexe.internal.models.entities.code import Code @@ -79,7 +82,7 @@ def register_split_datasets( dataset_sizes["validation"].append(len(val_ds)) dataset_sizes["test"].append(len(test_ds)) - logger.debug( + logger.debug( f"✅ Registered custom split of dataset {dataset_name} into train/validation/test with sizes " f"{len(train_ds)}/{len(val_ds)}/{len(test_ds)}" ) @@ -109,35 +112,78 @@ def create_input_sample(n_samples: int = 5) -> bool: input_schema = object_registry.get(dict, "input_schema") try: - # Create synthetic sample data that matches the schema + # Create a Pydantic model dynamically from the schema for validation + schema_fields = {} + for field_name, field_type in input_schema.items(): + # Map string type names to Python types + type_mapping = { + "int": int, + "float": float, + "bool": bool, + "str": str, + "List[int]": List[int], + "List[float]": List[float], + "List[bool]": List[bool], + "List[str]": List[str], + } + python_type = type_mapping.get(field_type, Any) + schema_fields[field_name] = (python_type, ...) + + SampleModel = type("SampleModel", (BaseModel,), schema_fields) + + # Use LLM to generate sensible sample values based on field names and types + provider = Provider("openai/gpt-4o-mini") input_sample_dicts = [] - # Generate synthetic examples + # Generate samples using LLM for i in range(n_samples): - sample = {} - for field_name, field_type in input_schema.items(): - # Generate appropriate sample values based on type - if field_type == "int": - sample[field_name] = i * 10 - elif field_type == "float": - sample[field_name] = i * 10.5 - elif field_type == "bool": - sample[field_name] = i % 2 == 0 - elif field_type == "str": - sample[field_name] = f"sample_{field_name}_{i}" - elif field_type == "List[int]": - sample[field_name] = [i * 10, i * 20, i * 30] - elif field_type == "List[float]": - sample[field_name] = [i * 10.5, i * 20.5, i * 30.5] - elif field_type == "List[bool]": - sample[field_name] = [True, False, i % 2 == 0] - elif field_type == "List[str]": - sample[field_name] = [f"item_{i}_1", f"item_{i}_2", f"item_{i}_3"] - else: - sample[field_name] = None - input_sample_dicts.append(sample) - - # TODO: we should use an LLM call to generate sensible values; then validate using pydantic + # Create prompt for LLM to generate realistic sample + schema_description = ", ".join([f"{name}: {type_str}" for name, type_str in input_schema.items()]) + user_prompt = f"""Generate a realistic sample data point for a machine learning model with the following schema: +{schema_description} + +Generate realistic, sensible values that make sense for each field name. 
For example: +- If field is "age", use a realistic age like 25 or 45, not 1000 or -5 +- If field is "email", use a realistic email format like "user@example.com" +- If field is "price", use a realistic price value +- Make values diverse across different samples""" + + try: + # Get LLM response with Pydantic validation + response_str = provider.query( + system_message="You are a data generation assistant. Generate realistic sample data that matches the given schema.", + user_message=user_prompt, + response_format=SampleModel, + ) + + # Parse and validate the response using Pydantic + response_dict = json.loads(response_str) if isinstance(response_str, str) else response_str + validated_sample = SampleModel(**response_dict) + input_sample_dicts.append(validated_sample.model_dump()) + except (json.JSONDecodeError, ValidationError, Exception) as e: + # Fallback to simple synthetic generation if LLM fails + logger.warning(f"LLM generation failed for sample {i}, using fallback: {str(e)}") + sample = {} + for field_name, field_type in input_schema.items(): + if field_type == "int": + sample[field_name] = i * 10 + elif field_type == "float": + sample[field_name] = i * 10.5 + elif field_type == "bool": + sample[field_name] = i % 2 == 0 + elif field_type == "str": + sample[field_name] = f"sample_{field_name}_{i}" + elif field_type == "List[int]": + sample[field_name] = [i * 10, i * 20, i * 30] + elif field_type == "List[float]": + sample[field_name] = [i * 10.5, i * 20.5, i * 30.5] + elif field_type == "List[bool]": + sample[field_name] = [True, False, i % 2 == 0] + elif field_type == "List[str]": + sample[field_name] = [f"item_{i}_1", f"item_{i}_2", f"item_{i}_3"] + else: + sample[field_name] = None + input_sample_dicts.append(sample) # Register the input sample in the registry for validation tool to use object_registry.register(list, "predictor_input_sample", input_sample_dicts, overwrite=True, immutable=True) From cbcee7c9559d2434ed59cdb5d8fea69f109cf9ce Mon Sep 17 00:00:00 2001 From: aumpatel Date: Tue, 2 Dec 2025 17:19:56 -0500 Subject: [PATCH 5/5] style: format metric.py and datasets.py with black - Fix black formatting issues for CI compliance - Format both files to match project style standards --- plexe/internal/models/entities/metric.py | 2 +- plexe/tools/datasets.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/plexe/internal/models/entities/metric.py b/plexe/internal/models/entities/metric.py index 4978bab5..f5355984 100644 --- a/plexe/internal/models/entities/metric.py +++ b/plexe/internal/models/entities/metric.py @@ -211,4 +211,4 @@ def is_valid(self) -> bool: :return: True if the metric value is valid, False otherwise. """ - return self.value is not None and not (self.value != self.value) # NaN check \ No newline at end of file + return self.value is not None and not (self.value != self.value) # NaN check diff --git a/plexe/tools/datasets.py b/plexe/tools/datasets.py index b72f9df6..5e906062 100644 --- a/plexe/tools/datasets.py +++ b/plexe/tools/datasets.py @@ -82,7 +82,7 @@ def register_split_datasets( dataset_sizes["validation"].append(len(val_ds)) dataset_sizes["test"].append(len(test_ds)) - logger.debug( + logger.debug( f"✅ Registered custom split of dataset {dataset_name} into train/validation/test with sizes " f"{len(train_ds)}/{len(val_ds)}/{len(test_ds)}" )
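
Note on the `create_input_sample` change above: the pattern it relies on, deriving a Pydantic model from the registered string-typed schema and validating each generated sample against it, can be sketched in isolation as follows. This is a standalone illustration only, not the repository code: the schema and `_DEFAULTS` values are hypothetical, Pydantic v2 is assumed (as the patch's `model_dump()` call implies), and it uses `pydantic.create_model`, Pydantic's documented helper for dynamic model creation, rather than the `type()`-based construction in the diff.

```python
from typing import Any, Dict, List

from pydantic import ValidationError, create_model

# String-typed schema of the kind stored in the ObjectRegistry under "input_schema".
input_schema: Dict[str, str] = {"bedrooms": "int", "bathrooms": "int", "square_footage": "float"}

# Map the schema's type names onto Python types (mirrors the patch's type_mapping).
TYPE_MAPPING: Dict[str, Any] = {
    "int": int,
    "float": float,
    "bool": bool,
    "str": str,
    "List[int]": List[int],
    "List[float]": List[float],
    "List[bool]": List[bool],
    "List[str]": List[str],
}

# Deterministic defaults for the fallback path (scalar fields only, for brevity).
_DEFAULTS: Dict[str, Any] = {"int": 0, "float": 0.0, "bool": False, "str": ""}

# Build a Pydantic model dynamically from the schema; each field is required (...).
SampleModel = create_model(
    "SampleModel",
    **{name: (TYPE_MAPPING.get(type_str, Any), ...) for name, type_str in input_schema.items()},
)


def validate_sample(candidate: Dict[str, Any]) -> Dict[str, Any]:
    """Validate a generated sample against the schema, falling back to synthetic values."""
    try:
        return SampleModel(**candidate).model_dump()
    except ValidationError:
        # Mirrors the spirit of the patch's except branch: synthesize simple values.
        return {name: _DEFAULTS.get(type_str) for name, type_str in input_schema.items()}


print(validate_sample({"bedrooms": 3, "bathrooms": 2, "square_footage": 1500.0}))
```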