From aec7c4b75b5935cccf9ce8c5da329d4e13303c57 Mon Sep 17 00:00:00 2001 From: eosho Date: Tue, 30 Dec 2025 12:23:54 -0500 Subject: [PATCH 1/2] refactor: streamline visualization configuration and executor selection --- docs/CONFIGURATION.md | 25 ++--- docs/VISUALIZATION.md | 21 ++-- pyproject.toml | 1 + scripts/generate_diagrams.py | 63 +++++++++++ src/data_agent/config.py | 66 +++++------- src/data_agent/config/amex.yaml | 5 - .../config/schema/agent_config.schema.json | 19 ---- src/data_agent/config_loader.py | 4 - src/data_agent/executors/__init__.py | 61 ++++------- src/data_agent/executors/azure_sessions.py | 16 +-- src/data_agent/executors/base.py | 8 +- src/data_agent/executors/local.py | 100 ++++++++++++++++++ src/data_agent/graph.py | 16 ++- src/data_agent/ui/app.py | 4 - uv.lock | 13 ++- 15 files changed, 250 insertions(+), 172 deletions(-) create mode 100644 scripts/generate_diagrams.py create mode 100644 src/data_agent/executors/local.py diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index aaa17ba..f2f2c27 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -35,8 +35,6 @@ data_agents: blocked_functions: - pg_sleep - pg_read_file - code_interpreter: - enabled: true system_prompt: | You are an SQL assistant... {schema_context} @@ -58,22 +56,21 @@ data_agents: ## Code Interpreter (Data Visualization) -Enable the code interpreter to generate charts and visualizations from query results. When enabled, the LLM can detect visualization intent (e.g., "show me a chart", "visualize", "plot") and generate matplotlib code to create charts. +The data agent can generate charts and visualizations from query results. When the LLM detects visualization intent (e.g., "show me a chart", "visualize", "plot"), it generates matplotlib code to create charts. -```yaml -code_interpreter: - enabled: true - azure_sessions_endpoint: ${AZURE_SESSIONS_POOL_ENDPOINT} -``` +Visualization is **automatically enabled** - no YAML configuration needed. The executor is selected based on environment: -| Setting | Description | Default | -|---------|-------------|---------| -| `enabled` | Enable/disable visualization generation | `false` | -| `azure_sessions_endpoint` | Azure Container Apps session pool management endpoint URL | - | +| Environment | Executor | Use Case | +|-------------|----------|----------| +| `AZURE_SESSIONS_POOL_ENDPOINT` set | Azure Sessions | Production (secure, Hyper-V isolation) | +| Not set | Local executor | Development (no sandboxing) | -**Note:** Visualization requires Azure Container Apps Dynamic Sessions for secure, isolated code execution. +```bash +# Production: Set the Azure Sessions endpoint +export AZURE_SESSIONS_POOL_ENDPOINT="https://eastus.dynamicsessions.io/subscriptions/.../sessionPools/..." +``` -See [VISUALIZATION.md](VISUALIZATION.md) for complete setup instructions, architecture details, and troubleshooting. +See [VISUALIZATION.md](VISUALIZATION.md) for Azure setup instructions and troubleshooting. ## SQL Validation diff --git a/docs/VISUALIZATION.md b/docs/VISUALIZATION.md index 0797446..531a854 100644 --- a/docs/VISUALIZATION.md +++ b/docs/VISUALIZATION.md @@ -120,23 +120,16 @@ Or in `.env`: AZURE_SESSIONS_POOL_ENDPOINT=https://eastus.dynamicsessions.io/subscriptions/.../sessionPools/... ``` -### YAML Configuration +### Executor Selection -Enable visualization in your agent config: +The system automatically selects the executor based on environment: -```yaml -data_agents: - - name: "sales_agent" - # ... other config ... 
- code_interpreter: - enabled: true - azure_sessions_endpoint: ${AZURE_SESSIONS_POOL_ENDPOINT} -``` +| `AZURE_SESSIONS_POOL_ENDPOINT` | Executor | Use Case | +|-------------------------------|----------|----------| +| Set | Azure Sessions | Production (secure, Hyper-V isolation) | +| Not set | Local Python REPL | Development (fast, no sandboxing) | -| Setting | Description | Default | -|---------|-------------|---------| -| `enabled` | Enable/disable visualization | `false` | -| `azure_sessions_endpoint` | Session pool management endpoint URL | - | +**No YAML configuration needed** - visualization is always enabled, with the executor determined by environment. ### System Prompt diff --git a/pyproject.toml b/pyproject.toml index 840305c..612dca7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ dependencies = [ "psycopg2>=2.9.11", "langchain-azure-dynamic-sessions>=0.2.0", "matplotlib>=3.10.8", + "tabulate>=0.9.0", ] [project.scripts] diff --git a/scripts/generate_diagrams.py b/scripts/generate_diagrams.py new file mode 100644 index 0000000..da68832 --- /dev/null +++ b/scripts/generate_diagrams.py @@ -0,0 +1,63 @@ +"""Generate flow diagrams for documentation using LangGraph visualization. + +This script generates PNG images for the data agent and intent detection flows +using LangGraph's built-in visualization. + +Usage: + uv run python scripts/generate_diagrams.py +""" + +import os +from pathlib import Path + +from dotenv import load_dotenv + +load_dotenv() + + +def main(): + """Generate diagrams from LangGraph and save to docs folder.""" + from unittest.mock import MagicMock + + from langchain_openai import AzureChatOpenAI + + from data_agent.config import CONFIG_DIR + from data_agent.config_loader import ConfigLoader + from data_agent.graph import DataAgentGraph + + docs_dir = Path(__file__).parent.parent / "docs" + docs_dir.mkdir(exist_ok=True) + + # Load a config to get a data agent graph + config = ConfigLoader.load(CONFIG_DIR / "amex.yaml") + + if config.data_agents: + agent_config = config.data_agents[0] + + # Create LLM + llm = AzureChatOpenAI( + azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), + api_key=os.getenv("AZURE_OPENAI_API_KEY"), + azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"), + api_version="2024-08-01-preview", + temperature=0, + ) + + # Create a mock datasource for diagram generation (we won't execute queries) + mock_datasource = MagicMock() + + # Build the graph and compile to get visualization + graph_builder = DataAgentGraph(llm, mock_datasource, agent_config) + compiled_graph = graph_builder.compile() + + # Generate PNG using LangGraph's visualization + png_data = compiled_graph.get_graph().draw_mermaid_png() + output_path = docs_dir / "data_agent_graph.png" + output_path.write_bytes(png_data) + print(f"Generated: {output_path}") + + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/src/data_agent/config.py b/src/data_agent/config.py index e411e25..173cff0 100644 --- a/src/data_agent/config.py +++ b/src/data_agent/config.py @@ -276,6 +276,31 @@ class BigQueryDatasource(BaseSettings): | BigQueryDatasource ) + +class VisualizationSettings(BaseSettings): + """Settings for code execution/visualization. + + Attributes: + azure_sessions_pool_endpoint: Azure Container Apps session pool endpoint. + If set, uses secure Azure Sessions. Otherwise falls back to local Python REPL. 
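+
+    Example (illustrative):
+        settings = VisualizationSettings()  # reads AZURE_SESSIONS_POOL_ENDPOINT from env or .env
+        use_azure = settings.use_azure_sessions  # True only when the endpoint is set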
+ """ + + model_config = SettingsConfigDict( + env_file=".env", + extra="ignore", + ) + + azure_sessions_pool_endpoint: str | None = Field( + default=None, + description="Azure Container Apps session pool endpoint for secure code execution.", + ) + + @property + def use_azure_sessions(self) -> bool: + """Check if Azure Sessions should be used.""" + return self.azure_sessions_pool_endpoint is not None + + DATASOURCE_TYPES: dict[str, type[Datasource]] = { "databricks": DatabricksDatasource, "cosmos": CosmosDatasource, @@ -322,44 +347,6 @@ def from_dict(cls, data: dict[str, Any]) -> "ValidationConfig": ) -@dataclass -class CodeInterpreterConfig: - """Configuration for code interpreter / visualization feature. - - Visualization requires deploying an Azure Container Apps session pool - for secure, isolated code execution. See docs/CONFIGURATION.md. - - Attributes: - enabled: Whether to enable the code interpreter feature. - azure_sessions_endpoint: Pool management endpoint for Azure Sessions. - Can also be set via AZURE_SESSIONS_POOL_ENDPOINT environment variable. - """ - - enabled: bool = False - azure_sessions_endpoint: str | None = None - - def __post_init__(self) -> None: - """Validate configuration.""" - import os - - if self.enabled: - endpoint = self.azure_sessions_endpoint or os.getenv( - "AZURE_SESSIONS_POOL_ENDPOINT" - ) - if not endpoint: - raise ValueError( - "azure_sessions_endpoint is required when code_interpreter is enabled. " - "Set in config or via AZURE_SESSIONS_POOL_ENDPOINT environment variable." - ) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "CodeInterpreterConfig": - return cls( - enabled=data.get("enabled", False), - azure_sessions_endpoint=data.get("azure_sessions_endpoint"), - ) - - @dataclass class DataAgentConfig: """Configuration for a single data agent.""" @@ -369,9 +356,6 @@ class DataAgentConfig: datasource: Datasource | None = None llm_config: LLMConfig = field(default_factory=LLMConfig) validation_config: ValidationConfig = field(default_factory=ValidationConfig) - code_interpreter: CodeInterpreterConfig = field( - default_factory=CodeInterpreterConfig - ) system_prompt: str = "" response_prompt: str = "" table_schemas: list[TableSchema] = field(default_factory=list) diff --git a/src/data_agent/config/amex.yaml b/src/data_agent/config/amex.yaml index f7650e1..c8ec50a 100644 --- a/src/data_agent/config/amex.yaml +++ b/src/data_agent/config/amex.yaml @@ -42,11 +42,6 @@ data_agents: blocked_functions: - session_user - external_query - # Enable code interpreter for data visualization - # Requires Azure Container Apps Dynamic Sessions for secure execution - code_interpreter: - enabled: true - azure_sessions_endpoint: https://eastus.dynamicsessions.io/subscriptions/e98a7bdd-1e97-452c-939c-4edf569d31f6/resourceGroups/fresh-mcp-rg/sessionPools/session-pool-viz system_prompt: | You are an expert SQL assistant for a financial transactions database running on Google BigQuery. diff --git a/src/data_agent/config/schema/agent_config.schema.json b/src/data_agent/config/schema/agent_config.schema.json index 21c6c0a..98e601f 100644 --- a/src/data_agent/config/schema/agent_config.schema.json +++ b/src/data_agent/config/schema/agent_config.schema.json @@ -488,22 +488,6 @@ } } }, - "code_interpreter_config": { - "type": "object", - "description": "Code interpreter configuration for data visualization. 
Requires Azure Container Apps Dynamic Sessions for secure, isolated code execution.", - "additionalProperties": false, - "properties": { - "enabled": { - "type": "boolean", - "description": "Enable code interpreter for visualization. Requires azure_sessions_endpoint to be configured.", - "default": false - }, - "azure_sessions_endpoint": { - "type": "string", - "description": "Azure Container Apps session pool management endpoint URL. Required when enabled. Can also be set via AZURE_SESSIONS_POOL_ENDPOINT environment variable." - } - } - }, "data_agent_config": { "type": "object", "description": "Configuration for a single data agent", @@ -526,9 +510,6 @@ "validation": { "$ref": "#/$defs/validation_config" }, - "code_interpreter": { - "$ref": "#/$defs/code_interpreter_config" - }, "system_prompt": { "type": "string", "description": "System prompt for SQL generation" diff --git a/src/data_agent/config_loader.py b/src/data_agent/config_loader.py index 9e1a28e..c109912 100644 --- a/src/data_agent/config_loader.py +++ b/src/data_agent/config_loader.py @@ -16,7 +16,6 @@ CONFIG_DIR, DATASOURCE_TYPES, AgentConfig, - CodeInterpreterConfig, DataAgentConfig, Datasource, FewShotExample, @@ -167,9 +166,6 @@ def _parse_data_agent(cls, data: dict[str, Any]) -> DataAgentConfig: datasource=cls._parse_datasource(data.get("datasource")), llm_config=LLMConfig.from_dict(data.get("llm", {})), validation_config=ValidationConfig.from_dict(data.get("validation", {})), - code_interpreter=CodeInterpreterConfig.from_dict( - data.get("code_interpreter", {}) - ), system_prompt=data.get("system_prompt", ""), response_prompt=data.get("response_prompt", ""), table_schemas=[ diff --git a/src/data_agent/executors/__init__.py b/src/data_agent/executors/__init__.py index ccf95c0..2db9355 100644 --- a/src/data_agent/executors/__init__.py +++ b/src/data_agent/executors/__init__.py @@ -1,30 +1,10 @@ -"""Code execution backends for sandboxed Python execution. +"""Code execution backends for sandboxed Python execution.""" -This module provides the executor for running LLM-generated Python code -in Azure Container Apps Dynamic Sessions with Hyper-V isolation. - -Usage: - from data_agent.executors import create_executor - - # Create from configuration - executor = create_executor(config.code_interpreter) - - # Execute code - result = await executor.execute("print('Hello')") - if result.success: - print(result.output) - -Note: - Visualization support requires deploying an Azure Container Apps - session pool. See docs/CONFIGURATION.md for setup instructions. -""" - -from typing import TYPE_CHECKING +import logging from data_agent.executors.base import CodeExecutor, ExecutionResult, ExecutionStatus -if TYPE_CHECKING: - from data_agent.config import CodeInterpreterConfig +logger = logging.getLogger(__name__) __all__ = [ "CodeExecutor", @@ -34,26 +14,27 @@ ] -def create_executor(config: "CodeInterpreterConfig") -> CodeExecutor: - """Create a code executor based on configuration. - - Args: - config: CodeInterpreterConfig with endpoint settings. +def create_executor() -> CodeExecutor: + """Create a code executor based on environment configuration. Returns: - Configured AzureSessionsExecutor instance. - - Raises: - ValueError: If azure_sessions_endpoint is not configured. - TypeError: If config is not a CodeInterpreterConfig. + Configured CodeExecutor instance. 
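+
+    Example:
+        executor = create_executor()
+        result = await executor.execute("print('Hello')")
+        if result.success:
+            print(result.output)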
""" - from data_agent.config import CodeInterpreterConfig + from data_agent.config import VisualizationSettings + + settings = VisualizationSettings() - if not isinstance(config, CodeInterpreterConfig): - raise TypeError(f"Expected CodeInterpreterConfig, got {type(config)}") + if settings.use_azure_sessions: + from data_agent.executors.azure_sessions import AzureSessionsExecutor - from data_agent.executors.azure_sessions import AzureSessionsExecutor + logger.info("Using Azure Sessions executor") + return AzureSessionsExecutor( + pool_management_endpoint=settings.azure_sessions_pool_endpoint, + ) + else: + from data_agent.executors.local import LocalExecutor - return AzureSessionsExecutor( - pool_management_endpoint=config.azure_sessions_endpoint, - ) + logger.info( + "Using local Python REPL executor (development only, no sandboxing)" + ) + return LocalExecutor() diff --git a/src/data_agent/executors/azure_sessions.py b/src/data_agent/executors/azure_sessions.py index e78da45..f91a8fd 100644 --- a/src/data_agent/executors/azure_sessions.py +++ b/src/data_agent/executors/azure_sessions.py @@ -23,21 +23,7 @@ class AzureSessionsExecutor(CodeExecutor): - """Execute code in Azure Container Apps dynamic sessions with Hyper-V isolation. - - The tool uses DefaultAzureCredential internally for authentication. - Ensure you have the "Azure ContainerApps Session Executor" role assigned. - - Configuration: - pool_management_endpoint: Session pool endpoint URL - Can be set via config or AZURE_SESSIONS_POOL_ENDPOINT env var - - Example: - executor = AzureSessionsExecutor( - pool_management_endpoint="https://eastus.dynamicsessions.io/..." - ) - result = await executor.execute("print('Hello from Azure!')") - """ + """Execute code in Azure Container Apps dynamic sessions with Hyper-V isolation.""" def __init__( self, diff --git a/src/data_agent/executors/base.py b/src/data_agent/executors/base.py index cfa7042..619cccc 100644 --- a/src/data_agent/executors/base.py +++ b/src/data_agent/executors/base.py @@ -45,11 +45,9 @@ def success(self) -> bool: class CodeExecutor(ABC): """Abstract base class for code execution backends. - Example: - executor = AzureSessionsExecutor(pool_management_endpoint="https://...") - result = await executor.execute("print('Hello')") - if result.success: - print(result.output) + Implementations: + - AzureSessionsExecutor: Production (Hyper-V isolation via Azure Container Apps) + - LocalExecutor: Development (no sandboxing, uses exec()) """ @abstractmethod diff --git a/src/data_agent/executors/local.py b/src/data_agent/executors/local.py new file mode 100644 index 0000000..88e48ff --- /dev/null +++ b/src/data_agent/executors/local.py @@ -0,0 +1,100 @@ +"""Local Python executor for development. + +Warning: + The local executor runs code directly on the host machine without sandboxing. + Only use in development environments with trusted code generation. +""" + +import io +import logging +from contextlib import redirect_stdout + +from data_agent.executors.base import CodeExecutor, ExecutionResult, ExecutionStatus + +logger = logging.getLogger(__name__) + + +class LocalExecutor(CodeExecutor): + """Local Python executor for development. + + Executes Python code using exec(). Captures matplotlib output + by hooking plt.show() to save figures to a buffer. + """ + + def __init__(self) -> None: + """Initialize the local executor.""" + logger.warning( + "LocalExecutor runs code without sandboxing. Use only in development." 
+ ) + + async def execute(self, code: str, timeout: float = 30.0) -> ExecutionResult: + """Execute Python code locally. + + Args: + code: Python code to execute. + timeout: Execution timeout in seconds (not enforced locally). + + Returns: + ExecutionResult with output, status, and any captured image. + """ + # Set up execution environment + exec_globals: dict = {} + output_buffer = io.StringIO() + image_buffer = io.BytesIO() + image_captured = False + + # Set up matplotlib with Agg backend and custom show + setup_code = """ +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +""" + try: + exec(setup_code, exec_globals) + except Exception as e: + return ExecutionResult( + status=ExecutionStatus.ERROR, + output="", + error=f"Failed to set up matplotlib: {e}", + ) + + # Create custom show function that captures the figure + def capture_show(*args, **kwargs): + nonlocal image_captured + plt = exec_globals.get("plt") + if plt: + image_buffer.seek(0) + image_buffer.truncate() + plt.savefig(image_buffer, format="png", dpi=150, bbox_inches="tight") + image_buffer.seek(0) + image_captured = True + plt.close("all") + + # Replace plt.show with our capture function + exec_globals["plt"].show = capture_show + + try: + with redirect_stdout(output_buffer): + exec(code, exec_globals) + + output = output_buffer.getvalue() + + if image_captured: + return ExecutionResult( + status=ExecutionStatus.SUCCESS, + output=output, + files={"visualization.png": image_buffer.getvalue()}, + ) + + return ExecutionResult( + status=ExecutionStatus.SUCCESS, + output=output, + ) + + except Exception as e: + logger.exception("Local execution failed") + return ExecutionResult( + status=ExecutionStatus.ERROR, + output=output_buffer.getvalue(), + error=str(e), + ) diff --git a/src/data_agent/graph.py b/src/data_agent/graph.py index f9c050a..b1e44d5 100644 --- a/src/data_agent/graph.py +++ b/src/data_agent/graph.py @@ -64,11 +64,9 @@ def __init__( self._nodes = DataAgentNodes(llm, datasource, config, max_retries) self._response_node = ResponseNode(llm, config) - # Initialize visualization node if code_interpreter is enabled - self._viz_node: VisualizationNode | None = None - if config.code_interpreter.enabled: - executor = create_executor(config.code_interpreter) - self._viz_node = VisualizationNode(llm, executor) + # Initialize visualization node + executor = create_executor() + self._viz_node = VisualizationNode(llm, executor) def _should_retry(self, state: AgentState) -> str: """Determine if SQL generation should be retried. 
@@ -99,7 +97,7 @@ def _route_after_execute(self, state: AgentState) -> str: """ if state.get("error"): return "error" - if self._viz_node and state.get("visualization_requested", False): + if state.get("visualization_requested", False): return "visualize" return "respond" @@ -118,8 +116,7 @@ def build(self) -> StateGraph: graph.add_node("retry_sql", self._nodes.retry_sql) graph.add_node("execute_query", self._nodes.execute_query) graph.add_node("generate_response", self._response_node.generate_response) - if self._viz_node: - graph.add_node("visualize_data", self._viz_node.generate_visualization) + graph.add_node("visualize_data", self._viz_node.generate_visualization) graph.set_entry_point("generate_sql") @@ -139,8 +136,7 @@ def build(self) -> StateGraph: "respond": "generate_response", }, ) - if self._viz_node: - graph.add_edge("visualize_data", "generate_response") + graph.add_edge("visualize_data", "generate_response") graph.add_edge("generate_response", END) return graph diff --git a/src/data_agent/ui/app.py b/src/data_agent/ui/app.py index c0c5795..cfd9830 100644 --- a/src/data_agent/ui/app.py +++ b/src/data_agent/ui/app.py @@ -219,13 +219,9 @@ async def on_message(message: cl.Message): if isinstance(query_result, dict): rows = query_result.get("rows", []) columns = query_result.get("columns", []) - row_count = query_result.get("row_count") or len(rows) if rows else 0 else: rows = getattr(query_result, "rows", []) columns = getattr(query_result, "columns", []) - row_count = ( - getattr(query_result, "row_count", None) or len(rows) if rows else 0 - ) if columns and rows: df = pd.DataFrame(rows, columns=columns) diff --git a/uv.lock b/uv.lock index 7adf29a..6dda5dd 100644 --- a/uv.lock +++ b/uv.lock @@ -819,7 +819,7 @@ wheels = [ [[package]] name = "data-agent" -version = "0.5.0" +version = "0.3.0" source = { editable = "." 
} dependencies = [ { name = "a2a-sdk", extra = ["http-server"] }, @@ -854,6 +854,7 @@ dependencies = [ { name = "sqlalchemy-bigquery" }, { name = "sqlglot", extra = ["rs"] }, { name = "structlog" }, + { name = "tabulate" }, { name = "typer" }, { name = "typing-extensions" }, { name = "uvicorn" }, @@ -926,6 +927,7 @@ requires-dist = [ { name = "sqlalchemy-bigquery", specifier = ">=1.16.0" }, { name = "sqlglot", extras = ["rs"], specifier = ">=26.0.0" }, { name = "structlog", specifier = ">=24.0.0" }, + { name = "tabulate", specifier = ">=0.9.0" }, { name = "typer", specifier = ">=0.15.0" }, { name = "typing-extensions", specifier = ">=4.12" }, { name = "uvicorn", specifier = ">=0.38.0" }, @@ -4846,6 +4848,15 @@ version = "2.0.3" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/8d/dd/d4dd75843692690d81f0a4b929212a1614b25d4896aa7c72f4c3546c7e3d/syncer-2.0.3.tar.gz", hash = "sha256:4340eb54b54368724a78c5c0763824470201804fe9180129daf3635cb500550f", size = 11512, upload-time = "2023-05-08T07:50:17.963Z" } +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + [[package]] name = "tenacity" version = "9.1.2" From a51bfcfc03a25f9dfed42b8e9754ad744b513da9 Mon Sep 17 00:00:00 2001 From: eosho Date: Tue, 30 Dec 2025 12:40:48 -0500 Subject: [PATCH 2/2] refactor: update README and visualization documentation, improve code execution logging --- README.md | 19 ++-- docs/VISUALIZATION.md | 137 +++++--------------------- src/data_agent/executors/__init__.py | 9 +- src/data_agent/nodes/visualization.py | 22 ++--- 4 files changed, 41 insertions(+), 146 deletions(-) diff --git a/README.md b/README.md index 3486cd6..4444457 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ -
+
```diff + ╔╦╗╔═╗╔╦╗╔═╗ ╔═╗╔═╗╔═╗╔╗╔╔╦╗ + ║║╠═╣ ║ ╠═╣ ╠═╣║ ╦║╣ ║║║ ║ + ═╩╝╩ ╩ ╩ ╩ ╩ ╩ ╩╚═╝╚═╝╝╚╝ ╩ -+ -+ Natural Language → SQL Query Agent + +[ Natural Language → SQL Query Agent ] ```
@@ -64,11 +64,11 @@ Generates, validates, and executes SQL queries with retry logic. ### Installation ```bash -git clone +git clone https://github.com/eosho/langchain_data_agent cd langchain_data_agent -uv sync +uv sync --all-extras cp .env.example .env -# Edit .env with your Azure OpenAI credentials +# Edit .env with your values ``` ### CLI Usage @@ -238,7 +238,7 @@ The platform includes built-in configuration for these databases: | PostgreSQL | `postgres` | postgres | | Azure SQL | `azure_sql` | tsql | | Azure Synapse | `synapse` | tsql | -| Azure Cosmos DB | `cosmos` | tsql | +| Azure Cosmos DB | `cosmos` | cosmosdb | | Databricks SQL | `databricks` | databricks | | Google BigQuery | `bigquery` | bigquery | | MySQL | `mysql` | mysql | @@ -250,13 +250,10 @@ The platform includes built-in configuration for these databases: ```bash # Format and lint -uv run poe format +uv run pre-commit run --all-files # Run tests uv run pytest - -# Type check -uv run basedpyright ``` ## License diff --git a/docs/VISUALIZATION.md b/docs/VISUALIZATION.md index 531a854..ef962d0 100644 --- a/docs/VISUALIZATION.md +++ b/docs/VISUALIZATION.md @@ -27,84 +27,17 @@ Visualization requires Azure Container Apps Dynamic Sessions. This provides: ## Azure Setup -### 1. Create a Container Apps Environment +Follow the [Azure Container Apps Dynamic Sessions with LangChain tutorial](https://learn.microsoft.com/en-us/azure/container-apps/sessions-tutorial-langchain) to: -If you don't already have one: +1. Create a Container Apps session pool +2. Get the pool management endpoint +3. Assign the `Azure ContainerApps Session Executor` role to your identity -```bash -az containerapp env create \ - --name aca-env \ - --resource-group rg-data-agent \ - --location eastus -``` - -### 2. Create the Session Pool - -```bash -az containerapp sessionpool create \ - --name session-pool-viz \ - --resource-group rg-data-agent \ - --container-type PythonLTS \ - --max-sessions 100 \ - --cooldown-period 300 \ - --location eastus -``` - -**Parameters:** -- `--container-type PythonLTS`: Python runtime with common data science packages -- `--max-sessions`: Maximum concurrent sessions -- `--cooldown-period`: Seconds before idle session is terminated - -### 3. Get the Pool Management Endpoint - -```bash -az containerapp sessionpool show \ - --name session-pool-viz \ - --resource-group rg-data-agent \ - --query "properties.poolManagementEndpoint" -o tsv -``` - -This returns a URL like: +Once complete, you'll have an endpoint URL like: ``` https://eastus.dynamicsessions.io/subscriptions//resourceGroups//sessionPools/ ``` -### 4. Assign the Executor Role - -Grant your identity permission to execute code in the session pool: - -```bash -# Get your user ID -USER_ID=$(az ad signed-in-user show --query id -o tsv) - -# Get the session pool resource ID -POOL_ID=$(az containerapp sessionpool show \ - --name session-pool-viz \ - --resource-group rg-data-agent \ - --query id -o tsv) - -# Assign the role -az role assignment create \ - --role "Azure ContainerApps Session Executor" \ - --assignee $USER_ID \ - --scope $POOL_ID -``` - -**Note:** For service principals or managed identities, replace `$USER_ID` with the appropriate object ID. - -### 5. 
Install the SDK - -```bash -pip install langchain-azure-dynamic-sessions -``` - -Or add to your `pyproject.toml`: -```toml -dependencies = [ - "langchain-azure-dynamic-sessions>=0.1.0", -] -``` - ## Configuration ### Environment Variable @@ -152,43 +85,23 @@ system_prompt: | ## How It Works -``` -┌─────────────────────────────────────────────────────────────────┐ -│ User Query │ -│ "Show me a bar chart of sales by region" │ -└─────────────────────────────────────────┬───────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ SQL Generation LLM │ -│ Generates SQL + sets visualization_requested: true │ -└─────────────────────────────────────────┬───────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Database Query │ -│ Execute SQL, return result rows │ -└─────────────────────────────────────────┬───────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Visualization LLM │ -│ Generates matplotlib code based on data + user question │ -└─────────────────────────────────────────┬───────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Azure Container Apps Dynamic Sessions │ -│ • Code executed in Hyper-V isolated container │ -│ • plt.show() output captured automatically │ -│ • Image returned as base64 PNG │ -└─────────────────────────────────────────┬───────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Response │ -│ Text explanation + embedded chart image │ -└─────────────────────────────────────────────────────────────────┘ +```mermaid +sequenceDiagram + participant User + participant SQL LLM + participant Database + participant Viz LLM + participant Executor + + User->>SQL LLM: "Show me a bar chart of sales by region" + SQL LLM->>SQL LLM: Generate SQL + set visualization_requested: true + SQL LLM->>Database: Execute SQL query + Database-->>SQL LLM: Result rows + SQL LLM->>Viz LLM: Data + user question + Viz LLM->>Viz LLM: Generate matplotlib code + Viz LLM->>Executor: Execute code + Executor-->>Viz LLM: PNG image (base64) + Viz LLM-->>User: Text response + chart image ``` ### Execution Flow @@ -211,9 +124,3 @@ These prompts trigger visualization: | "Create a pie chart of transaction types" | Pie chart | | "Graph the distribution of order values" | Histogram | | "Compare Q1 vs Q2 performance" | Grouped bar | - -## Further Reading - -- [Azure Container Apps Dynamic Sessions](https://learn.microsoft.com/azure/container-apps/sessions) -- [Session Pool Management](https://learn.microsoft.com/azure/container-apps/sessions-code-interpreter) -- [LangChain Azure Dynamic Sessions](https://python.langchain.com/docs/integrations/tools/azure_dynamic_sessions) diff --git a/src/data_agent/executors/__init__.py b/src/data_agent/executors/__init__.py index 2db9355..cbfb995 100644 --- a/src/data_agent/executors/__init__.py +++ b/src/data_agent/executors/__init__.py @@ -31,10 +31,7 @@ def create_executor() -> CodeExecutor: return AzureSessionsExecutor( pool_management_endpoint=settings.azure_sessions_pool_endpoint, ) - else: - from data_agent.executors.local import LocalExecutor + from data_agent.executors.local import LocalExecutor - logger.info( - "Using local Python REPL executor (development only, no sandboxing)" - ) - return LocalExecutor() + logger.info("Using local Python REPL executor (development only, no sandboxing)") + return LocalExecutor() diff --git 
a/src/data_agent/nodes/visualization.py b/src/data_agent/nodes/visualization.py index d96b253..5822ec0 100644 --- a/src/data_agent/nodes/visualization.py +++ b/src/data_agent/nodes/visualization.py @@ -60,7 +60,7 @@ async def generate_visualization(self, state: "AgentState") -> dict[str, Any]: messages = [ SystemMessage(content=VISUALIZATION_SYSTEM_PROMPT), HumanMessage( - content=f"""User question: {state['question']} + content=f"""User question: {state["question"]} Data columns: {columns} Data ({len(data)} rows): @@ -107,19 +107,13 @@ async def generate_visualization(self, state: "AgentState") -> dict[str, Any]: ) ], } - else: - output_preview = ( - exec_result.output[:500] if exec_result.output else "" - ) - logger.error( - "Code executed but no image output: %s", output_preview - ) - return { - "visualization_error": f"Code executed but no image: {exec_result.output[:200]}" - } - else: - logger.error("Code execution failed: %s", exec_result.error) - return {"visualization_error": exec_result.error} + output_preview = exec_result.output[:500] if exec_result.output else "" + logger.error("Code executed but no image output: %s", output_preview) + return { + "visualization_error": f"Code executed but no image: {exec_result.output[:200]}" + } + logger.error("Code execution failed: %s", exec_result.error) + return {"visualization_error": exec_result.error} except Exception as e: logger.exception("Visualization code execution failed")