Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,10 @@ python -m dash # CLI mode
|----------|----------|-------------|
| `OPENAI_API_KEY` | Yes | OpenAI API key |
| `EXA_API_KEY` | No | Web search for external knowledge |
| `DB_*` | No | Database config (defaults to localhost) |
| `DB_*` | No | Internal Dash DB config for knowledge/learnings/AgentOS |
| `ANALYTICS_DB_*` | No | Optional analytics DB URLs (`ANALYTICS_DB_<NAME>`) and descriptions (`ANALYTICS_DB_<NAME>_DESC`) |

If no `ANALYTICS_DB_*` variables are set, Dash keeps the original single-database behavior and uses the internal DB as the analytics source.

## Further Reading

Expand Down
6 changes: 6 additions & 0 deletions compose.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
services:
# Internal database used by Dash for knowledge, learnings, and AgentOS state.
dash-db:
image: agnohq/pgvector:18
container_name: dash-db
Expand Down Expand Up @@ -31,11 +32,16 @@ services:
DATA_DIR: /data
RUNTIME_ENV: dev
AGNO_DEBUG: "True"
# Internal Dash DB (knowledge, learnings, AgentOS)
DB_HOST: dash-db
DB_PORT: 5432
DB_USER: ${DB_USER:-ai}
DB_PASS: ${DB_PASS:-ai}
DB_DATABASE: ${DB_DATABASE:-ai}
# Optional analytics DBs (read-only by credentials):
# ANALYTICS_DB_MAIN=postgresql+psycopg://user:pass@host:5432/dbname
# ANALYTICS_DB_MAIN_DESC=F1 sample data
# If ANALYTICS_DB_* is not set, Dash uses the internal DB as the single analytics source.
WAIT_FOR_DB: "True"
PRINT_ENV_ON_LOAD: "True"
OPENAI_API_KEY: ${OPENAI_API_KEY}
Expand Down
57 changes: 51 additions & 6 deletions dash/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,22 @@
from agno.models.openai import OpenAIResponses
from agno.tools.mcp import MCPTools
from agno.tools.reasoning import ReasoningTools
from agno.tools.sql import SQLTools
from agno.vectordb.pgvector import PgVector, SearchType

from dash.context.business_rules import BUSINESS_CONTEXT
from dash.context.semantic_model import SEMANTIC_MODEL_STR
from dash.tools import create_introspect_schema_tool, create_save_validated_query_tool
from db import db_url, get_postgres_db
from dash.tools import (
create_analytics_sql_tools,
create_introspect_schema_tool,
create_save_validated_query_tool,
)
from db import (
db_url,
get_analytics_descriptions,
get_analytics_registry,
get_postgres_db,
has_explicit_analytics_dbs,
)

# ============================================================================
# Database & Knowledge
Expand Down Expand Up @@ -62,11 +71,16 @@
# Tools
# ============================================================================

analytics_registry = get_analytics_registry()
analytics_descriptions = get_analytics_descriptions()
explicit_analytics_dbs_configured = has_explicit_analytics_dbs()

save_validated_query = create_save_validated_query_tool(dash_knowledge)
introspect_schema = create_introspect_schema_tool(db_url)
analytics_tools = create_analytics_sql_tools(analytics_registry)
introspect_schema = create_introspect_schema_tool(analytics_registry)

base_tools: list = [
SQLTools(db_url=db_url),
*analytics_tools,
save_validated_query,
introspect_schema,
MCPTools(url=f"https://mcp.exa.ai/mcp?exaApiKey={getenv('EXA_API_KEY', '')}&tools=web_search_exa"),
Expand All @@ -76,6 +90,37 @@
# Instructions
# ============================================================================

def _build_databases_section(
registry: dict[str, str], descriptions: dict[str, str]
) -> str:
"""Build explicit analytics database instructions for the prompt."""
lines = ["## AVAILABLE DATABASES", ""]
lines.append("Use these analytics databases for SQL queries:")
lines.append("")

for name in sorted(registry):
description = descriptions.get(name, "")
suffix = f": {description}" if description else ""
lines.append(f"- **{name}**{suffix}")

lines.append("")
if len(registry) > 1:
lines.append(
"- Always pass `database` to list_tables, describe_table, "
"run_sql_query, and introspect_schema."
)
lines.append("- If the request is ambiguous, ask which database to use.")

return "\n".join(lines)


# Render the analytics-databases prompt section only when the operator has
# explicitly configured ANALYTICS_DB_* databases; otherwise leave it empty so
# the single-database prompt layout is unchanged.
if explicit_analytics_dbs_configured:
    DATABASES_SECTION = _build_databases_section(
        analytics_registry, analytics_descriptions
    )
else:
    DATABASES_SECTION = ""

# Pre-joined block (section + markdown divider) so the INSTRUCTIONS f-string
# collapses to nothing when no section was built.
if DATABASES_SECTION:
    DATABASES_SECTION_BLOCK = f"{DATABASES_SECTION}\n\n---\n\n"
else:
    DATABASES_SECTION_BLOCK = ""

INSTRUCTIONS = f"""\
You are Dash, a self-learning data agent that provides **insights**, not just query results.

Expand Down Expand Up @@ -151,7 +196,7 @@

---

## SEMANTIC MODEL
{DATABASES_SECTION_BLOCK}## SEMANTIC MODEL

{SEMANTIC_MODEL_STR}
---
Expand Down
2 changes: 2 additions & 0 deletions dash/context/business_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def build_business_context(business_dir: Path | None = None) -> str:
lines.append("## METRICS\n")
for m in business["metrics"]:
lines.append(f"**{m.get('name', 'Unknown')}**: {m.get('definition', '')}")
if m.get("database"):
lines.append(f" - Database: `{m['database']}`")
if m.get("table"):
lines.append(f" - Table: `{m['table']}`")
if m.get("calculation"):
Expand Down
26 changes: 25 additions & 1 deletion dash/context/semantic_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def load_table_metadata(tables_dir: Path | None = None) -> list[dict[str, Any]]:
"description": table.get("table_description", ""),
"use_cases": table.get("use_cases", []),
"data_quality_notes": table.get("data_quality_notes", [])[:MAX_QUALITY_NOTES],
"database": table.get("database"),
}
)
except (json.JSONDecodeError, KeyError, OSError) as e:
Expand All @@ -45,9 +46,32 @@ def build_semantic_model(tables_dir: Path | None = None) -> dict[str, Any]:

def format_semantic_model(model: dict[str, Any]) -> str:
"""Format semantic model for system prompt."""
tables = model.get("tables", [])
if not tables:
return ""

grouped: dict[str | None, list[dict[str, Any]]] = {}
for table in tables:
grouped.setdefault(table.get("database"), []).append(table)

lines: list[str] = []

for table in model.get("tables", []):
for database in sorted(key for key in grouped if key is not None):
lines.append(f"### Database: **{database}**")
lines.append("")
for table in grouped[database]:
lines.append(f"#### {table['table_name']}")
if table.get("description"):
lines.append(table["description"])
if table.get("use_cases"):
lines.append(f"**Use cases:** {', '.join(table['use_cases'])}")
if table.get("data_quality_notes"):
lines.append("**Data quality:**")
for note in table["data_quality_notes"]:
lines.append(f" - {note}")
lines.append("")

for table in grouped.get(None, []):
lines.append(f"### {table['table_name']}")
if table.get("description"):
lines.append(table["description"])
Expand Down
8 changes: 8 additions & 0 deletions dash/knowledge/business/metrics.json
Original file line number Diff line number Diff line change
@@ -1,48 +1,56 @@
{
"metrics": [
{
"database": "main",
"name": "Race Win",
"definition": "A driver finishing in first position in a race",
"table": "race_wins",
"calculation": "COUNT(*) from race_wins grouped by driver name"
},
{
"database": "main",
"name": "World Championship",
"definition": "A driver finishing the season in first position in the drivers championship",
"table": "drivers_championship",
"calculation": "COUNT(*) from drivers_championship WHERE position = '1' (TEXT comparison)"
},
{
"database": "main",
"name": "Constructors Championship",
"definition": "A team finishing the season in first position in the constructors championship",
"table": "constructors_championship",
"calculation": "COUNT(*) from constructors_championship WHERE position = 1 (INTEGER comparison)"
},
{
"database": "main",
"name": "Podium Finish",
"definition": "A driver finishing in positions 1, 2, or 3 in a race",
"table": "race_results",
"calculation": "COUNT(*) from race_results WHERE position IN ('1', '2', '3')"
},
{
"database": "main",
"name": "Fastest Lap",
"definition": "Recording the fastest lap time during a race",
"table": "fastest_laps",
"calculation": "COUNT(*) from fastest_laps grouped by driver name"
},
{
"database": "main",
"name": "DNF (Did Not Finish)",
"definition": "A driver who retired from a race before completion",
"table": "race_results",
"calculation": "COUNT(*) from race_results WHERE position = 'Ret'"
},
{
"database": "main",
"name": "Points Finish",
"definition": "A driver finishing in a points-scoring position",
"table": "race_results",
"calculation": "COUNT(*) from race_results WHERE points > 0"
},
{
"database": "main",
"name": "Championship Points",
"definition": "Total points accumulated over a season",
"table": "drivers_championship or constructors_championship",
Expand Down
1 change: 1 addition & 0 deletions dash/knowledge/tables/constructors_championship.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"database": "main",
"table_name": "constructors_championship",
"table_description": "Contains data for the constructor's championship from 1958 to 2020, capturing championship positions from when it was introduced.",
"use_cases": [
Expand Down
1 change: 1 addition & 0 deletions dash/knowledge/tables/drivers_championship.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"database": "main",
"table_name": "drivers_championship",
"table_description": "Contains data for driver's championship standings from 1950-2020, detailing driver positions, teams, and points.",
"use_cases": [
Expand Down
1 change: 1 addition & 0 deletions dash/knowledge/tables/fastest_laps.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"database": "main",
"table_name": "fastest_laps",
"table_description": "Contains data for the fastest laps recorded in races from 1950-2020, including driver and team details.",
"use_cases": [
Expand Down
1 change: 1 addition & 0 deletions dash/knowledge/tables/race_results.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"database": "main",
"table_name": "race_results",
"table_description": "Holds comprehensive race data for each Formula 1 race from 1950-2020, including positions, drivers, teams, and points.",
"use_cases": [
Expand Down
1 change: 1 addition & 0 deletions dash/knowledge/tables/race_wins.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"database": "main",
"table_name": "race_wins",
"table_description": "Documents race win data from 1950-2020, detailing venue, winner, team, and race duration.",
"use_cases": [
Expand Down
47 changes: 43 additions & 4 deletions dash/scripts/load_data.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
"""
Load F1 Data - Downloads F1 data (1950-2020) and loads into PostgreSQL.
Load F1 Data - Downloads F1 data (1950-2020) and loads into an analytics database.

Usage: python -m dash.scripts.load_data
Usage:
python -m dash.scripts.load_data
python -m dash.scripts.load_data --database main
"""

import argparse
from io import StringIO

import httpx
import pandas as pd
from sqlalchemy import create_engine

from db import db_url
from db import get_analytics_registry

S3_URI = "https://agno-public.s3.amazonaws.com/f1"

Expand All @@ -23,7 +26,43 @@
}

if __name__ == "__main__":
engine = create_engine(db_url)
parser = argparse.ArgumentParser(
description="Load F1 sample data into an analytics database"
)
parser.add_argument(
"--database",
type=str,
default=None,
help=(
"Logical analytics DB name. Required when multiple analytics databases "
"are configured."
),
)
args = parser.parse_args()

registry = get_analytics_registry()
is_single_db = len(registry) == 1

if is_single_db:
db_name = next(iter(registry))
target_url = registry[db_name]
print(f"Target database: {db_name} (single database mode)\n")
else:
if not args.database:
print("Error: Multiple analytics databases configured.")
print("Pass --database with one of: " + ", ".join(sorted(registry)))
raise SystemExit(1)

db_name = args.database.lower()
if db_name not in registry:
print(f"Error: Unknown database '{args.database}'.")
print("Available: " + ", ".join(sorted(registry)))
raise SystemExit(1)

target_url = registry[db_name]
print(f"Target database: {db_name}\n")

engine = create_engine(target_url)
total = 0

for table, url in TABLES.items():
Expand Down
2 changes: 2 additions & 0 deletions dash/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

from dash.tools.introspect import create_introspect_schema_tool
from dash.tools.save_query import create_save_validated_query_tool
from dash.tools.sql import create_analytics_sql_tools

# Public factory helpers re-exported as the dash.tools package API.
__all__ = [
    "create_analytics_sql_tools",
    "create_introspect_schema_tool",
    "create_save_validated_query_tool",
]
Loading