diff --git a/.github/workflows/evaluate-agent.yml b/.github/workflows/evaluate-agent.yml index aa18aa9..a12bf0c 100644 --- a/.github/workflows/evaluate-agent.yml +++ b/.github/workflows/evaluate-agent.yml @@ -82,4 +82,4 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, body: body - }); + }); \ No newline at end of file diff --git a/docs/02-prompt-management.md b/docs/02-prompt-management.md index 65ee386..fdf84e8 100644 --- a/docs/02-prompt-management.md +++ b/docs/02-prompt-management.md @@ -103,8 +103,8 @@ Now you'll use the Azure Developer CLI to deploy all required Azure resources. 1. Add the agent configuration to your `.env` file: ``` - AGENT_NAME=trail-guide - MODEL_NAME=gpt-4.1 + AGENT_NAME="trail-guide" + MODEL_NAME="gpt-4.1" ``` ### Install Python dependencies diff --git a/docs/03-design-optimize-prompts.md b/docs/03-design-optimize-prompts.md index 7658cb8..ad4bd9b 100644 --- a/docs/03-design-optimize-prompts.md +++ b/docs/03-design-optimize-prompts.md @@ -127,8 +127,8 @@ With your Azure resources deployed, install the required Python packages. Open the `.env` file in your repository root and add: ``` - AGENT_NAME=trail-guide - MODEL_NAME=gpt-4.1 + AGENT_NAME="trail-guide" + MODEL_NAME="gpt-4.1" ``` ## Understand the experimental workflow @@ -232,7 +232,7 @@ The baseline provides: Create your baseline evaluation scores. -1. Create `experiments/baseline/evaluation.csv`: +1. Check whether `experiments/baseline/evaluation.csv` already exists; if it does not, create it: ```csv test_prompt,agent_response_excerpt,intent_resolution,relevance,groundedness,comments @@ -404,7 +404,7 @@ Review the agent responses and create an evaluation CSV with quality scores. New-Item experiments/optimized-concise/evaluation.csv ``` -1. Open the file in VS Code and add the CSV header and scores: +1. 
Open the file in VS Code and verify or add the CSV header and scores: ```csv test_prompt,agent_response_excerpt,intent_resolution,relevance,groundedness,comments diff --git a/docs/04-automated-evaluation.md b/docs/04-automated-evaluation.md index 0bbab31..5ca61e8 100644 --- a/docs/04-automated-evaluation.md +++ b/docs/04-automated-evaluation.md @@ -318,6 +318,16 @@ Execute the complete evaluation pipeline with one command. > **Note**: Evaluation runtime varies based on dataset size and model capacity. 200 items typically takes 5-15 minutes. +1. **Commit the results file** + + The script writes a summary to `evaluation_results.txt` in your project root. Commit this file so the GitHub Actions workflow can read it when it runs on your PR: + + ```powershell + git add evaluation_results.txt + git commit -m "Add evaluation results" + git push + ``` + ### Automate with GitHub Actions The evaluation script integrates seamlessly into GitHub Actions for automated PR evaluations. diff --git a/evaluation_results.txt b/evaluation_results.txt new file mode 100644 index 0000000..c0603f0 --- /dev/null +++ b/evaluation_results.txt @@ -0,0 +1,75 @@ +================================================================================ + Trail Guide Agent - Cloud Evaluation +================================================================================ + +Configuration: + Project: https://ai-account-u2favsrdpp24k.services.ai.azure.com/api/projects/ai-project-dev-tester + Model: gpt-4.1 + Dataset: trail-guide-evaluation-dataset (v1) + +================================================================================ +Step 1: Uploading evaluation dataset +================================================================================ + +Dataset: trail_guide_evaluation_dataset.jsonl +Uploading... + + Dataset version 1 already exists in Foundry. + Retrieving existing dataset ID... 
+ ✓ Using existing dataset + Dataset ID: azureai://accounts/ai-account-u2favsrdpp24k/projects/ai-project-dev-tester/data/trail-guide-evaluation-dataset/versions/1 + +================================================================================ +Step 2: Creating evaluation definition +================================================================================ + +Configuration: + Judge Model: gpt-4.1 + Evaluators: Intent Resolution, Relevance, Groundedness + +Creating evaluation... + +✓ Evaluation definition created + Evaluation ID: eval_7d81e353fce74ab3a51105fac7f59ed8 + +================================================================================ +Step 3: Running cloud evaluation +================================================================================ + +✓ Evaluation run started + Run ID: evalrun_31596933faa44251a62247465b5d8fbd + Status: in_progress + +This may take 5-10 minutes for 200 items... + +================================================================================ +Step 4: Polling for completion +================================================================================ + [3675s] Status: in_progress + +✓ Evaluation completed in 3694 seconds + +================================================================================ +Step 5: Retrieving results +================================================================================ + +Evaluation Summary + Report URL: https://ai.azure.com/nextgen/r/ZDCVYALRRAyvc0GrsS4tgA,rg-dev-tester,,ai-account-u2favsrdpp24k,ai-project-dev-tester/build/evaluations/eval_7d81e353fce74ab3a51105fac7f59ed8/run/evalrun_31596933faa44251a62247465b5d8fbd + + [DEBUG] First item type : + [DEBUG] First item attrs: ['construct', 'copy', 'created_at', 'datasource_item', 'datasource_item_id', 'dict', 'eval_id', 'from_orm', 'id', 'json', 'model_computed_fields', 'model_config', 'model_construct', 'model_copy', 'model_dump', 'model_dump_json', 'model_extra', 'model_fields', 'model_fields_set', 
'model_json_schema', 'model_parametrized_name', 'model_post_init', 'model_rebuild', 'model_validate', 'model_validate_json', 'model_validate_strings', 'object', 'parse_file', 'parse_obj', 'parse_raw', 'results', 'run_id', 'sample', 'schema', 'schema_json', 'status', 'to_dict', 'to_json', 'update_forward_refs', 'validate'] + [DEBUG] item.__dict__: {'id': '1', 'created_at': 1772201992, 'datasource_item': {'query': 'What essential gear do I need for a summer day hike?', 'response': 'For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out.', 'ground_truth': 'Essential day hike gear includes footwear, water, food, sun protection, navigation tools, first aid, and emergency supplies.'}, 'datasource_item_id': 0, 'eval_id': 'eval_7d81e353fce74ab3a51105fac7f59ed8', 'object': 'eval.run.output_item', 'results': [Result(name='intent_resolution', passed=True, score=5.0, sample={'usage': {'prompt_tokens': 1984, 'completion_tokens': 61, 'total_tokens': 2045}, 'finish_reason': 'stop', 'model': 'gpt-4.1-2025-04-14', 'input': [{'role': 'user', 'content': '{"query": "What essential gear do I need for a summer day hike?", "response": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. 
Always check the weather forecast and trail conditions before heading out.", "tool_definitions": null}'}], 'output': [{'role': 'assistant', 'content': '{\n "explanation": "The user asked for essential gear for a summer day hike. The agent provided a thorough, accurate, and relevant list, including safety, hydration, navigation, and sun protection, fully resolving the user\'s intent with no notable omissions.",\n "score": 5\n}'}]}, type='azure_ai_evaluator', metric='intent_resolution', label='pass', reason="The user asked for essential gear for a summer day hike. The agent provided a thorough, accurate, and relevant list, including safety, hydration, navigation, and sun protection, fully resolving the user's intent with no notable omissions.", threshold=3), Result(name='relevance', passed=True, score=5.0, sample={'usage': {'prompt_tokens': 1678, 'completion_tokens': 63, 'total_tokens': 1741}, 'finish_reason': 'stop', 'model': 'gpt-4.1-2025-04-14', 'input': [{'role': 'user', 'content': '{"query": "What essential gear do I need for a summer day hike?", "response": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out."}'}], 'output': [{'role': 'assistant', 'content': '{\n "explanation": "The response provides a thorough list of essential gear for a summer day hike, including clothing, hydration, navigation, safety, and sun protection. 
It also adds practical advice about checking weather and trail conditions, offering both completeness and useful context.",\n "score": 5\n}'}]}, type='azure_ai_evaluator', metric='relevance', label='pass', reason='The response provides a thorough list of essential gear for a summer day hike, including clothing, hydration, navigation, safety, and sun protection. It also adds practical advice about checking weather and trail conditions, offering both completeness and useful context.', threshold=3), Result(name='groundedness', passed=True, score=5.0, sample={'usage': {'prompt_tokens': 1571, 'completion_tokens': 106, 'total_tokens': 1677}, 'finish_reason': 'stop', 'model': 'gpt-4.1-2025-04-14', 'input': [{'role': 'user', 'content': '{"query": "What essential gear do I need for a summer day hike?", "response": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out.", "context": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out."}'}], 'output': [{'role': 'assistant', 'content': "Let's think step by step: The query asks for the essential gear needed for a summer day hike. The context provides a detailed list of essential gear and some additional advice. 
The response repeats the context almost verbatim, listing all the gear and advice without omitting or adding any information. There are no inaccuracies or missing details.\nThe response is fully accurate, complete, and directly grounded in the provided context, addressing the query thoroughly.\n5"}]}, type='azure_ai_evaluator', metric='groundedness', label='pass', reason='The response is fully accurate, complete, and directly grounded in the provided context, addressing the query thoroughly.', threshold=3)], 'run_id': 'evalrun_31596933faa44251a62247465b5d8fbd', 'sample': Sample(error=None, finish_reason='stop', input=[SampleInput(content='{"query": "What essential gear do I need for a summer day hike?", "response": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out.", "context": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out."}', role='user')], max_completion_tokens=None, model='gpt-4.1-2025-04-14', output=[SampleOutput(content="Let's think step by step: The query asks for the essential gear needed for a summer day hike. The context provides a detailed list of essential gear and some additional advice. 
The response repeats the context almost verbatim, listing all the gear and advice without omitting or adding any information. There are no inaccuracies or missing details.\nThe response is fully accurate, complete, and directly grounded in the provided context, addressing the query thoroughly.\n5", role='assistant')], seed=None, temperature=None, top_p=None, usage=SampleUsage(cached_tokens=None, completion_tokens=106, prompt_tokens=1571, total_tokens=1677)), 'status': 'completed'} +================================================================================ + Trail Guide Agent - Evaluation Results +================================================================================ + + Report URL : https://ai.azure.com/nextgen/r/ZDCVYALRRAyvc0GrsS4tgA,rg-dev-tester,,ai-account-u2favsrdpp24k,ai-project-dev-tester/build/evaluations/eval_7d81e353fce74ab3a51105fac7f59ed8/run/eva + +================================================================================ +Cloud evaluation complete +================================================================================ + +Next steps: + 1. Review detailed results in Azure AI Foundry portal + 2. Analyze patterns in successful and failed evaluations + 3. 
Commit evaluation_results.txt and push so the PR workflow can use it \ No newline at end of file diff --git a/experiments/baseline/evaluation.csv b/experiments/baseline/evaluation.csv deleted file mode 100644 index a45bad5..0000000 --- a/experiments/baseline/evaluation.csv +++ /dev/null @@ -1,6 +0,0 @@ -test_prompt,agent_response_excerpt,intent_resolution,relevance,groundedness,comments -day-hike-gear,"For a summer day hike in moderate terrain, essential gear includes...",5,5,4,Clear and comprehensive gear list with good detail -overnight-camping,"For your first overnight camping trip, start with essential shelter...",4,4,5,Good beginner advice with appropriate safety focus -three-day-backpacking,"Critical safety considerations for October mountain backpacking include...",5,5,5,Excellent detailed safety guidance for extended trips -winter-hiking,"Winter hiking requires significantly more gear and precautions than summer...",4,4,4,Good comparison of seasonal differences -trail-difficulty,"To assess trail difficulty for your fitness level as a beginner...",5,5,5,Helpful framework for self-assessment with practical tips diff --git a/experiments/gpt41mini/evaluation.csv b/experiments/gpt41mini/evaluation.csv deleted file mode 100644 index f10be2f..0000000 --- a/experiments/gpt41mini/evaluation.csv +++ /dev/null @@ -1,6 +0,0 @@ -test_prompt,agent_response_excerpt,intent_resolution,relevance,groundedness,depth,comments -day-hike-gear,"For summer day hikes on moderate terrain you'll need...",4,4,4,3,Good coverage of essentials but less detailed than GPT-4 -overnight-camping,"First overnight camping essentials include a tent...",4,4,4,3,Solid advice with key items covered - slightly less nuanced -three-day-backpacking,"October mountain safety priorities include weather monitoring...",4,4,4,4,Good safety guidance with appropriate level of detail -winter-hiking,"Winter hiking requires additional insulation layers and safety gear...",4,4,4,3,Clear comparison of seasonal 
differences with adequate detail -trail-difficulty,"To assess trail difficulty as a beginner consider distance and elevation...",4,4,4,4,Practical framework provided with actionable criteria diff --git a/experiments/optimized-concise/evaluation.csv b/experiments/optimized-concise/evaluation.csv deleted file mode 100644 index 60ba956..0000000 --- a/experiments/optimized-concise/evaluation.csv +++ /dev/null @@ -1,6 +0,0 @@ -test_prompt,agent_response_excerpt,intent_resolution,relevance,groundedness,token_reduction,comments -day-hike-gear,"Essential gear for summer moderate terrain day hikes includes...",5,5,4,45%,Maintains quality with significantly reduced verbosity -overnight-camping,"First overnight camping essentials: shelter, sleeping, cooking...",4,5,5,42%,More direct communication while preserving safety focus -three-day-backpacking,"October mountain backpacking safety priorities: weather prep, gear redundancy...",5,5,5,38%,Excellent concise safety guidance without losing detail -winter-hiking,"Winter additions: insulation layers, backup heat, avalanche gear...",5,4,4,47%,Clear comparison without excess explanation -trail-difficulty,"Beginner trail difficulty assessment factors: distance, elevation gain, technical skill...",5,5,5,40%,Helpful and immediately actionable guidance diff --git a/infra/core/ai/ai-project.bicep b/infra/core/ai/ai-project.bicep index f41f4ee..c309fa9 100644 --- a/infra/core/ai/ai-project.bicep +++ b/infra/core/ai/ai-project.bicep @@ -76,67 +76,50 @@ module applicationInsights '../monitor/applicationinsights.bicep' = if (enableMo resource aiAccount 'Microsoft.CognitiveServices/accounts@2025-06-01' = { name: !empty(existingAiAccountName) ? 
existingAiAccountName : 'ai-account-${resourceToken}' location: location - tags: tags - sku: { - name: 'S0' - } kind: 'AIServices' - identity: { - type: 'SystemAssigned' - } + sku: { name: 'S0' } + identity: { type: 'SystemAssigned' } properties: { allowProjectManagement: true customSubDomainName: !empty(existingAiAccountName) ? existingAiAccountName : 'ai-account-${resourceToken}' + publicNetworkAccess: 'Enabled' + disableLocalAuth: true networkAcls: { defaultAction: 'Allow' virtualNetworkRules: [] ipRules: [] } - publicNetworkAccess: 'Enabled' - disableLocalAuth: true } - - @batchSize(1) - resource seqDeployments 'deployments' = [ - for dep in (deployments ?? []): { - name: dep.name - sku: dep.sku - properties: { - model: dep.model - } - } - ] +} - resource project 'projects' = { - name: aiFoundryProjectName - location: location - identity: { - type: 'SystemAssigned' - } - properties: { - description: '${aiFoundryProjectName} Project' - displayName: '${aiFoundryProjectName}Project' - } - dependsOn: [ - seqDeployments - ] +// NEW: separate project resource, matching docs +resource project 'Microsoft.CognitiveServices/accounts/projects@2025-06-01' = { + name: aiFoundryProjectName + parent: aiAccount + location: location + identity: { + type: 'SystemAssigned' } - - resource aiFoundryAccountCapabilityHost 'capabilityHosts@2025-10-01-preview' = if (enableHostedAgents) { - name: 'agents' - properties: { - capabilityHostKind: 'Agents' - // IMPORTANT: this is required to enable hosted agents deployment - // if no BYO Net is provided - enablePublicHostingEnvironment: true - } + properties: { + description: '${aiFoundryProjectName} Project' + displayName: '${aiFoundryProjectName}Project' } } +resource aiFoundryAccountCapabilityHost 'Microsoft.CognitiveServices/accounts/capabilityHosts@2025-10-01-preview' = if (enableHostedAgents) { + name: 'agents' + parent: aiAccount + properties: { + capabilityHostKind: 'Agents' + // IMPORTANT: this is required to enable hosted agents 
deployment + // if no BYO Net is provided + enablePublicHostingEnvironment: true + } +} // Create connection towards appinsights -resource appInsightConnection 'Microsoft.CognitiveServices/accounts/projects/connections@2025-04-01-preview' = { - parent: aiAccount::project +resource appInsightConnection 'Microsoft.CognitiveServices/accounts/projects/connections@2025-04-01-preview' = if (enableMonitoring) { + parent: project name: 'appi-connection' properties: { category: 'AppInsights' @@ -158,7 +141,7 @@ module aiConnections './connection.bicep' = [for (connection, index) in connecti name: 'connection-${connection.name}' params: { aiServicesAccountName: aiAccount.name - aiProjectName: aiAccount::project.name + aiProjectName: project.name connectionConfig: { name: connection.name category: connection.category @@ -191,9 +174,9 @@ resource localUserCognitiveServicesUserRoleAssignment 'Microsoft.Authorization/r resource projectCognitiveServicesUserRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { scope: aiAccount - name: guid(subscription().id, resourceGroup().id, aiAccount::project.name, '53ca6127-db72-4b80-b1b0-d745d6d5456d') + name: guid(subscription().id, resourceGroup().id, project.name, '53ca6127-db72-4b80-b1b0-d745d6d5456d') properties: { - principalId: aiAccount::project.identity.principalId + principalId: project.identity.principalId principalType: 'ServicePrincipal' roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', '53ca6127-db72-4b80-b1b0-d745d6d5456d') } @@ -214,7 +197,7 @@ module storage '../storage/storage.bicep' = if (hasStorageConnection) { principalId: principalId principalType: principalType aiServicesAccountName: aiAccount.name - aiProjectName: aiAccount::project.name + aiProjectName: project.name } } @@ -229,7 +212,7 @@ module acr '../host/acr.bicep' = if (hasAcrConnection) { principalId: principalId principalType: principalType aiServicesAccountName: aiAccount.name - aiProjectName: aiAccount::project.name + 
aiProjectName: project.name } } @@ -241,7 +224,7 @@ module bingGrounding '../search/bing_grounding.bicep' = if (hasBingConnection) { resourceName: 'bing-${resourceToken}' connectionName: bingConnectionName aiServicesAccountName: aiAccount.name - aiProjectName: aiAccount::project.name + aiProjectName: project.name } } @@ -253,7 +236,7 @@ module bingCustomGrounding '../search/bing_custom_grounding.bicep' = if (hasBing resourceName: 'bingcustom-${resourceToken}' connectionName: bingCustomConnectionName aiServicesAccountName: aiAccount.name - aiProjectName: aiAccount::project.name + aiProjectName: project.name } } @@ -267,7 +250,7 @@ module azureAiSearch '../search/azure_ai_search.bicep' = if (hasSearchConnection storageAccountResourceId: hasStorageConnection ? storage!.outputs.storageAccountId : '' containerName: 'knowledge' aiServicesAccountName: aiAccount.name - aiProjectName: aiAccount::project.name + aiProjectName: project.name principalId: principalId principalType: principalType location: location @@ -275,16 +258,38 @@ module azureAiSearch '../search/azure_ai_search.bicep' = if (hasSearchConnection } +// Deploy model deployments on the AI account. +// Each entry in the `deployments` parameter becomes a real deployment that +// can be referenced by name from the project (e.g. "gpt-4.1", "gpt-4.1-mini"). +@batchSize(1) // Deploy one at a time to avoid capacity conflicts +resource modelDeployments 'Microsoft.CognitiveServices/accounts/deployments@2025-06-01' = [ + for deployment in (deployments ?? []): { + parent: aiAccount + name: deployment.name + sku: { + name: deployment.sku.name + capacity: deployment.sku.capacity + } + properties: { + model: { + format: deployment.model.format + name: deployment.model.name + version: deployment.model.?version ?? 
null + } + } + } +] + // Outputs -output AZURE_AI_PROJECT_ENDPOINT string = aiAccount::project.properties.endpoints['AI Foundry API'] +output AZURE_AI_PROJECT_ENDPOINT string = project.properties.endpoints['AI Foundry API'] output AZURE_OPENAI_ENDPOINT string = aiAccount.properties.endpoints['OpenAI Language Model Instance API'] output aiServicesEndpoint string = aiAccount.properties.endpoint output accountId string = aiAccount.id -output projectId string = aiAccount::project.id +output projectId string = project.id output aiServicesAccountName string = aiAccount.name -output aiServicesProjectName string = aiAccount::project.name +output aiServicesProjectName string = project.name output aiServicesPrincipalId string = aiAccount.identity.principalId -output projectName string = aiAccount::project.name +output projectName string = project.name output APPLICATIONINSIGHTS_CONNECTION_STRING string = applicationInsights.outputs.connectionString // Grouped dependent resources outputs diff --git a/infra/main.parameters.json b/infra/main.parameters.json index de56a9d..f4b69dd 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -36,7 +36,7 @@ "value": "${ENABLE_MONITORING=true}" }, "enableHostedAgents": { - "value": "${ENABLE_HOSTED_AGENTS=false}" + "value": "${ENABLE_HOSTED_AGENTS=true}" } } } diff --git a/src/agents/trail_guide_agent/trail_guide_agent.py b/src/agents/trail_guide_agent/trail_guide_agent.py index 04a951c..c140e07 100644 --- a/src/agents/trail_guide_agent/trail_guide_agent.py +++ b/src/agents/trail_guide_agent/trail_guide_agent.py @@ -5,13 +5,11 @@ from azure.ai.projects import AIProjectClient from azure.ai.projects.models import PromptAgentDefinition -# Load environment variables from repository root -repo_root = Path(__file__).parent.parent.parent.parent -env_file = repo_root / '.env' -load_dotenv(env_file) +# Load environment variables from .env file +load_dotenv() # Read instructions from prompt file -prompt_file = 
Path(__file__).parent / 'prompts' / 'v2_instructions.txt' +prompt_file = Path(__file__).parent / 'prompts' / 'v4_optimized_concise.txt' with open(prompt_file, 'r') as f: instructions = f.read().strip() diff --git a/src/evaluators/evaluate_agent.py b/src/evaluators/evaluate_agent.py index d112992..bfeffb7 100644 --- a/src/evaluators/evaluate_agent.py +++ b/src/evaluators/evaluate_agent.py @@ -23,86 +23,151 @@ SourceFileID, ) -# Load environment variables -load_dotenv() -endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT") +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +load_dotenv() # reads variables from the .env file in your project root + +endpoint = os.environ.get("AZURE_AI_PROJECT_ENDPOINT") model_deployment_name = os.environ.get("MODEL_NAME", "gpt-4.1") -dataset_name = "trail-guide-evaluation-dataset" -dataset_version = "1" +dataset_name = "trail-guide-evaluation-dataset" +dataset_version = "1" + +# The script writes a plain-text summary here when it finishes. +# This file is committed to the branch so the GitHub Actions workflow +# can read it and post results as a PR comment — no re-running needed. 
+RESULTS_FILE = Path("evaluation_results.txt") if not endpoint: - print("Error: AZURE_AI_PROJECT_ENDPOINT environment variable not set") + print("ERROR: AZURE_AI_PROJECT_ENDPOINT is not set.") + print(" Add it to your .env file and try again.") sys.exit(1) -# Initialize project client +# --------------------------------------------------------------------------- +# Azure clients +# --------------------------------------------------------------------------- + +# AIProjectClient connects to your Azure AI Foundry project project_client = AIProjectClient( endpoint=endpoint, credential=DefaultAzureCredential(), ) -# Get OpenAI client for evaluation API +# The OpenAI-compatible client exposes the Evals API client = project_client.get_openai_client() -def upload_dataset(): - """Upload the evaluation dataset to Foundry.""" - print("\n" + "=" * 80) - print("Step 1: Uploading evaluation dataset") - print("=" * 80) - - dataset_path = Path(__file__).parent.parent.parent / "data" / "trail_guide_evaluation_dataset.jsonl" - + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + +def section(title: str) -> None: + """Print a clearly visible section header to make the log easy to skim.""" + print(f"\n{'=' * 80}") + print(f"{title}") + print(f"{'=' * 80}") + + +# --------------------------------------------------------------------------- +# Step 1 – Upload the evaluation dataset +# --------------------------------------------------------------------------- + +def upload_dataset() -> str: + """ + Upload the JSONL evaluation dataset to Azure AI Foundry and return its ID. 
+ + The dataset is a list of JSON objects, each containing: + - query : the user question sent to the agent + - response : the agent's answer + - ground_truth : the expected correct answer (used by some evaluators) + """ + section("Step 1: Uploading evaluation dataset") + + dataset_path = ( + Path(__file__).parent.parent.parent + / "data" + / "trail_guide_evaluation_dataset.jsonl" + ) + if not dataset_path.exists(): - raise FileNotFoundError(f"Dataset not found at {dataset_path}") - + raise FileNotFoundError( + f"Dataset not found at {dataset_path}.\n" + "Make sure you are running the script from the repository root." + ) + print(f"\nDataset: {dataset_path.name}") - print(f"Uploading...") - - data_id = project_client.datasets.upload_file( - name=dataset_name, - version=dataset_version, - file_path=str(dataset_path), - ).id - - print(f"\n✓ Dataset uploaded successfully") + print("Uploading...") + + try: + data_id = project_client.datasets.upload_file( + name=dataset_name, + version=dataset_version, + file_path=str(dataset_path), + ).id + print(f"\n✓ Dataset uploaded successfully") + + except Exception as upload_error: + # If this version was already uploaded in a previous run, reuse it. + # Foundry does not allow uploading the same name+version twice. 
+ if "already exists" in str(upload_error): + print(f"\n Dataset version {dataset_version} already exists in Foundry.") + print(f" Retrieving existing dataset ID...") + dataset_obj = project_client.datasets.get(name=dataset_name, version=dataset_version) + data_id = dataset_obj.id + print(f" ✓ Using existing dataset") + else: + # Any other upload error is unexpected — re-raise so main() catches it + raise + print(f" Dataset ID: {data_id}") - return data_id + +# --------------------------------------------------------------------------- +# Step 2 – Create the evaluation definition +# --------------------------------------------------------------------------- + def create_evaluation_definition(): - """Create the evaluation definition with quality evaluators.""" - print("\n" + "=" * 80) - print("Step 2: Creating evaluation definition") - print("=" * 80) - + """ + Register an evaluation definition in Foundry. + + This tells the platform: + - What data schema to expect (query / response / ground_truth) + - Which built-in evaluators to run and how to map dataset fields to them + + Returns the evaluation object (its ID is needed in later steps). + """ + section("Step 2: Creating evaluation definition") + print(f"\nConfiguration:") print(f" Judge Model: {model_deployment_name}") print(f" Evaluators: Intent Resolution, Relevance, Groundedness") - - # Define data schema (what fields are in our JSONL) + + # Tell Foundry the shape of each record in the dataset data_source_config = DataSourceConfigCustom( type="custom", item_schema={ "type": "object", "properties": { - "query": {"type": "string"}, - "response": {"type": "string"}, + "query": {"type": "string"}, + "response": {"type": "string"}, "ground_truth": {"type": "string"}, }, "required": ["query", "response", "ground_truth"], }, ) - - # Define testing criteria (evaluators) + + # Each entry names a built-in evaluator and maps dataset columns to its + # expected parameters using the {{item.}} template syntax. 
testing_criteria = [ { "type": "azure_ai_evaluator", "name": "intent_resolution", "evaluator_name": "builtin.intent_resolution", - "initialization_parameters": { - "deployment_name": model_deployment_name, - }, + "initialization_parameters": {"deployment_name": model_deployment_name}, "data_mapping": { - "query": "{{item.query}}", + "query": "{{item.query}}", "response": "{{item.response}}", }, }, @@ -110,11 +175,9 @@ def create_evaluation_definition(): "type": "azure_ai_evaluator", "name": "relevance", "evaluator_name": "builtin.relevance", - "initialization_parameters": { - "deployment_name": model_deployment_name, - }, + "initialization_parameters": {"deployment_name": model_deployment_name}, "data_mapping": { - "query": "{{item.query}}", + "query": "{{item.query}}", "response": "{{item.response}}", }, }, @@ -122,35 +185,39 @@ def create_evaluation_definition(): "type": "azure_ai_evaluator", "name": "groundedness", "evaluator_name": "builtin.groundedness", - "initialization_parameters": { - "deployment_name": model_deployment_name, - }, + "initialization_parameters": {"deployment_name": model_deployment_name}, "data_mapping": { - "query": "{{item.query}}", + "query": "{{item.query}}", "response": "{{item.response}}", }, }, ] - - # Create the evaluation definition + print("\nCreating evaluation...") eval_object = client.evals.create( name="Trail Guide Quality Evaluation", data_source_config=data_source_config, testing_criteria=testing_criteria, ) - + print(f"\n✓ Evaluation definition created") print(f" Evaluation ID: {eval_object.id}") - return eval_object + +# --------------------------------------------------------------------------- +# Step 3 – Start the evaluation run +# --------------------------------------------------------------------------- + def run_evaluation(eval_object, data_id): - """Run the evaluation against the uploaded dataset.""" - print("\n" + "=" * 80) - print("Step 3: Running cloud evaluation") - print("=" * 80) - + """ + Start a cloud 
evaluation run that scores every record in the dataset. + + The run is asynchronous — Foundry processes items in parallel in the cloud. + We get a run object immediately; its status starts as 'queued' or 'running'. + """ + section("Step 3: Running cloud evaluation") + eval_run = client.evals.runs.create( eval_id=eval_object.id, name="trail-guide-baseline-eval", @@ -162,148 +229,207 @@ def run_evaluation(eval_object, data_id): ), ), ) - + print(f"\n✓ Evaluation run started") print(f" Run ID: {eval_run.id}") print(f" Status: {eval_run.status}") print(f"\nThis may take 5-10 minutes for 200 items...") - return eval_run + +# --------------------------------------------------------------------------- +# Step 4 – Poll until the run finishes +# --------------------------------------------------------------------------- + def poll_for_results(eval_object, eval_run): - """Poll the evaluation run until complete.""" - print("\n" + "=" * 80) - print("Step 4: Polling for completion") - print("=" * 80) - + """ + Repeatedly check the run status every 10 seconds until it is 'completed'. + + Returns the final run object (which contains the report URL and results). + Exits with code 1 if the run fails so CI pipelines surface the error. + """ + section("Step 4: Polling for completion") + start_time = time.time() while True: run = client.evals.runs.retrieve( run_id=eval_run.id, - eval_id=eval_object.id + eval_id=eval_object.id, ) - + elapsed = int(time.time() - start_time) - status_msg = f" [{elapsed}s] Status: {run.status}" - + if run.status == "completed": - print(f"\n\n✓ Evaluation completed successfully") - print(f" Total time: {elapsed} seconds") + print(f"\n\n✓ Evaluation completed in {elapsed} seconds") break elif run.status == "failed": - print(f"\n\n✗ Evaluation failed") - print(f" Check Azure portal for details") - sys.exit(1) + # Raise so main() catches it, writes RESULTS_FILE, then exits + error_detail = getattr(run, "error", None) or "No additional details available." 
+ raise RuntimeError( + f"Evaluation run failed after {elapsed}s.\n" + f" Eval ID : {eval_object.id}\n" + f" Run ID : {eval_run.id}\n" + f" Error : {error_detail}\n" + f" To inspect: open Azure AI Foundry portal > Evaluations" + ) else: - print(f"{status_msg}", end="\r", flush=True) + # Overwrite the same line so the terminal isn't flooded + print(f" [{elapsed}s] Status: {run.status}", end="\r", flush=True) time.sleep(10) - + return run + +# --------------------------------------------------------------------------- +# Step 5 – Collect scores and save results +# --------------------------------------------------------------------------- + def retrieve_and_display_results(eval_object, run): - """Retrieve evaluation results and display summary.""" - print("\n" + "=" * 80) - print("Step 5: Retrieving results") - print("=" * 80) - - # Get aggregate results + """ + Fetch per-item evaluator outputs, compute aggregate statistics, print a + human-readable summary, and write the same summary to RESULTS_FILE. + + Scores are on a 1-5 scale; a score >= 3 is considered a pass. + + The written file is intended to be committed to the branch so the + GitHub Actions workflow can read it without re-running the evaluation. + + Returns the raw list of output items for any further inspection. 
+ """ + section("Step 5: Retrieving results") + print(f"\nEvaluation Summary") - print(f" Report URL: {run.report_url}") - - # Retrieve detailed output items + + # Retrieve every scored item from the run output_items = list( client.evals.runs.output_items.list( run_id=run.id, - eval_id=eval_object.id + eval_id=eval_object.id, ) ) - - # Calculate statistics - intent_resolution_scores = [] - relevance_scores = [] - groundedness_scores = [] - - for item in output_items: - if hasattr(item, 'evaluator_outputs'): + + # Separate items by status so we can report errors alongside scores + errored_items = [ + item for item in output_items + if getattr(item, "status", None) == "error" + ] + scored_items = [ + item for item in output_items + if getattr(item, "status", None) != "error" + ] + + if errored_items: + print(f"\n ⚠ {len(errored_items)} item(s) errored during evaluation.") + print(f" First error: {getattr(errored_items[0], 'error', 'details unavailable')}") + print(f" Open Azure AI Foundry portal > Evaluations to inspect all failed items.") + + # Collect individual scores grouped by metric name + scores: dict[str, list[float]] = { + "intent_resolution": [], + "relevance": [], + "groundedness": [], + } + + for item in scored_items: + if hasattr(item, "evaluator_outputs"): for output in item.evaluator_outputs: - if output.name == "intent_resolution" and hasattr(output, 'score'): - intent_resolution_scores.append(output.score) - elif output.name == "relevance" and hasattr(output, 'score'): - relevance_scores.append(output.score) - elif output.name == "groundedness" and hasattr(output, 'score'): - groundedness_scores.append(output.score) - - # Display averages - print(f"\nAverage Scores (1-5 scale, threshold: 3)") - if intent_resolution_scores: - avg_intent = sum(intent_resolution_scores) / len(intent_resolution_scores) - print(f" Intent Resolution: {avg_intent:.2f} (n={len(intent_resolution_scores)})") - - if relevance_scores: - avg_relevance = sum(relevance_scores) / 
len(relevance_scores) - print(f" Relevance: {avg_relevance:.2f} (n={len(relevance_scores)})") - - if groundedness_scores: - avg_groundedness = sum(groundedness_scores) / len(groundedness_scores) - print(f" Groundedness: {avg_groundedness:.2f} (n={len(groundedness_scores)})") - - # Calculate pass rates - if intent_resolution_scores: - intent_pass_rate = sum(1 for s in intent_resolution_scores if s >= 3) / len(intent_resolution_scores) * 100 - print(f"\nPass Rates (score >= 3)") - print(f" Intent Resolution: {intent_pass_rate:.1f}%") - - if relevance_scores: - relevance_pass_rate = sum(1 for s in relevance_scores if s >= 3) / len(relevance_scores) * 100 - print(f" Relevance: {relevance_pass_rate:.1f}%") - - if groundedness_scores: - groundedness_pass_rate = sum(1 for s in groundedness_scores if s >= 3) / len(groundedness_scores) * 100 - print(f" Groundedness: {groundedness_pass_rate:.1f}%") - + if output.name in scores and hasattr(output, "score"): + scores[output.name].append(output.score) + + # --- Build summary text (printed to console and written to file) --- + # Everything written to `lines` ends up both on screen and in the file, + # so the file always has useful content regardless of whether scores loaded. 
+ + metric_labels = { + "intent_resolution": "Intent Resolution", + "relevance": "Relevance ", + "groundedness": "Groundedness ", + } + + lines = [ + "=" * 80, + " Trail Guide Agent - Evaluation Results", + "=" * 80, + f"\n Eval ID : {eval_object.id}", + f" Run ID : {run.id}", + f" Total items : {len(output_items)}", + f" Errored items: {len(errored_items)}", + f" Scored items : {len(scored_items)}", + "\nAverage Scores (1-5 scale, threshold: 3)", + ] + + any_scores = False + pass_lines = ["\nPass Rates (score >= 3)"] + + for key, label in metric_labels.items(): + values = scores[key] + if values: + any_scores = True + avg = sum(values) / len(values) + rate = sum(1 for v in values if v >= 3) / len(values) * 100 + lines.append(f" {label}: {avg:.2f} (n={len(values)})") + pass_lines.append(f" {label}: {rate:.1f}%") + + if not any_scores: + # Scores missing — the evaluation may have completed but returned no + # evaluator_outputs. Open the Report URL above to inspect in the portal. + lines.append(" No scores returned — open Azure AI Foundry portal > Evaluations for details.") + pass_lines.append(" No scores returned.") + + lines.extend(pass_lines) + summary = "\n".join(lines) + + print(summary) + + # Write to file so the GitHub Actions 'comment' job can read it directly + RESULTS_FILE.write_text(summary, encoding="utf-8") + print(f"\n Results saved to {RESULTS_FILE}") + print(f" Commit this file so the GitHub Actions workflow can read it.") + return output_items -def main(): - """Main execution flow.""" - print("\n" + "=" * 80) - print(" Trail Guide Agent - Cloud Evaluation") - print("=" * 80) + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + """Orchestrate the full evaluation pipeline step by step.""" + section(" Trail Guide Agent - Cloud Evaluation") print(f"\nConfiguration:") print(f" Project: {endpoint}") - print(f" Model: 
{model_deployment_name}") + print(f" Model: {model_deployment_name}") print(f" Dataset: {dataset_name} (v{dataset_version})") - + try: - # Step 1: Upload dataset - data_id = upload_dataset() - - # Step 2: Create evaluation definition - eval_object = create_evaluation_definition() - - # Step 3: Run evaluation - eval_run = run_evaluation(eval_object, data_id) - - # Step 4: Poll for results - run = poll_for_results(eval_object, eval_run) - - # Step 5: Retrieve and display results - output_items = retrieve_and_display_results(eval_object, run) - - print("\n" + "=" * 80) - print("Cloud evaluation complete") - print("=" * 80) + data_id = upload_dataset() # Step 1 + eval_object = create_evaluation_definition() # Step 2 + eval_run = run_evaluation(eval_object, data_id) # Step 3 + run = poll_for_results(eval_object, eval_run) # Step 4 + retrieve_and_display_results(eval_object, run) # Step 5 + + section("Cloud evaluation complete") print(f"\nNext steps:") print(f" 1. Review detailed results in Azure AI Foundry portal") print(f" 2. Analyze patterns in successful and failed evaluations") - print(f" 3. Document key findings and recommendations") - + print(f" 3. 
Commit {RESULTS_FILE} and push so the PR workflow can use it") + except Exception as e: - print(f"\nError: {e}") - print(f"\nTroubleshooting:") - print(f" - Verify AZURE_AI_PROJECT_ENDPOINT in .env file") - print(f" - Check Azure credentials: az login") - print(f" - Ensure GPT-4.1 model is deployed and accessible") + error_message = ( + f"{'=' * 80}\n" + f" Trail Guide Agent - Evaluation FAILED\n" + f"{'=' * 80}\n" + f"\nError: {e}\n" + f"\nTroubleshooting:\n" + f" - Verify AZURE_AI_PROJECT_ENDPOINT in .env file\n" + f" - Check Azure credentials: az login\n" + f" - Ensure GPT-4.1 model is deployed and accessible\n" + ) + print(error_message) + # Write the error to the results file so it's never left empty + RESULTS_FILE.write_text(error_message, encoding="utf-8") sys.exit(1) + if __name__ == "__main__": main() diff --git a/src/tests/interact_with_agent.py b/src/tests/interact_with_agent.py index 3a2000f..2cc900f 100644 --- a/src/tests/interact_with_agent.py +++ b/src/tests/interact_with_agent.py @@ -9,10 +9,8 @@ from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient -# Load environment variables from repository root -repo_root = Path(__file__).parent.parent.parent -env_file = repo_root / '.env' -load_dotenv(env_file) +# Load environment variables from .env file +load_dotenv() def interact_with_agent(): """Start an interactive chat session with the Trail Guide Agent.""" diff --git a/src/tests/run_batch_tests.py b/src/tests/run_batch_tests.py index 045d280..51f61e1 100644 --- a/src/tests/run_batch_tests.py +++ b/src/tests/run_batch_tests.py @@ -16,10 +16,8 @@ from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient -# Load environment variables from repository root -repo_root = Path(__file__).parent.parent.parent -env_file = repo_root / '.env' -load_dotenv(env_file) +# Load environment variables from .env file +load_dotenv() def load_test_prompts(test_prompts_dir): """Load all test prompt 
files from the test-prompts directory.""" @@ -53,12 +51,14 @@ def run_batch_tests(experiment_name): endpoint=os.environ["AZURE_AI_PROJECT_ENDPOINT"], credential=DefaultAzureCredential(), ) + + openai_client = client.get_openai_client() # Get the agent by name (assumes trail_guide_agent.py already created it) agent_name = os.environ.get("AGENT_NAME", "trail-guide") # List agents and find the one with our name - agents = client.agents.list_agents() + agents = client.agents.list() agent = None for a in agents: if a.name == agent_name: @@ -70,10 +70,7 @@ def run_batch_tests(experiment_name): print("Please run 'python src/agents/trail_guide_agent/trail_guide_agent.py' first to create the agent.") return - print(f"Using agent: {agent.name} (id: {agent.id}, version: {agent.version})") - - # Create thread for this test run - thread = client.agents.create_thread() + print(f"Using agent: {agent.name} (id: {agent.id}, version: {agent.versions})") # Capture all results results = { @@ -81,8 +78,8 @@ def run_batch_tests(experiment_name): "timestamp": datetime.now().isoformat(), "agent_id": agent.id, "agent_name": agent.name, - "agent_version": agent.version, - "model": agent.model, + "agent_version": agent.versions.as_dict() if hasattr(agent.versions, "as_dict") else str(agent.versions), + "agent_object": getattr(agent, "object", None), "test_results": [] } @@ -90,40 +87,49 @@ def run_batch_tests(experiment_name): for test_name, prompt_text in test_prompts.items(): print(f"\nTesting: {test_name}") print(f" Prompt: {prompt_text[:60]}...") - - # Send message - message = client.agents.create_message( - thread_id=thread.id, - role="user", - content=prompt_text, + + # Create a fresh conversation for this test + conversation = openai_client.conversations.create() + + # Send user message into the conversation + openai_client.conversations.items.create( + conversation_id=conversation.id, + items=[{ + "type": "message", + "role": "user", + "content": prompt_text, + }], ) - - # Run agent 
- run = client.agents.create_and_process_run( - thread_id=thread.id, - assistant_id=agent.id, + + # Ask the agent to respond using the Responses API with agent_reference + response = openai_client.responses.create( + conversation=conversation.id, + extra_body={"agent_reference": {"name": agent_name, "type": "agent_reference"}}, + input="", ) - - # Get response - messages = client.agents.list_messages(thread_id=thread.id) - response = messages.data[0].content[0].text.value - - # Get token usage from run + + # Extract text from the response; fall back to str(response) if shape changes + try: + response_text = response.output[0].content[0].text + except Exception: + response_text = str(response) + + usage = getattr(response, "usage", None) token_usage = { - "prompt_tokens": run.usage.prompt_tokens if run.usage else None, - "completion_tokens": run.usage.completion_tokens if run.usage else None, - "total_tokens": run.usage.total_tokens if run.usage else None, + "prompt_tokens": getattr(usage, "input_tokens", None) if usage else None, + "completion_tokens": getattr(usage, "output_tokens", None) if usage else None, + "total_tokens": getattr(usage, "total_tokens", None) if usage else None, } - + # Store result results["test_results"].append({ "test_name": test_name, "prompt": prompt_text, - "response": response, + "response": response_text, "token_usage": token_usage, - "run_id": run.id, + "run_id": getattr(response, "id", None), }) - + print(f" Response captured ({token_usage['total_tokens']} tokens)") # Save results to experiment folder (at repository root)