diff --git a/.github/workflows/evaluate-agent.yml b/.github/workflows/evaluate-agent.yml index a12bf0c..dfd5cbe 100644 --- a/.github/workflows/evaluate-agent.yml +++ b/.github/workflows/evaluate-agent.yml @@ -2,10 +2,10 @@ name: Evaluate Trail Guide Agent on: # Uncomment the lines below to enable automatic evaluation on pull requests - # pull_request: - # branches: [main] - # paths: - # - 'src/agents/trail_guide_agent/**' + pull_request: + branches: [main] + paths: + - 'src/agents/trail_guide_agent/**' workflow_dispatch: permissions: @@ -44,9 +44,14 @@ jobs: env: AZURE_AI_PROJECT_ENDPOINT: ${{ secrets.AZURE_AI_PROJECT_ENDPOINT }} MODEL_NAME: ${{ vars.MODEL_NAME || 'gpt-4.1' }} + AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} + AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} + AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} run: | - python src/evaluators/evaluate_agent.py > evaluation_results.txt + python src/evaluators/evaluate_agent.py > evaluation_results.txt 2>&1 || true cat evaluation_results.txt + # Fail the step if the script wrote an error marker or crashed with a traceback + grep -qE "Evaluation FAILED|Traceback" evaluation_results.txt && exit 1 || exit 0 - name: Comment PR with results if: github.event_name == 'pull_request' @@ -55,6 +60,7 @@ jobs: script: | const fs = require('fs'); const results = fs.readFileSync('evaluation_results.txt', 'utf8'); + const reportUrl = '${{ steps.run.outputs.report_url }}' || 'Not available'; const body = `## 🎯 Agent Evaluation Results @@ -69,7 +75,7 @@ jobs: - 📊 [View full results in Azure AI Foundry Portal](${{ steps.run.outputs.report_url }}) + 📊 [View full results in Azure AI Foundry Portal](${reportUrl}) **Evaluation Criteria:** - Intent Resolution (score ≥ 3) diff --git a/docs/02-prompt-management.md b/docs/02-prompt-management.md index fdf84e8..b9a3c5b 100644 --- a/docs/02-prompt-management.md +++ b/docs/02-prompt-management.md @@ -74,6 +74,13 @@ Now you'll use the Azure Developer CLI to deploy all required Azure resources. 
Sign in with your Azure credentials when prompted. + > ⚠️ **Important** + > In some environments, the VS Code integrated terminal may crash or close during the interactive login flow. + > If this happens, authenticate using explicit credentials instead: + > ```powershell + > az login --username --password + > ``` + 1. Provision resources: ```powershell @@ -98,6 +105,15 @@ Now you'll use the Azure Developer CLI to deploy all required Azure resources. azd env get-values > .env ``` + > ⚠️ **Important – File Encoding** + > + > After generating the `.env` file, make sure it is saved using **UTF-8** encoding. + > + > In editors like **VS Code**, check the encoding indicator in the bottom-right corner. + > If it shows **UTF-16 LE** (or any encoding other than UTF-8), click it, choose **Save with Encoding**, and select **UTF-8**. + > + > Using the wrong encoding may cause environment variables to be read incorrectly. + This creates a `.env` file in your project root with all the provisioned resource information. 1. Add the agent configuration to your `.env` file: diff --git a/docs/03-design-optimize-prompts.md b/docs/03-design-optimize-prompts.md index ad4bd9b..f8513bc 100644 --- a/docs/03-design-optimize-prompts.md +++ b/docs/03-design-optimize-prompts.md @@ -74,6 +74,13 @@ Now you'll use the Azure Developer CLI to deploy all required Azure resources. Sign in with your Azure credentials when prompted. + > ⚠️ **Important** + > In some environments, the VS Code integrated terminal may crash or close during the interactive login flow. + > If this happens, authenticate using explicit credentials instead: + > ```powershell + > az login --username --password + > ``` + 1. Provision resources: ```powershell @@ -98,6 +105,15 @@ Now you'll use the Azure Developer CLI to deploy all required Azure resources. azd env get-values > .env ``` + > ⚠️ **Important – File Encoding** + > + > After generating the `.env` file, make sure it is saved using **UTF-8** encoding. 
+ > + > In editors like **VS Code**, check the encoding indicator in the bottom-right corner. + > If it shows **UTF-16 LE** (or any encoding other than UTF-8), click it, choose **Save with Encoding**, and select **UTF-8**. + > + > Using the wrong encoding may cause environment variables to be read incorrectly. + This creates a `.env` file in your project root with all the provisioned resource information. ### Install Python dependencies diff --git a/docs/04-automated-evaluation.md b/docs/04-automated-evaluation.md index 5ca61e8..349c28a 100644 --- a/docs/04-automated-evaluation.md +++ b/docs/04-automated-evaluation.md @@ -14,9 +14,9 @@ This exercise takes approximately **40 minutes**. ## Introduction -In this exercise, you'll use Microsoft Foundry's cloud evaluators to automatically assess quality at scale for the Adventure Works Trail Guide Agent. You'll run evaluations against a large test dataset (200 query-response pairs) to validate quality metrics and establish an automated evaluation pipeline for future changes. +In this exercise, you'll use Microsoft Foundry's cloud evaluators to automatically assess quality at scale for the Adventure Works Trail Guide Agent. You'll run evaluations against a large test dataset (89 query-response pairs) to validate quality metrics and establish an automated evaluation pipeline for future changes. -**Scenario**: You're operating the Adventure Works Trail Guide Agent. You want to evaluate it against a large test dataset (200 query-response pairs) to validate quality metrics and establish an automated evaluation pipeline that can scale as your agent evolves. +**Scenario**: You're operating the Adventure Works Trail Guide Agent. You want to evaluate it against a large test dataset (89 query-response pairs) to validate quality metrics and establish an automated evaluation pipeline that can scale as your agent evolves. 
You'll use the following evaluation criteria—automated at scale: @@ -80,6 +80,13 @@ Now you'll use the Azure Developer CLI to deploy all required Azure resources. Sign in with your Azure credentials when prompted. + > ⚠️ **Important** + > In some environments, the VS Code integrated terminal may crash or close during the interactive login flow. + > If this happens, authenticate using explicit credentials instead: + > ```powershell + > az login --username --password + > ``` + 1. Provision resources: ```powershell @@ -104,6 +111,15 @@ Now you'll use the Azure Developer CLI to deploy all required Azure resources. azd env get-values > .env ``` + > ⚠️ **Important – File Encoding** + > + > After generating the `.env` file, make sure it is saved using **UTF-8** encoding. + > + > In editors like **VS Code**, check the encoding indicator in the bottom-right corner. + > If it shows **UTF-16 LE** (or any encoding other than UTF-8), click it, choose **Save with Encoding**, and select **UTF-8**. + > + > Using the wrong encoding may cause environment variables to be read incorrectly. + This creates a `.env` file in your project root with all the provisioned resource information. ### Install Python dependencies @@ -159,7 +175,7 @@ Cloud evaluation follows a structured workflow: ### Dataset preparation -The repository includes `data/trail_guide_evaluation_dataset.jsonl` with 200 pre-generated query-response pairs covering diverse hiking scenarios. Each entry includes: +The repository includes `data/trail_guide_evaluation_dataset.jsonl` with 89 pre-generated query-response pairs covering diverse hiking scenarios. Each entry includes: - `query`: User question - `response`: Agent-generated answer @@ -206,7 +222,7 @@ First, examine the prepared dataset structure. 
(Get-Content data/trail_guide_evaluation_dataset.jsonl).Count ``` - Expected: 200 entries + Expected: 89 entries ### Understand the evaluation pipeline @@ -219,7 +235,7 @@ The script performs all evaluation steps automatically: 1. **Upload Dataset** - Uploads the JSONL dataset to Microsoft Foundry 2. **Define Evaluation** - Creates evaluation definition with quality evaluators (Intent Resolution, Relevance, Groundedness) 3. **Run Evaluation** - Starts the cloud evaluation run -4. **Poll for Completion** - Waits for evaluation to complete (5-10 minutes for 200 items) +4. **Poll for Completion** - Waits for evaluation to complete (5-10 minutes for 89 items) 5. **Display Results** - Retrieves and shows scoring statistics This single-script approach makes it easy to run evaluations both locally during development and automatically in CI/CD pipelines. @@ -279,7 +295,7 @@ Execute the complete evaluation pipeline with one command. Run ID: run-ghi789rst Status: running - This may take 5-10 minutes for 200 items... + This may take 5-10 minutes for 89 items... ================================================================================ Step 4: Polling for completion @@ -297,9 +313,9 @@ Execute the complete evaluation pipeline with one command. Report URL: https://.services.ai.azure.com/projects//evaluations/... Average Scores (1-5 scale, threshold: 3) - Intent Resolution: 4.52 (n=200) - Relevance: 4.41 (n=200) - Groundedness: 4.18 (n=200) + Intent Resolution: 4.52 (n=89) + Relevance: 4.41 (n=89) + Groundedness: 4.18 (n=89) Pass Rates (score >= 3) Intent Resolution: 96.0% @@ -316,7 +332,7 @@ Execute the complete evaluation pipeline with one command. 3. Document key findings and recommendations ``` - > **Note**: Evaluation runtime varies based on dataset size and model capacity. 200 items typically takes 5-15 minutes. + > **Note**: Evaluation runtime varies based on dataset size and model capacity. 89 items typically takes 5-15 minutes. 1. 
**Commit the results file** @@ -374,27 +390,54 @@ The evaluation script integrates seamlessly into GitHub Actions for automated PR 1. **Configure Azure authentication** - Create a service principal with Foundry project access: + Create a service principal for GitHub Actions: ```powershell - # Create service principal - az ad sp create-for-rbac --name "github-agent-evaluator" ` - --role "Azure AI Developer" ` - --scopes /subscriptions//resourceGroups//providers/Microsoft.MachineLearningServices/workspaces/ ` - --sdk-auth + az ad sp create-for-rbac --name "github-agent-evaluator" ``` - Configure federated identity for GitHub OIDC: + Note the `appId` value from the output — you will use it in the next steps. + + Assign the **Azure AI User** role at the account scope. This role has `Microsoft.CognitiveServices/*` wildcard data actions, which covers the `AIServices/agents/write` action required by the Foundry project evaluation API: + + ```powershell + az role assignment create ` + --assignee "" ` + --role "Azure AI User" ` + --scope "/subscriptions//resourceGroups//providers/Microsoft.CognitiveServices/accounts/" + ``` + + > **Important**: Use the `AZURE_AI_ACCOUNT_NAME` value from your `.env` file as ``. The `Azure AI Developer` role is **not sufficient** — it only covers `OpenAI/*`, `SpeechServices/*`, `ContentSafety/*`, and `MaaS/*` data actions, but not `AIServices/agents/write` which the Foundry project API requires. + + > **Tip**: If you set the optional `githubActionsPrincipalId` parameter when running `azd up`, the infrastructure deployment will create this role assignment automatically for future environments. + + Configure federated identity for GitHub OIDC so the workflow can authenticate without a secret. 
+ + Create a file named `federated-credential.json` in your repository root: + + ```json + { + "name": "github-actions", + "issuer": "https://token.actions.githubusercontent.com", + "subject": "repo:/:ref:refs/heads/main", + "audiences": ["api://AzureADTokenExchange"] + } + ``` + + > **Note**: Replace `/` with your exact GitHub username and repository name. The subject is case-sensitive and must match exactly. + + Register the federated credential using the file: ```powershell az ad app federated-credential create ` - --id ` - --parameters '{ - "name": "github-actions", - "issuer": "https://token.actions.githubusercontent.com", - "subject": "repo:/:ref:refs/heads/main", - "audiences": ["api://AzureADTokenExchange"] - }' + --id "" ` + --parameters @federated-credential.json + ``` + + Once the credential is created successfully, delete the file — it contains no secrets but there is no reason to keep it in the repository: + + ```powershell + Remove-Item federated-credential.json ``` 1. **Review the PR evaluation workflow** @@ -483,7 +526,7 @@ Document your findings and create an analysis report. ## Evaluation Summary - Evaluated: 200 test cases + Evaluated: 89 test cases Time: ~10 minutes Scoring: GPT-4.1 as LLM judge (1-5 scale) @@ -521,7 +564,7 @@ Document your findings and create an analysis report. - **Scales** to hundreds/thousands of items efficiently - **Consistent** scoring criteria across all evaluations - - **Fast** turnaround (10 minutes for 200 items) + - **Fast** turnaround (10 minutes for 89 items) - **Repeatable** and trackable over time - **CI/CD ready** for integration into deployment pipelines - **Detailed reasoning** provided for each score @@ -580,7 +623,7 @@ Compare evaluation results between GPT-4.1 and GPT-4.1-mini to understand qualit ### Run evaluation on GPT-4.1-mini responses -1. Generate 200 responses from GPT-4.1-mini for the same queries. +1. Generate 89 responses from GPT-4.1-mini for the same queries. 1. 
Run cloud evaluation on both sets. @@ -610,7 +653,7 @@ Create `experiments/automated/model_comparison.md` with: **Resolution**: - Run `az login` to refresh Azure credentials -- Verify you have **Azure AI User** role on the Foundry project +- Verify the service principal has the **Azure AI User** role at the CognitiveServices account scope — this role has `Microsoft.CognitiveServices/*` wildcard data actions required for `AIServices/agents/write`. `Azure AI Developer` alone is **not sufficient** - Check `AZURE_AI_PROJECT_ENDPOINT` in `.env` file is correct and includes `/api/projects/` ### Evaluator scoring seems inconsistent diff --git a/evaluation_results.txt b/evaluation_results.txt index c0603f0..7577fab 100644 --- a/evaluation_results.txt +++ b/evaluation_results.txt @@ -1,75 +1,15 @@ -================================================================================ - Trail Guide Agent - Cloud Evaluation -================================================================================ - -Configuration: - Project: https://ai-account-u2favsrdpp24k.services.ai.azure.com/api/projects/ai-project-dev-tester - Model: gpt-4.1 - Dataset: trail-guide-evaluation-dataset (v1) - -================================================================================ -Step 1: Uploading evaluation dataset -================================================================================ - -Dataset: trail_guide_evaluation_dataset.jsonl -Uploading... - - Dataset version 1 already exists in Foundry. - Retrieving existing dataset ID... 
- ✓ Using existing dataset - Dataset ID: azureai://accounts/ai-account-u2favsrdpp24k/projects/ai-project-dev-tester/data/trail-guide-evaluation-dataset/versions/1 - -================================================================================ -Step 2: Creating evaluation definition -================================================================================ - -Configuration: - Judge Model: gpt-4.1 - Evaluators: Intent Resolution, Relevance, Groundedness - -Creating evaluation... - -✓ Evaluation definition created - Evaluation ID: eval_7d81e353fce74ab3a51105fac7f59ed8 - -================================================================================ -Step 3: Running cloud evaluation -================================================================================ - -✓ Evaluation run started - Run ID: evalrun_31596933faa44251a62247465b5d8fbd - Status: in_progress - -This may take 5-10 minutes for 200 items... - -================================================================================ -Step 4: Polling for completion -================================================================================ - [3675s] Status: in_progress - -✓ Evaluation completed in 3694 seconds - -================================================================================ -Step 5: Retrieving results -================================================================================ - -Evaluation Summary - Report URL: https://ai.azure.com/nextgen/r/ZDCVYALRRAyvc0GrsS4tgA,rg-dev-tester,,ai-account-u2favsrdpp24k,ai-project-dev-tester/build/evaluations/eval_7d81e353fce74ab3a51105fac7f59ed8/run/evalrun_31596933faa44251a62247465b5d8fbd - - [DEBUG] First item type : - [DEBUG] First item attrs: ['construct', 'copy', 'created_at', 'datasource_item', 'datasource_item_id', 'dict', 'eval_id', 'from_orm', 'id', 'json', 'model_computed_fields', 'model_config', 'model_construct', 'model_copy', 'model_dump', 'model_dump_json', 'model_extra', 'model_fields', 'model_fields_set', 
'model_json_schema', 'model_parametrized_name', 'model_post_init', 'model_rebuild', 'model_validate', 'model_validate_json', 'model_validate_strings', 'object', 'parse_file', 'parse_obj', 'parse_raw', 'results', 'run_id', 'sample', 'schema', 'schema_json', 'status', 'to_dict', 'to_json', 'update_forward_refs', 'validate'] - [DEBUG] item.__dict__: {'id': '1', 'created_at': 1772201992, 'datasource_item': {'query': 'What essential gear do I need for a summer day hike?', 'response': 'For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out.', 'ground_truth': 'Essential day hike gear includes footwear, water, food, sun protection, navigation tools, first aid, and emergency supplies.'}, 'datasource_item_id': 0, 'eval_id': 'eval_7d81e353fce74ab3a51105fac7f59ed8', 'object': 'eval.run.output_item', 'results': [Result(name='intent_resolution', passed=True, score=5.0, sample={'usage': {'prompt_tokens': 1984, 'completion_tokens': 61, 'total_tokens': 2045}, 'finish_reason': 'stop', 'model': 'gpt-4.1-2025-04-14', 'input': [{'role': 'user', 'content': '{"query": "What essential gear do I need for a summer day hike?", "response": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. 
Always check the weather forecast and trail conditions before heading out.", "tool_definitions": null}'}], 'output': [{'role': 'assistant', 'content': '{\n "explanation": "The user asked for essential gear for a summer day hike. The agent provided a thorough, accurate, and relevant list, including safety, hydration, navigation, and sun protection, fully resolving the user\'s intent with no notable omissions.",\n "score": 5\n}'}]}, type='azure_ai_evaluator', metric='intent_resolution', label='pass', reason="The user asked for essential gear for a summer day hike. The agent provided a thorough, accurate, and relevant list, including safety, hydration, navigation, and sun protection, fully resolving the user's intent with no notable omissions.", threshold=3), Result(name='relevance', passed=True, score=5.0, sample={'usage': {'prompt_tokens': 1678, 'completion_tokens': 63, 'total_tokens': 1741}, 'finish_reason': 'stop', 'model': 'gpt-4.1-2025-04-14', 'input': [{'role': 'user', 'content': '{"query": "What essential gear do I need for a summer day hike?", "response": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out."}'}], 'output': [{'role': 'assistant', 'content': '{\n "explanation": "The response provides a thorough list of essential gear for a summer day hike, including clothing, hydration, navigation, safety, and sun protection. 
It also adds practical advice about checking weather and trail conditions, offering both completeness and useful context.",\n "score": 5\n}'}]}, type='azure_ai_evaluator', metric='relevance', label='pass', reason='The response provides a thorough list of essential gear for a summer day hike, including clothing, hydration, navigation, safety, and sun protection. It also adds practical advice about checking weather and trail conditions, offering both completeness and useful context.', threshold=3), Result(name='groundedness', passed=True, score=5.0, sample={'usage': {'prompt_tokens': 1571, 'completion_tokens': 106, 'total_tokens': 1677}, 'finish_reason': 'stop', 'model': 'gpt-4.1-2025-04-14', 'input': [{'role': 'user', 'content': '{"query": "What essential gear do I need for a summer day hike?", "response": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out.", "context": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out."}'}], 'output': [{'role': 'assistant', 'content': "Let's think step by step: The query asks for the essential gear needed for a summer day hike. The context provides a detailed list of essential gear and some additional advice. 
The response repeats the context almost verbatim, listing all the gear and advice without omitting or adding any information. There are no inaccuracies or missing details.\nThe response is fully accurate, complete, and directly grounded in the provided context, addressing the query thoroughly.\n5"}]}, type='azure_ai_evaluator', metric='groundedness', label='pass', reason='The response is fully accurate, complete, and directly grounded in the provided context, addressing the query thoroughly.', threshold=3)], 'run_id': 'evalrun_31596933faa44251a62247465b5d8fbd', 'sample': Sample(error=None, finish_reason='stop', input=[SampleInput(content='{"query": "What essential gear do I need for a summer day hike?", "response": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out.", "context": "For a summer day hike, essential gear includes: proper hiking boots with good ankle support, moisture-wicking clothing in layers, a daypack (20-30L), 2 liters of water, high-energy snacks, sun protection (hat, sunglasses, sunscreen SPF 30+), a basic first aid kit, map and compass or GPS device, headlamp with extra batteries, and a whistle for emergencies. Always check the weather forecast and trail conditions before heading out."}', role='user')], max_completion_tokens=None, model='gpt-4.1-2025-04-14', output=[SampleOutput(content="Let's think step by step: The query asks for the essential gear needed for a summer day hike. The context provides a detailed list of essential gear and some additional advice. 
The response repeats the context almost verbatim, listing all the gear and advice without omitting or adding any information. There are no inaccuracies or missing details.\nThe response is fully accurate, complete, and directly grounded in the provided context, addressing the query thoroughly.\n5", role='assistant')], seed=None, temperature=None, top_p=None, usage=SampleUsage(cached_tokens=None, completion_tokens=106, prompt_tokens=1571, total_tokens=1677)), 'status': 'completed'} ================================================================================ Trail Guide Agent - Evaluation Results ================================================================================ - Report URL : https://ai.azure.com/nextgen/r/ZDCVYALRRAyvc0GrsS4tgA,rg-dev-tester,,ai-account-u2favsrdpp24k,ai-project-dev-tester/build/evaluations/eval_7d81e353fce74ab3a51105fac7f59ed8/run/eva + Eval ID : eval_7c4c645fbe2c4e9e83d2c1b3af300106 + Run ID : evalrun_97c6ed6859324fc0bb01dce5df3ecb75 + Total items : 89 + Errored items: 0 + Scored items : 89 -================================================================================ -Cloud evaluation complete -================================================================================ +Average Scores (1-5 scale, threshold: 3) + No scores returned — open Azure AI Foundry portal > Evaluations for details. -Next steps: - 1. Review detailed results in Azure AI Foundry portal - 2. Analyze patterns in successful and failed evaluations - 3. Commit evaluation_results.txt and push so the PR workflow can use it \ No newline at end of file +Pass Rates (score >= 3) + No scores returned. 
\ No newline at end of file diff --git a/infra/core/ai/ai-project.bicep b/infra/core/ai/ai-project.bicep index c309fa9..c63eb1c 100644 --- a/infra/core/ai/ai-project.bicep +++ b/infra/core/ai/ai-project.bicep @@ -152,6 +152,20 @@ module aiConnections './connection.bicep' = [for (connection, index) in connecti } }] +// Azure AI User (53ca6127-db72-4b80-b1b0-d745d6d5456d) has Microsoft.CognitiveServices/* wildcard +// data actions, covering AIServices/agents/write required by the Foundry project API. +// Assign at the account scope so it applies to all projects under this account. +resource localUserAiUserRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + scope: aiAccount + name: guid(subscription().id, resourceGroup().id, principalId, '53ca6127-db72-4b80-b1b0-d745d6d5456d') + properties: { + principalId: principalId + principalType: principalType + roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', '53ca6127-db72-4b80-b1b0-d745d6d5456d') + } +} + +// Keep Azure AI Developer at resource group scope for broader resource management access resource localUserAiDeveloperRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { scope: resourceGroup() name: guid(subscription().id, resourceGroup().id, principalId, '64702f94-c441-49e6-a78b-ef80e0188fee') @@ -172,6 +186,21 @@ resource localUserCognitiveServicesUserRoleAssignment 'Microsoft.Authorization/r } } +// Optional: assign Azure AI User to a GitHub Actions service principal so CI/CD +// workflows can call the Foundry project API (AIServices/agents/write). +@description('Optional. 
Object ID of the GitHub Actions service principal to grant Azure AI User role.') +param githubActionsPrincipalId string = '' + +resource githubActionsAiUserRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = if (!empty(githubActionsPrincipalId)) { + scope: aiAccount + name: guid(subscription().id, resourceGroup().id, githubActionsPrincipalId, '53ca6127-db72-4b80-b1b0-d745d6d5456d') + properties: { + principalId: githubActionsPrincipalId + principalType: 'ServicePrincipal' + roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', '53ca6127-db72-4b80-b1b0-d745d6d5456d') + } +} + resource projectCognitiveServicesUserRoleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { scope: aiAccount name: guid(subscription().id, resourceGroup().id, project.name, '53ca6127-db72-4b80-b1b0-d745d6d5456d') diff --git a/infra/main.bicep b/infra/main.bicep index f5fcf9d..dc88666 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -110,6 +110,9 @@ param enableHostedAgents bool @description('Enable monitoring for the AI project') param enableMonitoring bool = true +@description('Optional. Object ID of the GitHub Actions service principal to grant Azure AI User role for CI/CD evaluation workflows.') +param githubActionsPrincipalId string = '' + // Tags that should be applied to all resources. // // Note that 'azd-service-name' tags should be applied separately to service host resources. 
@@ -152,6 +155,7 @@ module aiProject 'core/ai/ai-project.bicep' = { additionalDependentResources: dependentResources enableMonitoring: enableMonitoring enableHostedAgents: enableHostedAgents + githubActionsPrincipalId: githubActionsPrincipalId } } diff --git a/src/agents/trail_guide_agent/prompts/v1_instructions.txt b/src/agents/trail_guide_agent/prompts/v1_instructions.txt index 9611d32..3c5b8b7 100644 --- a/src/agents/trail_guide_agent/prompts/v1_instructions.txt +++ b/src/agents/trail_guide_agent/prompts/v1_instructions.txt @@ -1 +1 @@ -You are a helpful trail guide assistant for Adventure Works, an outdoor gear company. Help users with basic trail recommendations, safety tips, and gear suggestions for hiking and outdoor activities. Keep responses informative but concise. \ No newline at end of file +You are a helpful trail guide assistant for Adventure Works, an outdoor gear company. Help customers with basic trail recommendations, safety tips, and gear suggestions for hiking and outdoor activities. Keep responses informative but concise. 
\ No newline at end of file diff --git a/src/evaluators/evaluate_agent.py b/src/evaluators/evaluate_agent.py index bfeffb7..c7b31d4 100644 --- a/src/evaluators/evaluate_agent.py +++ b/src/evaluators/evaluate_agent.py @@ -189,6 +189,7 @@ def create_evaluation_definition(): "data_mapping": { "query": "{{item.query}}", "response": "{{item.response}}", + "context": "{{item.ground_truth}}", }, }, ] @@ -386,6 +387,16 @@ def retrieve_and_display_results(eval_object, run): print(f"\n Results saved to {RESULTS_FILE}") print(f" Commit this file so the GitHub Actions workflow can read it.") + # Emit report_url as a GitHub Actions step output when running in CI + report_url = getattr(run, "report_url", None) or ( + f"{endpoint.rstrip('/')}/evaluations/{eval_object.id}/runs/{run.id}" + ) + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + with open(github_output, "a", encoding="utf-8") as gh_out: + gh_out.write(f"report_url={report_url}\n") + print(f" GitHub Actions output set: report_url={report_url}") + return output_items