From b00c4c3974b97bb6f05bcc583a8f030b8c721337 Mon Sep 17 00:00:00 2001 From: David Joy <10147749+dmjoy@users.noreply.github.com> Date: Sun, 17 Nov 2024 20:41:09 -0500 Subject: [PATCH] Added final phase 1 evaluation configs and updated version for 0.5.5 release --- CHANGELOG.md | 11 +++++- README.md | 21 ++++------ .../aligned_adm_adept_eval.yaml | 37 ++++++++++++++++++ .../aligned_adm_soartech_eval.yaml | 39 +++++++++++++++++++ pyproject.toml | 2 +- 5 files changed, 94 insertions(+), 16 deletions(-) create mode 100644 align_system/configs/experiment/phase1_evaluation/aligned_adm_adept_eval.yaml create mode 100644 align_system/configs/experiment/phase1_evaluation/aligned_adm_soartech_eval.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 49f5fd5f..71dd47dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ This changelog follows the specifications detailed in: [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html), although we have not yet reached a `1.0.0` release. +## 0.5.5 + +### Added + +* Added Phase 1 Evaluation experiment configuration files +* Added ICL example selection method that gives larger weight to examples with the same character ids as the current probe. To use set `incontext.method` to `matching_characters`. +* Added ICL example selection method that gives larger weight to examples with the same action types as the current probe. To use set `incontext.method` to `matching_actions`. +* Added retrieved ICL examples to `input_output.json` + ## 0.5.4 ### Changed @@ -31,8 +40,6 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm * Added KDMA scaling factor option. 
Scale factors for each KDMA are added to `align_system/prompt_engineering/kdma_descriptions.yml` * Added heuristic treatment options component * Added incontext examples to the `input_output.json` files for comparative regression -* Added ICL example selection method that gives larger weight to examples with the same action types as the current probe. To use set `incontext.method` to `matching_actions`. -* Added ICL example selection method that gives larger weight to examples with the same characetr ids as the current probe. To use set `incontext.method` to `matching_characters`. ### Fixed diff --git a/README.md b/README.md index 1cb09c32..5f8786d4 100644 --- a/README.md +++ b/README.md @@ -112,9 +112,9 @@ captured in a new configuration file. We manage these experiments in delivered ADMs for the Metrics Evaluation (both to run on training data, and eval data). -## Dry Run Evaluation ADM Invocations +## Phase 1 Evaluation ADM Invocations -We've specified Hydra experiments for the Dry Run Evaluation ADMs. +We've specified Hydra experiments for the Phase 1 Evaluation ADMs. Note that by default these configurations attempt to connect to `https://darpaitm.caci.com` as the TA3 API endpoint, but this can be overridden with `interface.api_endpoint='http://127.0.0.1:8080'` on @@ -125,32 +125,27 @@ the command line. 
(Good candidate for a smoketest) ``` -run_align_system +experiment=dry_run_evaluation/random_eval_live +run_align_system +experiment=phase1_evaluation/random_eval_live ``` ### Baseline ADM ``` -run_align_system +experiment=dry_run_evaluation/outlines_baseline_eval_live +run_align_system +experiment=phase1_evaluation/baseline_eval_live ``` -### Aligned ADM 1 (Comparative Regression + ICL + Template ADM) (ADEPT eval scenarios) +### Aligned ADM Adept (Comparative Regression + ICL + Template ADM) (ADEPT eval scenarios) ``` -run_align_system +experiment=dry_run_evaluation/comparative_regression_icl_template_eval_live_adept +run_align_system +experiment=phase1_evaluation/aligned_adm_adept_eval ``` -### Aligned ADM 1 (Comparative Regression + ICL + Template ADM) (SoarTech eval scenarios) +### Aligned ADM SoarTech (Comparative Regression + ICL + Template ADM) (SoarTech eval scenarios) ``` -run_align_system +experiment=dry_run_evaluation/comparative_regression_icl_template_eval_live_soartech +run_align_system +experiment=phase1_evaluation/aligned_adm_soartech_eval ``` -### Aligned ADM 2 (Hybrid Regression ADM) - -``` -run_align_system +experiment=dry_run_evaluation/hybrid_regression_eval_live -``` ## Implementing a new ADM diff --git a/align_system/configs/experiment/phase1_evaluation/aligned_adm_adept_eval.yaml b/align_system/configs/experiment/phase1_evaluation/aligned_adm_adept_eval.yaml new file mode 100644 index 00000000..f0005a4d --- /dev/null +++ b/align_system/configs/experiment/phase1_evaluation/aligned_adm_adept_eval.yaml @@ -0,0 +1,37 @@ +# @package _global_ +defaults: + - override /adm: outlines_regression_aligned_comparative/incontext_phase1 + - override /interface: ta3 + +interface: + api_endpoint: "https://darpaitm.caci.com" + session_type: adept + training_session: null + username: "ALIGN-ADM-ComparativeRegression-Mistral-7B-Instruct-v0.2-ADEPT-10Sample" + +adm: + instance: + precision: half + sampler: + _target_: outlines.samplers.MultinomialSampler + 
temperature: 0.7 + model_name: mistralai/Mistral-7B-Instruct-v0.2 + inference_kwargs: + kdma_score_examples: true + num_samples: 10 + predict_outcomes: false + generator_batch_size: 5 + incontext: + sort_actions: true + normalization: null + number: 5 + leave_one_out_strategy: null + most_similar_first: false + +sort_available_actions: true +align_to_target: true +save_last_unstructured_state_per_scenario: true + +hydra: + run: + dir: 'phase1_eval_live/ALIGN-ADM-ComparativeRegression-Mistral-7B-Instruct-v0.2-ADEPT-10Sample/${now:%Y-%m-%d__%H-%M-%S}' diff --git a/align_system/configs/experiment/phase1_evaluation/aligned_adm_soartech_eval.yaml b/align_system/configs/experiment/phase1_evaluation/aligned_adm_soartech_eval.yaml new file mode 100644 index 00000000..8a16cb74 --- /dev/null +++ b/align_system/configs/experiment/phase1_evaluation/aligned_adm_soartech_eval.yaml @@ -0,0 +1,39 @@ +# @package _global_ +defaults: + - override /adm: outlines_regression_aligned_comparative/incontext_phase1 + - override /interface: ta3 + +interface: + api_endpoint: "https://darpaitm.caci.com" + session_type: soartech + training_session: null + username: "ALIGN-ADM-ComparativeRegression-Llama-3.2-3B-Instruct-SoarTech-MatchingChars" +adm: + instance: + precision: half + sampler: + _target_: outlines.samplers.GreedySampler + model_name: meta-llama/Llama-3.2-3B-Instruct + inference_kwargs: + distribution_matching: cumulative_kde + kde_norm: priornorm + priornorm_factor: 0.5 + kdma_score_examples: true + num_samples: 1 + predict_outcomes: false + generator_batch_size: 5 + incontext: + most_similar_first: false + sort_actions: true + normalization: null + number: 4 + leave_one_out_strategy: null + method: matching_characters + +force_determinism: true +align_to_target: true +save_last_unstructured_state_per_scenario: true + +hydra: + run: + dir: 'phase1_eval_live/ALIGN-ADM-ComparativeRegression-Llama-3.2-3B-Instruct-SoarTech-MatchingChars/${now:%Y-%m-%d__%H-%M-%S}' diff --git 
a/pyproject.toml b/pyproject.toml index 4bc45059..c8cd4a38 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "align-system" -version = "0.5.4" +version = "0.5.5" description = "" authors = ["David Joy <10147749+dmjoy@users.noreply.github.com>"] readme = "README.md"