Merge branch 'main' into mp/aggregation-benchmarks
micpst committed Sep 24, 2024
2 parents f351b29 + 61e82b7 commit 0193e4b
Showing 26 changed files with 440 additions and 67 deletions.
62 changes: 56 additions & 6 deletions extra/prompt_tuning/README.md
@@ -2,31 +2,81 @@

This folder contains scripts for prompt tuning and evaluation. The prompts (programs) used in dbally are:

-- `FILTERING_ASSESSOR` - assesses whether a question requires filtering.
+- `FilteringAssessor` - assesses whether a question requires filtering.
+- `AggregationAssessor` - assesses whether a question requires aggregation.

All evaluations are run on a dev split of the [BIRD](https://bird-bench.github.io/) dataset. For now, one configuration is available to run the suite against the `superhero` database.

## Usage

### Train new prompts

Tune the `filtering-assessor` prompt on the baseline signature using the [COPRO](https://dspy-docs.vercel.app/docs/deep-dive/teleprompter/signature-optimizer#how-copro-works) optimizer on the `superhero` database with `gpt-3.5-turbo`:

```bash
python train.py prompt/type=filtering-assessor prompt/signature=baseline prompt/program=predict
```

Switch the optimizer to [MIPRO](https://dspy-docs.vercel.app/docs/cheatsheet#mipro):

```bash
python train.py prompt/type=filtering-assessor prompt/signature=baseline prompt/program=predict optimizer=mipro
```

Train multiple prompts:

```bash
python train.py --multirun \
prompt/type=filtering-assessor \
prompt/signature=baseline \
prompt/program=predict,cot
```

Tweak the optimizer parameters to explore different results:

```bash
python train.py \
optimizer=copro \
optimizer.params.breadth=2 \
optimizer.params.depth=3 \
optimizer.params.init_temperature=1.0
```

### Evaluate prompts

Run the evaluation of the filtering assessor baseline on the `superhero` database with `gpt-3.5-turbo`:

```bash
-python evaluate.py program=filtering-assessor-baseline
+python evaluate.py prompt/type=filtering-assessor prompt/signature=baseline prompt/program=predict
```

-Test multiple programs:
+Test multiple prompts:

```bash
python evaluate.py --multirun \
prompt/type=filtering-assessor \
prompt/signature=baseline \
prompt/program=predict,cot
```

```bash
-python evaluate.py --multirun program=filtering-assessor-baseline,filtering-assessor-cot
+python evaluate.py --multirun \
+prompt/type=aggregation-assessor \
+prompt/signature=baseline \
+prompt/program=predict,cot
```

Compare prompt performance on multiple LLMs:

```bash
-python evaluate.py --multirun program=filtering-assessor-baseline llm=gpt-3.5-turbo,claude-3.5-sonnet
+python evaluate.py --multirun \
+prompt/type=filtering-assessor \
+prompt/signature=baseline \
+prompt/program=predict \
+llm=gpt-3.5-turbo,claude-3.5-sonnet
```

-### Log to Neptune
+#### Log to Neptune

Before running the evaluation with Neptune, configure the following environment variables:

2 changes: 1 addition & 1 deletion extra/prompt_tuning/config/data/superhero.yaml
@@ -1,4 +1,4 @@
path: "micpst/bird-iql"
path: "deepsense-ai/bird-iql"
split: "dev"
db_ids: ["superhero"]
difficulties: ["simple", "moderate", "challenging"]
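For context, these fields drive dataset selection in the loaders. A minimal sketch of how such a config could be consumed, assuming the dataset is hosted on HuggingFace and exposes `db_id` and `difficulty` columns (both assumptions; the actual logic lives in `IQLGenerationDataLoader`):

```python
# Hypothetical loader logic; the column names "db_id" and "difficulty"
# and the use of HuggingFace `datasets` are assumptions, not repo code.
from datasets import load_dataset


def load_examples(path: str, split: str, db_ids: list, difficulties: list) -> list:
    dataset = load_dataset(path, split=split)
    return [
        row
        for row in dataset
        if row["db_id"] in db_ids and row["difficulty"] in difficulties
    ]


examples = load_examples(
    path="deepsense-ai/bird-iql",
    split="dev",
    db_ids=["superhero"],
    difficulties=["simple", "moderate", "challenging"],
)
```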
@@ -1,7 +1,8 @@
defaults:
  - data: superhero
  - llm: gpt-3.5-turbo
-  - program: filtering-assessor-baseline
+  - prompt: prompt
  - _self_

num_threads: 32
neptune: False
6 changes: 6 additions & 0 deletions extra/prompt_tuning/config/optimizer/copro.yaml
@@ -0,0 +1,6 @@
name: COPRO
params:
  breadth: 4
  depth: 15
  init_temperature: 1.5
compile:
9 changes: 9 additions & 0 deletions extra/prompt_tuning/config/optimizer/mipro.yaml
@@ -0,0 +1,9 @@
name: MIPRO
params:
  num_candidates: 3
  init_temperature: 1.4

compile:
  max_bootstrapped_demos: 3
  max_labeled_demos: 0
  num_trials: 10
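As the `train.py` diff below shows, `params` feed the teleprompter constructor while `compile` is unpacked into the `compile()` call (COPRO's empty `compile:` therefore contributes nothing, via `config.optimizer.compile or {}`). A minimal sketch of that flow for the COPRO config above, with a placeholder metric, program, and trainset:

```python
import dspy
import dspy.teleprompt


def accuracy(gold, pred, trace=None) -> bool:
    # Placeholder dspy-style metric; stands in for the repo's real metrics.
    return gold.decision == pred.decision


# Placeholders for PROGRAMS[program_name](signature) and the loaded dataset.
program = dspy.Predict("question -> decision")
trainset = [dspy.Example(question="Do we need a filter?", decision="true").with_inputs("question")]

# "params" from copro.yaml feed the constructor...
optimizer = dspy.teleprompt.COPRO(metric=accuracy, breadth=4, depth=15, init_temperature=1.5)

# ...and "compile:" would feed compile(); it is empty for COPRO.
compiled = optimizer.compile(
    student=program,
    trainset=trainset,
    eval_kwargs={"num_threads": 32, "display_progress": True},
)
```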

This file was deleted.

This file was deleted.

1 change: 1 addition & 0 deletions extra/prompt_tuning/config/prompt/program/cot.yaml
@@ -0,0 +1 @@
id: CoT
1 change: 1 addition & 0 deletions extra/prompt_tuning/config/prompt/program/coth.yaml
@@ -0,0 +1 @@
id: CoTH
1 change: 1 addition & 0 deletions extra/prompt_tuning/config/prompt/program/predict.yaml
@@ -0,0 +1 @@
id: Predict
8 changes: 8 additions & 0 deletions extra/prompt_tuning/config/prompt/prompt.yaml
@@ -0,0 +1,8 @@
defaults:
  - type: filtering-assessor
  - signature: baseline
  - program: predict
  - _self_

num_threads: 32
neptune: False
1 change: 1 addition & 0 deletions extra/prompt_tuning/config/prompt/signature/baseline.yaml
@@ -0,0 +1 @@
id: Baseline
1 change: 1 addition & 0 deletions extra/prompt_tuning/config/prompt/signature/optimized.yaml
@@ -0,0 +1 @@
id: Optimized
@@ -0,0 +1 @@
id: AggregationAssessor
@@ -0,0 +1 @@
id: FilteringAssessor
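These `id` values are functional: as the `evaluate.py` and `train.py` diffs below show, the type id is concatenated with the signature and program ids to build registry keys. A short illustration of that composition:

```python
# Mirrors the registry-key construction used in evaluate.py and train.py.
prompt = {
    "type": {"id": "FilteringAssessor"},   # config/prompt/type
    "signature": {"id": "Baseline"},       # config/prompt/signature
    "program": {"id": "CoT"},              # config/prompt/program
}

signature_name = f"{prompt['type']['id']}{prompt['signature']['id']}"
program_name = f"{prompt['type']['id']}{prompt['program']['id']}"

print(signature_name)  # FilteringAssessorBaseline -> key into SIGNATURES
print(program_name)    # FilteringAssessorCoT      -> key into PROGRAMS
```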
8 changes: 8 additions & 0 deletions extra/prompt_tuning/config/train.yaml
@@ -0,0 +1,8 @@
defaults:
  - data: superhero
  - llm: gpt-3.5-turbo
  - prompt: prompt
  - optimizer: copro
  - _self_

num_threads: 32
43 changes: 15 additions & 28 deletions extra/prompt_tuning/evaluate.py
@@ -1,6 +1,5 @@
import asyncio
import logging
-from enum import Enum
from pathlib import Path

import dspy
@@ -9,45 +8,32 @@
from dspy.evaluate import Evaluate
from neptune.utils import stringify_unsupported
from omegaconf import DictConfig
-from tuning.loaders import IQLGenerationDataLoader
-from tuning.metrics import filtering_assess_acc
+from tuning import DATALOADERS, METRICS
from tuning.programs import PROGRAMS
+from tuning.signatures import SIGNATURES
from tuning.utils import save, serialize_results

logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("anthropic").setLevel(logging.ERROR)
log = logging.getLogger(__name__)


-class EvaluationType(Enum):
-    """
-    Enum representing the evaluation type.
-    """
-
-    FILTERING_ASSESSOR = "FILTERING_ASSESSOR"
-
-
-EVALUATION_DATALOADERS = {
-    EvaluationType.FILTERING_ASSESSOR.value: IQLGenerationDataLoader,
-}
-
-EVALUATION_METRICS = {
-    EvaluationType.FILTERING_ASSESSOR.value: filtering_assess_acc,
-}


async def evaluate(config: DictConfig) -> None:
    """
    Function running evaluation for all datasets and evaluation tasks defined in hydra config.
    Args:
        config: Hydra configuration.
    """
log.info("Starting evaluation: %s", config.program.name)
signature_name = f"{config.prompt.type.id}{config.prompt.signature.id}"
program_name = f"{config.prompt.type.id}{config.prompt.program.id}"

log.info("Starting evaluation: %s(%s) program", program_name, signature_name)

-    dataloader = EVALUATION_DATALOADERS[config.program.type](config)
-    metric = EVALUATION_METRICS[config.program.type]
-    program = PROGRAMS[config.program.name]()
+    dataloader = DATALOADERS[config.prompt.type.id](config)
+    metric = METRICS[config.prompt.type.id]
+    signature = SIGNATURES[signature_name]
+    program = PROGRAMS[program_name](signature)

    dataset = await dataloader.load()

@@ -57,7 +43,7 @@ async def evaluate(config: DictConfig) -> None:
    evaluator = Evaluate(
        devset=dataset,
        metric=metric,
-        num_threads=32,
+        num_threads=config.num_threads,
        display_progress=True,
        return_outputs=True,
    )
@@ -75,8 +61,9 @@ async def evaluate(config: DictConfig) -> None:
    run = neptune.init_run()
    run["sys/tags"].add(
        [
-            config.program.type,
-            config.program.name,
+            config.prompt.type.id,
+            config.prompt.signature.id,
+            config.prompt.program.id,
            *config.data.db_ids,
            *config.data.difficulties,
        ]
@@ -86,7 +73,7 @@ async def evaluate(config: DictConfig) -> None:
run["evaluation/results.json"].upload(results_file.as_posix())


@hydra.main(config_path="config", config_name="config", version_base="3.2")
@hydra.main(config_path="config", config_name="evaluate", version_base="3.2")
def main(config: DictConfig) -> None:
    """
    Function running evaluation for all datasets and evaluation tasks defined in hydra config.
72 changes: 72 additions & 0 deletions extra/prompt_tuning/train.py
@@ -0,0 +1,72 @@
import asyncio
import logging
from pathlib import Path

import dspy
import dspy.teleprompt
import hydra
from omegaconf import DictConfig
from tuning import DATALOADERS, METRICS
from tuning.programs import PROGRAMS
from tuning.signatures import SIGNATURES

logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("anthropic").setLevel(logging.ERROR)
log = logging.getLogger(__name__)


async def train(config: DictConfig) -> None:
    """
    Function running training for all datasets and training tasks defined in hydra config.
    Args:
        config: Hydra configuration.
    """
    signature_name = f"{config.prompt.type.id}{config.prompt.signature.id}"
    program_name = f"{config.prompt.type.id}{config.prompt.program.id}"

    log.info("Starting training: %s(%s) program with %s optimizer", program_name, signature_name, config.optimizer.name)

    dataloader = DATALOADERS[config.prompt.type.id](config)
    metric = METRICS[config.prompt.type.id]
    signature = SIGNATURES[signature_name]
    program = PROGRAMS[program_name](signature)

    dataset = await dataloader.load()

    lm = dspy.__dict__[config.llm.provider](model=config.llm.model_name)
    dspy.settings.configure(lm=lm)

    optimizer = dspy.teleprompt.__dict__[config.optimizer.name](metric=metric, **config.optimizer.params)
    compiled_program = optimizer.compile(
        student=program,
        trainset=dataset,
        eval_kwargs={
            "num_threads": config.num_threads,
            "display_progress": True,
        },
        **(config.optimizer.compile or {}),
    )

log.info("Training finished. Saving compiled program...")

output_dir = Path(hydra.core.hydra_config.HydraConfig.get().runtime.output_dir)
program_file = output_dir / f"{program.__class__.__name__}Optimized.json"
compiled_program.save(program_file)

log.info("Compiled program saved under directory: %s", output_dir)


@hydra.main(config_path="config", config_name="train", version_base="3.2")
def main(config: DictConfig) -> None:
    """
    Function running training for all datasets and training tasks defined in hydra config.
    Args:
        config: Hydra configuration.
    """
    asyncio.run(train(config))


if __name__ == "__main__":
main() # pylint: disable=no-value-for-parameter
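A compiled program saved this way can later be reloaded through dspy's standard save/load mechanism; a sketch under the assumption that the program class is a regular `dspy.Module` (the class and path below are illustrative, not taken from this diff):

```python
import dspy


class FilteringAssessorCoT(dspy.Module):
    # Illustrative stand-in for the repo's program class.
    def __init__(self):
        super().__init__()
        self.decide = dspy.ChainOfThought("question -> decision")

    def forward(self, question: str) -> dspy.Prediction:
        return self.decide(question=question)


program = FilteringAssessorCoT()
# Path is illustrative; train.py saves f"{program.__class__.__name__}Optimized.json".
program.load("outputs/FilteringAssessorCoTOptimized.json")
```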
24 changes: 24 additions & 0 deletions extra/prompt_tuning/tuning/__init__.py
@@ -0,0 +1,24 @@
from enum import Enum

from .loaders import IQLGenerationDataLoader
from .metrics import aggregation_assess_acc, filtering_assess_acc


class ProgramType(Enum):
    """
    Program types.
    """

    FILTERING_ASSESSOR = "FilteringAssessor"
    AGGREGATION_ASSESSOR = "AggregationAssessor"


DATALOADERS = {
    ProgramType.FILTERING_ASSESSOR.value: IQLGenerationDataLoader,
    ProgramType.AGGREGATION_ASSESSOR.value: IQLGenerationDataLoader,
}

METRICS = {
    ProgramType.FILTERING_ASSESSOR.value: filtering_assess_acc,
    ProgramType.AGGREGATION_ASSESSOR.value: aggregation_assess_acc,
}
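Both scripts resolve their collaborators through these registries keyed by `config.prompt.type.id`; a minimal usage sketch (the inline config literal is illustrative):

```python
from omegaconf import OmegaConf

from tuning import DATALOADERS, METRICS

config = OmegaConf.create({"prompt": {"type": {"id": "FilteringAssessor"}}})

dataloader = DATALOADERS[config.prompt.type.id](config)  # -> IQLGenerationDataLoader
metric = METRICS[config.prompt.type.id]                  # -> filtering_assess_acc
```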
5 changes: 3 additions & 2 deletions extra/prompt_tuning/tuning/loaders.py
@@ -1,16 +1,17 @@
from abc import ABC, abstractmethod
-from typing import Dict, Iterable, List
+from typing import Iterable, List

import dspy.datasets
from dspy import Example
+from omegaconf import DictConfig


class DataLoader(ABC):
    """
    Data loader.
    """

-    def __init__(self, config: Dict) -> None:
+    def __init__(self, config: DictConfig) -> None:
        self.config = config

    @abstractmethod
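Both scripts `await dataloader.load()`, so concrete loaders implement `load` as an async method returning dspy examples. A hypothetical minimal subclass (the field names and in-memory data are illustrative):

```python
from typing import List

import dspy


class ToyDataLoader(DataLoader):
    """Hypothetical loader that builds dspy Examples from an in-memory list."""

    async def load(self) -> List[dspy.Example]:
        rows = [{"question": "List all superheroes with blue eyes", "filtering": True}]
        return [
            dspy.Example(question=row["question"], filtering=row["filtering"]).with_inputs("question")
            for row in rows
        ]
```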
4 changes: 2 additions & 2 deletions extra/prompt_tuning/tuning/metrics/__init__.py
@@ -1,3 +1,3 @@
-from .iql import filtering_assess_acc
+from .iql import aggregation_assess_acc, filtering_assess_acc

-__all__ = ["filtering_assess_acc"]
+__all__ = ["aggregation_assess_acc", "filtering_assess_acc"]