19 changes: 19 additions & 0 deletions .env
@@ -0,0 +1,19 @@
# postgres env for local mlflow tracking server
# you don't need to set these if mlflow is already running somewhere else
# (in that case, you don't need postgres at all)
POSTGRES_USER=mlflow
POSTGRES_PASSWORD=mlflow
POSTGRES_DB=mlflow
POSTGRES_HOST=postgres
POSTGRES_PORT=5432

# jupyter config
JUPYTER_TOKEN=changeme
# suppress warning about git not being available in the jupyter container
GIT_PYTHON_REFRESH=quiet
# mlflow tracking URI as seen from inside the containers -- adjust this if you have a remote tracking server
MLFLOW_TRACKING_URI=http://mlflow:8080
# adjust this if you have a remote artifact store (e.g. gs, s3)
MLFLOW_ARTIFACT_ROOT=/mlruns
# this path is relative to where jupyter is started
MODEL_SECRETS_PATH=./config/secrets.toml
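
These values are read by the containers at startup. A minimal sketch (not part of the repo) of how they surface inside the jupyter container; mlflow picks up `MLFLOW_TRACKING_URI` from the environment on its own, and the exact way modelplane consumes `MODEL_SECRETS_PATH` may differ:

```python
import os
import mlflow

# mlflow honors MLFLOW_TRACKING_URI automatically; setting it explicitly here
# just makes the wiring visible
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://mlflow:8080"))

# check that the secrets file is where MODEL_SECRETS_PATH points
secrets_path = os.environ.get("MODEL_SECRETS_PATH", "./config/secrets.toml")
print(mlflow.get_tracking_uri(), os.path.exists(secrets_path))
```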
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
mlruns/
secrets.toml
.ipynb_checkpoints
.python-version
data/
*.pyc
16 changes: 16 additions & 0 deletions Dockerfile.jupyter
@@ -0,0 +1,16 @@
FROM python:3.12-slim

ENV PATH="/root/.local/bin:$PATH"
# working directory for the notebook server; docker-compose mounts ./flightdeck at /app/flightdeck
WORKDIR /app

# install poetry via pipx, then the dependencies alone (--no-root) so the
# dependency layer is cached independently of source changes
RUN apt-get update && apt-get install -y pipx && \
    pipx install poetry
COPY pyproject.toml poetry.lock README.md ./
RUN poetry install --no-interaction --no-ansi --no-root
COPY src/ ./src/
# print the virtualenv path at build time (handy when debugging the image)
RUN echo $(poetry env info --path)
# second install now installs the project itself from src/
RUN poetry install --no-interaction --no-ansi

EXPOSE 8888
CMD ["poetry", "run", "jupyter", "notebook", "--ip", "0.0.0.0", "--allow-root", "--no-browser", "--ServerApp.root_dir=/app/flightdeck"]
5 changes: 5 additions & 0 deletions Dockerfile.mlflow
@@ -0,0 +1,5 @@
FROM ghcr.io/mlflow/mlflow:latest

# This isn't available on the base image, even though the
# default tracking server uses postgres.
RUN pip install psycopg2-binary
53 changes: 53 additions & 0 deletions README.md
@@ -0,0 +1,53 @@
# ModelPlane

Develop new evaluators / annotators.

## Get Started

You must have Docker installed on your system. The
provided docker-compose.yaml file will start up:

* mlflow tracking server + postgres
* jupyter

1. Clone the repository:
   ```bash
   git clone https://github.com/mlcommons/modelplane.git
   cd modelplane
   ```
1. Environment:
   1. Adjust the .env file as needed. The committed .env /
      docker-compose.yaml bring up mlflow, postgres, and jupyter, and configure
      mlflow to use local disk for artifact storage.
   1. Set up secrets for accessing SUTs, as needed, in
      `modelplane/flightdeck/config/secrets.toml`. See [modelbench](https://github.com/mlcommons/modelbench) for more details.
   1. Stage your input data in `modelplane/flightdeck/data`. You can get a
      sample input file [here](https://github.com/mlcommons/ailuminate/tree/main).
1. Bring up the services:
   ```bash
   docker compose up -d
   ```
   Or, if you are running mlflow somewhere else, bring up just jupyter with:
   ```bash
   docker compose up -d jupyter
   ```
1. Visit the [Jupyter Server](http://localhost:8888/?token=changeme). The
   token is configured in the .env file. You shouldn't need to enter it
   more than once (until the server is restarted). You can get started with
   the template notebook or create a new one.
1. Monitor runs in MLflow wherever you have it set up. With the default
   local setup, that's http://localhost:8080. A quick way to list runs from a
   notebook is sketched below.
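
   A minimal sketch (not repo code) of that check, run from a notebook cell,
   assuming the experiment name used in the template notebook:
   ```python
   import mlflow

   # use http://localhost:8080 instead when running outside the containers
   mlflow.set_tracking_uri("http://mlflow:8080")
   runs = mlflow.search_runs(experiment_names=["experiment_name"])
   print(runs[["run_id", "status", "start_time"]])
   ```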

## TODO

- [ ] Scoring against ground truth (measurement runner functionality)
- [ ] Support ensemble option
- [ ] Support multiple annotators in single run
- [ ] Confirm this works with cloud storage
- [ ] Add test coverage
- [ ] Support for data via remote DVC repo
- [ ] Template with annotator that's served elsewhere
- [ ] Missing safety runner functionality
- [ ] Automated experiment names
- [ ] `annotate` should add sut_id tag to its runs
- [ ] Better handling of jupyter token
40 changes: 40 additions & 0 deletions docker-compose.yaml
@@ -0,0 +1,40 @@
services:
  postgres:
    image: postgres:17 # TODO: ensure compatibility with our GCP PostgreSQL
    env_file: .env
    volumes:
      - pgdata:/var/lib/postgresql/data
    ports:
      - "5432:5432"

  mlflow:
    build:
      dockerfile: Dockerfile.mlflow
    env_file: .env
    depends_on:
      - postgres
    # backend store URI is assembled from the postgres settings in .env;
    # artifacts are stored locally under /mlruns (mounted below)
    command: >
      mlflow server
      --backend-store-uri postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}
      --default-artifact-root /mlruns
      --host 0.0.0.0
      --port 8080
    ports:
      - "8080:8080"
    volumes:
      - ./mlruns:/mlruns

  jupyter:
    build:
      context: .
      dockerfile: Dockerfile.jupyter
    env_file: .env
    ports:
      - "8888:8888"
    volumes:
      - ./flightdeck:/app/flightdeck
      - ./mlruns:/mlruns

volumes:
  pgdata:
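
The backend store URI above is assembled from the same POSTGRES_* values in .env. A throwaway sketch (not part of the repo) for confirming postgres is reachable with that URI; psycopg2 is only installed in the mlflow image (see Dockerfile.mlflow), and `POSTGRES_HOST=postgres` only resolves inside the compose network:

```python
import os
import psycopg2  # present in the mlflow image; install it yourself elsewhere

# mirrors the --backend-store-uri string the mlflow service is started with
uri = "postgresql://{u}:{p}@{h}:{port}/{db}".format(
    u=os.environ["POSTGRES_USER"],
    p=os.environ["POSTGRES_PASSWORD"],
    h=os.environ["POSTGRES_HOST"],  # "postgres" resolves only on the compose network
    port=os.environ["POSTGRES_PORT"],
    db=os.environ["POSTGRES_DB"],
)
with psycopg2.connect(uri) as conn, conn.cursor() as cur:
    cur.execute("SELECT version()")
    print(cur.fetchone()[0])
```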
195 changes: 195 additions & 0 deletions flightdeck/Annotator Development Template.ipynb
@@ -0,0 +1,195 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "ab195250-6a0f-4176-a09d-3696d911203d",
"metadata": {},
"source": [
"# Annotator/Evaluator Development Template\n",
"\n",
"Use this template to iterate on a new annotator. Adjust the settings as needed."
]
},
{
"cell_type": "markdown",
"id": "3d2d5865-2cd7-4b81-a588-dfec27727643",
"metadata": {},
"source": [
"## Settings and imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f44e837c-05e9-4e62-916d-9884bb47839e",
"metadata": {},
"outputs": [],
"source": [
"from modelplane.runways import responder, annotator"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "414f9c85-d146-4119-854b-e009235aa4c4",
"metadata": {},
"outputs": [],
"source": [
"sut_id = \"llama-3.1-8b-instruct-turbo-together\"\n",
"experiment = \"experiment_name\"\n",
"prompts = \"data/airr_official_1.0_demo_en_us_prompt_set_release_reduced.csv\"\n",
"cache_dir = None\n",
"n_jobs = 4"
]
},
{
"cell_type": "markdown",
"id": "524e0529-4e51-45ae-b2ab-313915881f98",
"metadata": {},
"source": [
"## Define a new annotator"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "89674a6d-b2c5-42a3-9a0c-927101126877",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"from pydantic import BaseModel\n",
"\n",
"from modelgauge.annotator import CompletionAnnotator\n",
"from modelgauge.annotator_registry import ANNOTATORS\n",
"from modelgauge.annotators.llama_guard_annotator import LlamaGuardAnnotation\n",
"from modelgauge.single_turn_prompt_response import TestItem\n",
"from modelgauge.sut import SUTResponse\n",
"from modelgauge.tests.safe_v1 import BaseSafeTestVersion1\n",
"\n",
"class AnnotationRequest(BaseModel):\n",
" test_item: TestItem\n",
" response: SUTResponse\n",
"\n",
"\n",
"class NewAnnotator(CompletionAnnotator[LlamaGuardAnnotation]):\n",
" \"\"\"Simple random implementation for demonstration. Uses LlamaGuardAnnotation\n",
" for consistency with AILuminate benchmarks.\n",
"\n",
" The methods to implement are based on the idea that annotate will \n",
" potentially make a call to another system and the translators are helpful \n",
" in preparing that call and converting the response to the needed annotation.\n",
" \"\"\"\n",
" # Probability of unsafe per category\n",
" PROB_UNSAFE = 0.01\n",
"\n",
" def translate_request(self, test_item: TestItem, response: SUTResponse):\n",
" return AnnotationRequest(test_item=test_item, response=response)\n",
"\n",
" def annotate(self, annotation_request: AnnotationRequest) -> LlamaGuardAnnotation:\n",
" unsafe_categories = []\n",
" for category in BaseSafeTestVersion1.hazards:\n",
" if random.random() < self.PROB_UNSAFE:\n",
" unsafe_categories.append(category)\n",
" return LlamaGuardAnnotation(\n",
" is_safe=not unsafe_categories,\n",
" is_safe_logprob=random.random(),\n",
" violation_categories=unsafe_categories,\n",
" )\n",
"\n",
" def translate_response(self, request: AnnotationRequest, response: LlamaGuardAnnotation) -> LlamaGuardAnnotation:\n",
" return response\n",
"\n",
"\n",
"annotator_id = \"new_annotator\"\n",
"ANNOTATORS.register(NewAnnotator, annotator_id)"
]
},
{
"cell_type": "markdown",
"id": "17760cd3-23fe-4c79-8882-475d8d7096ea",
"metadata": {},
"source": [
"## Run the model\n",
"\n",
"Save this run_id to avoid having to re-run the model later. The results are saved as an artifact in mlflow.\n",
"\n",
"NOTE: you may need to replace http://mlflow:8080 with http://localhost:8080 in the generated mlflow links."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b70d76d5-a3e1-4cc0-aeff-e71b6ff64825",
"metadata": {},
"outputs": [],
"source": [
"run_id = responder.respond(\n",
" sut_id=sut_id,\n",
" experiment=experiment,\n",
" prompts=prompts,\n",
" cache_dir=cache_dir,\n",
" n_jobs=n_jobs,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "740a8a85-c171-4d11-b094-cd617b14b6ed",
"metadata": {},
"source": [
"## Annotate the model\n",
"\n",
"The evaluation will be available in mlflow, and the artifact will be saved with that mlflow run for inspection of the output jsonl.\n",
"\n",
"NOTE: you may need to replace http://mlflow:8080 with http://localhost:8080 in the generated mlflow links."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06632c4d-90bd-4c2d-9c36-84e59dd8f190",
"metadata": {},
"outputs": [],
"source": [
"annotator.annotate(\n",
" annotator_id=annotator_id,\n",
" experiment=experiment,\n",
" response_run_id=run_id,\n",
" cache_dir=cache_dir,\n",
" n_jobs=n_jobs,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92b2b2c5-8f59-42c3-8415-9a6c4ae39291",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
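
To inspect what a run logged (the response or annotation jsonl mentioned in the notebook), something along these lines works against the tracking server. It is a sketch, not repo code, and does not assume particular artifact file names:

```python
import json
import pathlib
import mlflow

mlflow.set_tracking_uri("http://localhost:8080")  # http://mlflow:8080 inside the compose network

run_id = "..."  # a run id returned by responder.respond(...) or copied from the mlflow UI

# download everything the run logged and peek at the keys of any jsonl files
local_dir = mlflow.artifacts.download_artifacts(run_id=run_id)
for path in pathlib.Path(local_dir).rglob("*.jsonl"):
    with path.open() as f:
        print(path.name, "->", list(json.loads(next(f)).keys()))
```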