diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md new file mode 100644 index 00000000..4e2853c5 --- /dev/null +++ b/examples/quickstart/README.md @@ -0,0 +1,47 @@ +# Semantic Router Quickstart + +> ⚠️ This is an initial skeleton for the 10-minute quickstart workflow. Content will be expanded in follow-up tasks. + +## Goal +Provide a single command path (`make quickstart`) that prepares the router, runs a small evaluation, and surfaces a concise report so that new users can validate the system within 10 minutes. + +## Structure +- `quickstart.sh` – orchestrates dependency checks, model downloads, and service startup. +- `quick-eval.sh` – executes a minimal benchmark run and captures results. +- `config-quickstart.yaml` – opinionated defaults for running the router locally. +- `sample-data/` – trimmed datasets used for fast evaluation. +- `templates/` – report and config templates shared by quickstart scripts. + +## Next Steps +1. Teach the benchmark loader to honor `QUICKSTART_SAMPLE_ROOT` so local JSONL slices are used offline. +2. Add a Makefile target (`quickstart`) that chains router bootstrap and quick evaluation. +3. Create CI smoke tests that run the 10-minute flow with the trimmed datasets. + +## Quick Evaluation +Run the standalone evaluator once the router is healthy. A typical flow looks like: + +```bash +./examples/quickstart/quickstart.sh & # starts router in the background (to stop it: fg, then Ctrl+C) +./examples/quickstart/quick-eval.sh --dataset mmlu --samples 5 --mode router +``` + +The evaluation script will place raw artifacts under `examples/quickstart/results/YYYYMMDD_HHMMSS/raw` and derive: +- `quickstart-summary.csv` – compact metrics table for spreadsheets or dashboards. +- `quickstart-report.md` – Markdown summary suitable for PRs or runbooks. + +Key flags: +- `--mode router|vllm|both` to toggle which side runs. +- `--samples` to tune runtime vs. statistical confidence. +- `--output-dir` for custom destinations (defaults to timestamped folder). 
+- Most settings can also be set through environment variables (`QUICKSTART_SAMPLES`, `QUICKSTART_MODE`, `QUICKSTART_ROUTER_MODELS`, `QUICKSTART_VLLM_MODELS`, `QUICKSTART_SEED`, `ROUTER_ENDPOINT`, `VLLM_ENDPOINT`); `--dataset` and `--output-dir` are flag-only. + +## Local Sample Data +The `sample-data/` directory now includes trimmed JSONL slices for quick runs: +- `mmlu-sample.jsonl` – 10 multi-category academic questions. +- `arc-sample.jsonl` – 10 middle-school science questions with ARC-style options. + +Each record follows the same schema that the benchmark loader expects (`question_id`, `category`, `question`, `options`, `answer`, optional `cot_content`). Sizes stay under 10 KB per file so the quickstart remains lightweight. + +**Integration hook**: upcoming work will extend `bench/vllm_semantic_router_bench` to read from these JSONL files whenever `QUICKSTART_SAMPLE_ROOT` is set (falling back to Hugging Face datasets otherwise). Keep the files committed and deterministic so that the automated 10-minute flow can depend on them once the loader change lands. + + diff --git a/examples/quickstart/config-quickstart.yaml b/examples/quickstart/config-quickstart.yaml new file mode 100644 index 00000000..9d7552d7 --- /dev/null +++ b/examples/quickstart/config-quickstart.yaml @@ -0,0 +1,90 @@ +# Quickstart configuration tuned for a single-node developer setup. +# Keeps routing options minimal while remaining compatible with the default assets +# shipped by `make download-models`. 
+ +bert_model: + model_id: sentence-transformers/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + +semantic_cache: + enabled: false + backend_type: "memory" + +prompt_guard: + enabled: false + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + threshold: 0.6 + use_cpu: true + use_modernbert: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + +vllm_endpoints: + - name: "local-vllm" + address: "127.0.0.1" + port: 8000 + models: + - "openai/gpt-oss-20b" + weight: 1 + +model_config: + "openai/gpt-oss-20b": + preferred_endpoints: ["local-vllm"] + reasoning_family: "gpt-oss" + pii_policy: + allow_by_default: true + +categories: + - name: general + system_prompt: "You are a helpful and knowledgeable assistant. Provide concise, accurate answers." + model_scores: + - model: openai/gpt-oss-20b + score: 0.7 + use_reasoning: false + + - name: reasoning + system_prompt: "You explain your reasoning with clear numbered steps before giving a final answer." + model_scores: + - model: openai/gpt-oss-20b + score: 0.6 + use_reasoning: true + + - name: safety + system_prompt: "You prioritize safe completions and refuse harmful requests." 
+ model_scores: + - model: openai/gpt-oss-20b + score: 0.5 + use_reasoning: false + +default_model: openai/gpt-oss-20b + +reasoning_families: + gpt-oss: # NOTE(review): gpt-oss picks reasoning depth via the reasoning_effort request field, not a boolean chat-template kwarg - confirm against the router's supported family types + type: "reasoning_effort" + parameter: "reasoning_effort" + +api: + batch_classification: + metrics: + enabled: false + +# Tool auto-selection is available but disabled for quickstart. +tools: + enabled: false + top_k: 3 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true diff --git a/examples/quickstart/quick-eval.sh b/examples/quickstart/quick-eval.sh new file mode 100755 index 00000000..274e6d92 --- /dev/null +++ b/examples/quickstart/quick-eval.sh @@ -0,0 +1,292 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +RESULTS_ROOT="${SCRIPT_DIR}/results" +DEFAULT_DATASET="mmlu" +DEFAULT_SAMPLES=${QUICKSTART_SAMPLES:-5} +DEFAULT_MODE="${QUICKSTART_MODE:-router}" +DEFAULT_ROUTER_MODELS="${QUICKSTART_ROUTER_MODELS:-auto}" +DEFAULT_VLLM_MODELS="${QUICKSTART_VLLM_MODELS:-openai/gpt-oss-20b}" +DEFAULT_ROUTER_ENDPOINT="${ROUTER_ENDPOINT:-http://127.0.0.1:8801/v1}" +DEFAULT_VLLM_ENDPOINT="${VLLM_ENDPOINT:-http://127.0.0.1:8000/v1}" +DEFAULT_SEED=${QUICKSTART_SEED:-42} +REQUIRED_COMMANDS=() +REQUIRED_PY_MODULES=(numpy pandas datasets openai) + +PYTHON_BIN="${PYTHON_BIN:-}" +if [[ -z "$PYTHON_BIN" ]]; then + if [[ -n "${VIRTUAL_ENV:-}" && -x "${VIRTUAL_ENV}/bin/python3" ]]; then + PYTHON_BIN="${VIRTUAL_ENV}/bin/python3" + elif [[ -n "${VIRTUAL_ENV:-}" && -x "${VIRTUAL_ENV}/bin/python" ]]; then + PYTHON_BIN="${VIRTUAL_ENV}/bin/python" + elif command -v python3 >/dev/null 2>&1; then + PYTHON_BIN="$(command -v python3)" + elif command -v python >/dev/null 2>&1; then + PYTHON_BIN="$(command -v python)" + else + echo '[ERROR] No python interpreter found in PATH or VIRTUAL_ENV' >&2 + exit 1 + fi +fi + +usage() { + cat <<'USAGE' +Usage: quick-eval.sh [OPTIONS] + +Options: + --dataset NAME Dataset 
identifier (default: mmlu) + --samples N Samples per category (default: 5 or $QUICKSTART_SAMPLES) + --mode MODE router|vllm|both (default: router or $QUICKSTART_MODE) + --router-endpoint URL Router endpoint (default: env ROUTER_ENDPOINT or http://127.0.0.1:8801/v1) + --vllm-endpoint URL vLLM endpoint (default: env VLLM_ENDPOINT or http://127.0.0.1:8000/v1) + --router-models LIST Space-separated router models (default: auto) + --vllm-models LIST Space-separated vLLM models (default: openai/gpt-oss-20b) + --output-dir DIR Directory to store run artifacts (default: examples/quickstart/results/YYYYMMDD_HHMMSS) + --seed N Random seed (default: 42 or $QUICKSTART_SEED) + --help Show this help message + +Environment overrides: + QUICKSTART_SAMPLES, QUICKSTART_MODE, QUICKSTART_ROUTER_MODELS, + QUICKSTART_VLLM_MODELS, QUICKSTART_SEED, ROUTER_ENDPOINT, VLLM_ENDPOINT. + +The script launches the benchmark module with quickstart-friendly defaults and +emits both CSV and Markdown summaries for the generated results. 
+USAGE +} + +log() { + local level="$1" + shift + printf '[%s] %s\n' "$level" "$*" +} + +die() { + log "ERROR" "$*" + exit 1 +} + +parse_args() { + DATASET="$DEFAULT_DATASET" + SAMPLES="$DEFAULT_SAMPLES" + MODE="$DEFAULT_MODE" + ROUTER_ENDPOINT="$DEFAULT_ROUTER_ENDPOINT" + VLLM_ENDPOINT="$DEFAULT_VLLM_ENDPOINT" + ROUTER_MODELS=("$DEFAULT_ROUTER_MODELS") + VLLM_MODELS=("$DEFAULT_VLLM_MODELS") + OUTPUT_DIR="" + SEED="$DEFAULT_SEED" + + while [[ $# -gt 0 ]]; do + case "$1" in + --dataset) + DATASET="$2" + shift 2 + ;; + --samples) + SAMPLES="$2" + shift 2 + ;; + --mode) + MODE="$2" + shift 2 + ;; + --router-endpoint) + ROUTER_ENDPOINT="$2" + shift 2 + ;; + --vllm-endpoint) + VLLM_ENDPOINT="$2" + shift 2 + ;; + --router-models) + shift + ROUTER_MODELS=() + while [[ $# -gt 0 ]] && [[ $1 != --* ]]; do + ROUTER_MODELS+=("$1") + shift + done + ;; + --vllm-models) + shift + VLLM_MODELS=() + while [[ $# -gt 0 ]] && [[ $1 != --* ]]; do + VLLM_MODELS+=("$1") + shift + done + ;; + --output-dir) + OUTPUT_DIR="$2" + shift 2 + ;; + --seed) + SEED="$2" + shift 2 + ;; + --help|-h) + usage + exit 0 + ;; + *) + die "Unknown option: $1" + ;; + esac + done + + case "$MODE" in + router|vllm|both) ;; + *) die "Invalid --mode '$MODE' (expected router|vllm|both)" ;; + esac + + if [[ -z "$OUTPUT_DIR" ]]; then + local timestamp + timestamp="$(date +%Y%m%d_%H%M%S)" + OUTPUT_DIR="${RESULTS_ROOT}/${timestamp}" + fi +} + +require_commands() { + local missing=() + for cmd in "${REQUIRED_COMMANDS[@]}"; do + if ! command -v "$cmd" >/dev/null 2>&1; then + missing+=("$cmd") + fi + done + if [[ ${#missing[@]} -gt 0 ]]; then + die "Missing required commands: ${missing[*]}" + fi +} + +check_python_modules() { + if ! 
"$PYTHON_BIN" - "${REQUIRED_PY_MODULES[@]}" <<'PY'; then +import importlib +import sys +missing = [] +for name in sys.argv[1:]: + try: + importlib.import_module(name) + except Exception: + missing.append(name) +if missing: + raise SystemExit("Missing Python modules: " + ", ".join(missing)) +PY + die "Missing Python modules. Run 'pip install -r bench/requirements.txt'" + fi +} + +prepare_dirs() { + mkdir -p "$RESULTS_ROOT" + mkdir -p "$OUTPUT_DIR" +} + +run_benchmark() { + local raw_dir="${OUTPUT_DIR}/raw" + mkdir -p "$raw_dir" + + local cmd=("$PYTHON_BIN" -m vllm_semantic_router_bench.router_reason_bench_multi_dataset + --dataset "$DATASET" + --samples-per-category "$SAMPLES" + --output-dir "$raw_dir" + --seed "$SEED" + --router-endpoint "$ROUTER_ENDPOINT" + ) + + if [[ "$MODE" == "router" || "$MODE" == "both" ]]; then + cmd+=(--run-router --router-models "${ROUTER_MODELS[@]}") + fi + + if [[ "$MODE" == "vllm" || "$MODE" == "both" ]]; then + cmd+=(--run-vllm --vllm-endpoint "$VLLM_ENDPOINT" --vllm-models "${VLLM_MODELS[@]}" --vllm-exec-modes NR) + fi + + log "INFO" "Running benchmark: ${cmd[*]}" + ( + cd "$ROOT_DIR" + PYTHONPATH="$ROOT_DIR/bench${PYTHONPATH:+:$PYTHONPATH}" \ + ROUTER_ENDPOINT="$ROUTER_ENDPOINT" \ + VLLM_ENDPOINT="$VLLM_ENDPOINT" \ + "${cmd[@]}" + ) || die "Benchmark run failed" + + RAW_DIR="$raw_dir" +} + +collect_summaries() { + mapfile -t SUMMARY_FILES < <(find "$RAW_DIR" -type f -name summary.json -print | sort) + if [[ ${#SUMMARY_FILES[@]} -eq 0 ]]; then + die "No summary.json files produced under $RAW_DIR" + fi + + SUMMARY_CSV="$OUTPUT_DIR/quickstart-summary.csv" + REPORT_MD="$OUTPUT_DIR/quickstart-report.md" + + "$PYTHON_BIN" - "$SUMMARY_CSV" "$REPORT_MD" "${SUMMARY_FILES[@]}" <<'PY' +import json +import sys +from datetime import datetime +from pathlib import Path + +summary_csv = Path(sys.argv[1]) +report_md = Path(sys.argv[2]) +summary_paths = [Path(p) for p in sys.argv[3:]] + +rows = [] +for path in summary_paths: + data = 
json.loads(path.read_text()) + rows.append({ + "dataset": data.get("dataset"), + "model": data.get("model"), + "overall_accuracy": data.get("overall_accuracy"), + "avg_response_time": data.get("avg_response_time"), + "avg_total_tokens": data.get("avg_total_tokens"), + "total_questions": data.get("total_questions"), + "successful_queries": data.get("successful_queries"), + "failed_queries": data.get("failed_queries"), + "source": str(path.relative_to(summary_csv.parent)) + }) + +rows.sort(key=lambda r: (r["dataset"] or "", r["model"] or "")) + +csv_lines = ["dataset,model,overall_accuracy,avg_response_time,avg_total_tokens,total_questions,successful_queries,failed_queries,source"] +for row in rows: + csv_lines.append( + f"{row['dataset']},{row['model']},{row['overall_accuracy']},{row['avg_response_time']},{row['avg_total_tokens']},{row['total_questions']},{row['successful_queries']},{row['failed_queries']},{row['source']}" + ) +summary_csv.write_text("\n".join(csv_lines) + "\n") + +lines = ["# Quickstart Evaluation Report", "", f"Generated: {datetime.utcnow().isoformat()}Z", ""] +lines.append("| Dataset | Model | Accuracy | Avg Latency (s) | Avg Tokens | Samples |") +lines.append("| --- | --- | --- | --- | --- | --- |") +for row in rows: + accuracy = f"{row['overall_accuracy']:.3f}" if isinstance(row['overall_accuracy'], (int, float)) else "N/A" + latency = f"{row['avg_response_time']:.2f}" if isinstance(row['avg_response_time'], (int, float)) else "N/A" + tokens = f"{row['avg_total_tokens']:.1f}" if isinstance(row['avg_total_tokens'], (int, float)) else "N/A" + total = row['total_questions'] if row['total_questions'] is not None else 0 + success = row['successful_queries'] if row['successful_queries'] is not None else 0 + lines.append(f"| {row['dataset']} | {row['model']} | {accuracy} | {latency} | {tokens} | {success}/{total} |") + +lines.append("") +lines.append("## Source Artifacts") +lines.append("") +for row in rows: + lines.append(f"- `{row['source']}`") + 
+report_md.write_text("\n".join(lines) + "\n") +PY + + log "INFO" "Summary CSV: $SUMMARY_CSV" + log "INFO" "Report Markdown: $REPORT_MD" +} + +main() { + parse_args "$@" + require_commands + log "INFO" "Using python interpreter: ${PYTHON_BIN}" + check_python_modules + prepare_dirs + run_benchmark + collect_summaries +} + +main "$@" diff --git a/examples/quickstart/quickstart.sh b/examples/quickstart/quickstart.sh new file mode 100755 index 00000000..dec978ce --- /dev/null +++ b/examples/quickstart/quickstart.sh @@ -0,0 +1,192 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +CONFIG_FILE="${SCRIPT_DIR}/config-quickstart.yaml" +ROUTER_BIN="${ROOT_DIR}/bin/router" +ROUTER_LOG_DIR="${SCRIPT_DIR}/logs" +ROUTER_LOG_FILE="${ROUTER_LOG_DIR}/router.log" +HEALTH_URL="${QUICKSTART_HEALTH_URL:-http://127.0.0.1:8801/health}" +REQUIRED_COMMANDS=(make go cargo rustc python3 curl) +SKIP_DOWNLOAD=false +SKIP_BUILD=false +SKIP_START=false +ROUTER_HEALTH_TIMEOUT=${QUICKSTART_HEALTH_TIMEOUT:-60} + +usage() { + cat <<'USAGE' +Usage: quickstart.sh [OPTIONS] + +Options: + --skip-download Do not run `make download-models` (expects assets present) + --skip-build Do not run `make build` + --skip-start Skip starting the router (useful for smoke builds) + -h, --help Show this help message + +Environment overrides: + QUICKSTART_HEALTH_URL Health probe URL (default http://127.0.0.1:8801/health) + QUICKSTART_HEALTH_TIMEOUT Seconds to wait for router health (default 60) +USAGE +} + +log() { + local level="$1" + shift + printf '[%s] %s\n' "$level" "$*" +} + +die() { + log "ERROR" "$*" + exit 1 +} + +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --skip-download) + SKIP_DOWNLOAD=true + shift + ;; + --skip-build) + SKIP_BUILD=true + shift + ;; + --skip-start) + SKIP_START=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + die "Unknown option: $1" + ;; + esac + done +} + 
+require_commands() { + local missing=() + for cmd in "${REQUIRED_COMMANDS[@]}"; do + if ! command -v "$cmd" >/dev/null 2>&1; then + missing+=("$cmd") + fi + done + + if [[ ${#missing[@]} -gt 0 ]]; then + log "ERROR" "Missing required commands: ${missing[*]}" + log "ERROR" "Install the missing tooling and re-run quickstart." + exit 1 + fi +} + +ensure_config_ready() { + if [[ ! -f "$CONFIG_FILE" ]]; then + die "Expected config at $CONFIG_FILE" + fi + + if grep -q "TODO" "$CONFIG_FILE"; then + log "WARN" "Config file contains TODO markers; router bootstrap may fail until populated." + fi +} + +run_make_target() { + local target="$1" + shift || true + log "INFO" "Running make $target" + (cd "$ROOT_DIR" && make "$target" "$@") || die "make $target failed" +} + +build_assets() { + if [[ "$SKIP_DOWNLOAD" != true ]]; then + run_make_target download-models + else + log "INFO" "Skipping model download as requested" + fi + + if [[ "$SKIP_BUILD" != true ]]; then + run_make_target build + else + log "INFO" "Skipping build as requested" + fi +} + +wait_for_health() { + local elapsed=0 + while (( elapsed < ROUTER_HEALTH_TIMEOUT )); do + if curl -fsS --max-time 2 "$HEALTH_URL" >/dev/null 2>&1; then + log "INFO" "Router is healthy at $HEALTH_URL" + return 0 + fi + sleep 1 + ((elapsed++)) + done + + log "ERROR" "Router failed health check within ${ROUTER_HEALTH_TIMEOUT}s" + if [[ -f "$ROUTER_LOG_FILE" ]]; then + log "INFO" "Tail of router log:" + tail -n 40 "$ROUTER_LOG_FILE" + fi + return 1 +} + +cleanup_router() { + if [[ -n "${ROUTER_PID:-}" ]] && kill -0 "$ROUTER_PID" >/dev/null 2>&1; then + log "INFO" "Stopping router (PID $ROUTER_PID)" + kill "$ROUTER_PID" >/dev/null 2>&1 || true + wait "$ROUTER_PID" 2>/dev/null || true + fi +} + +start_router() { + ensure_config_ready + mkdir -p "$ROUTER_LOG_DIR" + + if [[ ! 
-x "$ROUTER_BIN" ]]; then + die "Router binary not found at $ROUTER_BIN; run with --skip-start to build only" + fi + + log "INFO" "Launching router with $CONFIG_FILE" + local ld_path="${ROOT_DIR}/candle-binding/target/release" + ( + cd "$ROOT_DIR" || exit 1 + export LD_LIBRARY_PATH="$ld_path${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" + export DYLD_LIBRARY_PATH="$ld_path${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}" + exec "$ROUTER_BIN" -config="$CONFIG_FILE" + ) >"$ROUTER_LOG_FILE" 2>&1 & + + ROUTER_PID=$! + if [[ -z "${ROUTER_PID}" ]]; then + die "Failed to spawn router" + fi + log "INFO" "Router PID: $ROUTER_PID (logs: $ROUTER_LOG_FILE)" + + if ! wait_for_health; then + cleanup_router + die "Router did not become healthy" + fi + + log "INFO" "Router started successfully. Press Ctrl+C to stop." + wait "$ROUTER_PID" +} + +main() { + parse_args "$@" + require_commands + build_assets + + if [[ "$SKIP_START" == true ]]; then + log "INFO" "Skipping router startup; quickstart build complete" + exit 0 + fi + + trap 'cleanup_router; exit 130' INT + trap 'cleanup_router; exit 143' TERM + trap 'cleanup_router' EXIT + + start_router +} + +main "$@" diff --git a/examples/quickstart/sample-data/README.md b/examples/quickstart/sample-data/README.md new file mode 100644 index 00000000..94963d2b --- /dev/null +++ b/examples/quickstart/sample-data/README.md @@ -0,0 +1,24 @@ +# Sample Data + +Trimmed datasets for the quickstart flow live here. They mirror the schema consumed by +`bench/vllm_semantic_router_bench` so we can swap them in for the heavy upstream corpora. + +## Available slices +- `mmlu-sample.jsonl` – 10 representative questions spanning multiple disciplines. +- `arc-sample.jsonl` – 10 ARC-style science questions with multiple-choice options. + +Each line is a JSON object with the following fields: +- `question_id`: stable identifier used in reports. +- `category`: benchmark category or sub-domain. +- `question`: user prompt text. 
+- `options`: either a list (MMLU) or mapping (ARC) of answer choices. +- `answer`: canonical correct answer (letter). +- `cot_content` *(optional)*: short reasoning snippet for CoT prompts. + +Files intentionally stay under 20 KB so they can ship with the repository. + +## Integration plan +`quick-eval.sh` will soon surface a `QUICKSTART_SAMPLE_ROOT` override. When present, +the benchmark runner will load these JSONL files instead of fetching full datasets +from Hugging Face. Until that wire-up lands, the JSONL fixtures exist here to unblock +concurrent work (documentation, UI previews, and benchmarking harness updates). diff --git a/examples/quickstart/sample-data/arc-sample.jsonl b/examples/quickstart/sample-data/arc-sample.jsonl new file mode 100644 index 00000000..cd94c34b --- /dev/null +++ b/examples/quickstart/sample-data/arc-sample.jsonl @@ -0,0 +1,10 @@ +{"question_id":"dev-arc-001","category":"physical science","question":"Which phase change occurs when a solid turns directly into a gas?","options":{"A":"Condensation","B":"Sublimation","C":"Deposition","D":"Evaporation"},"answer":"B","cot_content":"Solid to gas without liquid is sublimation."} +{"question_id":"dev-arc-002","category":"earth science","question":"What causes day and night on Earth?","options":{"A":"Earth's revolution around the Sun","B":"Earth's rotation on its axis","C":"Moon's gravitational pull","D":"Seasonal tilt"},"answer":"B","cot_content":"Earth spins on its axis every 24 hours creating day/night."} +{"question_id":"dev-arc-003","category":"life science","question":"Which structure transports water in vascular plants?","options":{"A":"Phloem","B":"Xylem","C":"Stomata","D":"Chloroplast"},"answer":"B","cot_content":"Xylem carries water and minerals upward."} +{"question_id":"dev-arc-004","category":"physical science","question":"An object with a mass of 5 kg is pushed with a force of 20 N. 
What is its acceleration?","options":{"A":"1 m/s^2","B":"2 m/s^2","C":"4 m/s^2","D":"5 m/s^2"},"answer":"C","cot_content":"F=ma => a = 20/5 = 4 m/s²."} +{"question_id":"dev-arc-005","category":"life science","question":"Which organ system is responsible for transporting nutrients and oxygen throughout the body?","options":{"A":"Digestive","B":"Circulatory","C":"Respiratory","D":"Nervous"},"answer":"B","cot_content":"Circulatory system moves blood carrying nutrients and oxygen."} +{"question_id":"dev-arc-006","category":"earth science","question":"What is the primary cause of the changing seasons?","options":{"A":"Earth's distance from the Sun","B":"Earth's axial tilt","C":"Solar flares","D":"Ocean currents"},"answer":"B","cot_content":"Tilt causes different hemispheres to receive varying sunlight."} +{"question_id":"dev-arc-007","category":"energy","question":"Which energy transformation occurs in a flashlight when it is switched on?","options":{"A":"Electrical to light","B":"Mechanical to electrical","C":"Chemical to mechanical","D":"Thermal to electrical"},"answer":"A","cot_content":"Battery provides electrical energy converted to light."} +{"question_id":"dev-arc-008","category":"physical science","question":"What force keeps the planets in orbit around the Sun?","options":{"A":"Electromagnetic","B":"Frictional","C":"Gravitational","D":"Nuclear"},"answer":"C","cot_content":"Gravity provides the centripetal force for orbits."} +{"question_id":"dev-arc-009","category":"life science","question":"Which part of the cell contains genetic material?","options":{"A":"Mitochondria","B":"Cell membrane","C":"Nucleus","D":"Ribosome"},"answer":"C","cot_content":"DNA is stored inside the nucleus."} +{"question_id":"dev-arc-010","category":"earth science","question":"What instrument measures atmospheric pressure?","options":{"A":"Thermometer","B":"Barometer","C":"Hygrometer","D":"Anemometer"},"answer":"B","cot_content":"Barometer readings track air pressure."} diff --git 
a/examples/quickstart/sample-data/mmlu-sample.jsonl b/examples/quickstart/sample-data/mmlu-sample.jsonl new file mode 100644 index 00000000..408cb95f --- /dev/null +++ b/examples/quickstart/sample-data/mmlu-sample.jsonl @@ -0,0 +1,10 @@ +{"question_id":"dev-mmlu-001","category":"business","question":"A firm increases production from 100 to 120 units while total costs rise from $200 to $230. What is the marginal cost per unit?","options":["$1.50","$2.50","$3.00","$15.00"],"answer":"A","cot_content":"Marginal cost = change in total cost / change in quantity = 30 / 20 = 1.5"} +{"question_id":"dev-mmlu-002","category":"biology","question":"Which organelle is primarily responsible for ATP production in eukaryotic cells?","options":["Nucleus","Endoplasmic reticulum","Mitochondrion","Golgi apparatus"],"answer":"C","cot_content":"ATP synthesis occurs in mitochondria via oxidative phosphorylation."} +{"question_id":"dev-mmlu-003","category":"computer science","question":"In Big-O notation, the time complexity of binary search on a sorted array is?","options":["O(n)","O(log n)","O(n log n)","O(1)"],"answer":"B","cot_content":"Binary search halves the search space each step, giving logarithmic complexity."} +{"question_id":"dev-mmlu-004","category":"law","question":"Which principle requires courts to follow precedents set by higher courts?","options":["Mens rea","Stare decisis","Ultra vires","Res ipsa loquitur"],"answer":"B","cot_content":"Stare decisis compels adherence to higher court decisions."} +{"question_id":"dev-mmlu-005","category":"mathematics","question":"Evaluate the derivative of f(x)=3x^2+4x-5.","options":["3x^2","6x+4","3x+4","6x"],"answer":"B","cot_content":"Derivative is 6x + 4."} +{"question_id":"dev-mmlu-006","category":"psychology","question":"Which psychologist is associated with the hierarchy of needs?","options":["Carl Rogers","Sigmund Freud","Abraham Maslow","B.F. 
Skinner"],"answer":"C","cot_content":"Maslow proposed the hierarchy of needs."} +{"question_id":"dev-mmlu-007","category":"economics","question":"If the price elasticity of demand is -2, a 5% price increase results in what approximate change in quantity demanded?","options":["Increase 2.5%","Decrease 10%","Decrease 2.5%","Increase 10%"],"answer":"B","cot_content":"Elasticity * price change = -2 * 5% = -10%."} +{"question_id":"dev-mmlu-008","category":"physics","question":"What is the SI unit of electric resistance?","options":["Tesla","Ohm","Farad","Weber"],"answer":"B","cot_content":"Resistance is measured in ohms."} +{"question_id":"dev-mmlu-009","category":"medicine","question":"Which vitamin deficiency causes scurvy?","options":["Vitamin A","Vitamin B12","Vitamin C","Vitamin D"],"answer":"C","cot_content":"Vitamin C deficiency leads to scurvy."} +{"question_id":"dev-mmlu-010","category":"history","question":"The Marshall Plan aimed to rebuild which region after WWII?","options":["South America","Eastern Asia","Western Europe","Middle East"],"answer":"C","cot_content":"The Marshall Plan targeted Western European recovery."} diff --git a/examples/quickstart/templates/README.md b/examples/quickstart/templates/README.md new file mode 100644 index 00000000..a6dcbbb7 --- /dev/null +++ b/examples/quickstart/templates/README.md @@ -0,0 +1,7 @@ +# Quickstart Templates + +This folder will include Markdown or YAML templates consumed by quickstart scripts, such as: +- `report-template.md` for summarizing benchmark output. +- `config-template.yaml` containing reusable sections shared across quickstart configs. + +Add concrete templates once reporting format is finalized.