diff --git a/.changeset/0000-export-command-change.md b/.changeset/0000-export-command-change.md new file mode 100644 index 00000000..12562b8b --- /dev/null +++ b/.changeset/0000-export-command-change.md @@ -0,0 +1,5 @@ +--- +"evalite": major +--- + +Export command now uses the storage specified in the config and auto-runs if empty. diff --git a/.changeset/0000-in-memory-default.md b/.changeset/0000-in-memory-default.md new file mode 100644 index 00000000..d5bb9e51 --- /dev/null +++ b/.changeset/0000-in-memory-default.md @@ -0,0 +1,5 @@ +--- +"evalite": major +--- + +Changed default storage to in-memory. SQLite still available via config. diff --git a/.changeset/0000-remove-streaming.md b/.changeset/0000-remove-streaming.md new file mode 100644 index 00000000..1c2835a6 --- /dev/null +++ b/.changeset/0000-remove-streaming.md @@ -0,0 +1,5 @@ +--- +"evalite": minor +--- + +Removed streaming text support from tasks. Process streams before returning from task() (e.g., await result.text for AI SDK). diff --git a/.changeset/0234-auto-dotenv-support.md b/.changeset/0234-auto-dotenv-support.md new file mode 100644 index 00000000..97589b1b --- /dev/null +++ b/.changeset/0234-auto-dotenv-support.md @@ -0,0 +1,5 @@ +--- +"evalite": minor +--- + +Support .env files by default via dotenv/config. Environment variables from .env files are now automatically loaded without any configuration needed. Users no longer need to manually add `setupFiles: ["dotenv/config"]` to their evalite.config.ts. diff --git a/.changeset/long-olives-give.md b/.changeset/long-olives-give.md new file mode 100644 index 00000000..6fe11fd6 --- /dev/null +++ b/.changeset/long-olives-give.md @@ -0,0 +1,5 @@ +--- +"evalite": major +--- + +Moved storage API from evals -> suites, results -> evals. This will likely cause issues for existing SQLite databases when released, so will need migration. 
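For the streaming change noted above, the migration is small: resolve the stream inside `task()` rather than returning it. A rough sketch with the Vercel AI SDK, adapting the example from the removed streams guide (`gpt-4o-mini` is only a placeholder model):

```ts
import { evalite } from "evalite";
import { streamText } from "ai";
import { openai } from "@ai-sdk/openai";

evalite("My Eval", {
  data: [{ input: "What is the capital of France?", expected: "Paris" }],
  task: async (input) => {
    const result = await streamText({
      model: openai("gpt-4o-mini"),
      system: "Answer the question concisely.",
      prompt: input,
    });

    // Previously the task could return result.textStream directly;
    // now resolve the stream before returning.
    return await result.text;
  },
  scorers: [],
});
```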
diff --git a/.changeset/real-phones-join.md b/.changeset/real-phones-join.md new file mode 100644 index 00000000..c6b27aab --- /dev/null +++ b/.changeset/real-phones-join.md @@ -0,0 +1,5 @@ +--- +"evalite-ui": patch +--- + +Added an overlay to the backdrop when viewing a trace diff --git a/.changeset/wet-clocks-camp.md b/.changeset/wet-clocks-camp.md new file mode 100644 index 00000000..b833f033 --- /dev/null +++ b/.changeset/wet-clocks-camp.md @@ -0,0 +1,5 @@ +--- +"evalite-ui": minor +--- + +Add the ability to search and filter evals in the UI diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000..ee4abc02 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,21 @@ +# http://editorconfig.org + +root = true + +[*] +charset = utf-8 +end_of_line = lf +indent_style = space +insert_final_newline = true +trim_trailing_whitespace = true +indent_size = 2 + +[*.md] +trim_trailing_whitespace = false + +[*.py] +indent_size = 4 + +[Makefile] +indent_style = tabs +indent_size = 2 diff --git a/apps/evalite-docs/astro.config.mts b/apps/evalite-docs/astro.config.mts index 5a80e872..8bc8f654 100644 --- a/apps/evalite-docs/astro.config.mts +++ b/apps/evalite-docs/astro.config.mts @@ -3,6 +3,27 @@ import starlight from "@astrojs/starlight"; // https://astro.build/config export default defineConfig({ + redirects: { + "/what-is-evalite": "/guides/what-is-evalite", + "/quickstart": "/guides/quickstart", + "/guides/environment-variables": "/guides/configuration", + "/guides/skipping": "/tips/only-run-certain-evals", + "/guides/customizing-the-ui": "/tips/customize-the-ui", + "/guides/ci": "/tips/run-evals-on-ci-cd", + "/guides/running-programmatically": "/tips/run-evals-programmatically", + "/examples/ai-sdk": "/tips/vercel-ai-sdk", + "/guides/traces": "/tips/adding-traces", + "/guides/variant-comparison": "/tips/comparing-different-approaches", + "/guides/multi-modal": "/tips/images-and-media", + "/guides/cli": "/tips/watch-mode", + "/tips/skip-evals-during-development": "/tips/only-run-certain-evals", + "/tips/track-individual-llm-calls": "/tips/adding-traces", + "/tips/integrate-with-ai-sdk": "/tips/vercel-ai-sdk", + "/tips/work-with-images-and-media": "/tips/images-and-media", + "/tips/use-watch-mode-effectively": "/tips/watch-mode", + "/tips/set-score-thresholds": "/tips/score-thresholds", + "/tips/run-specific-eval-files": "/tips/only-run-certain-evals", + }, integrations: [ starlight({ title: "Evalite", @@ -72,75 +93,112 @@ export default defineConfig({ }, sidebar: [ { - label: "Getting Started", + label: "Guides", items: [ { label: "What Is Evalite?", - slug: "what-is-evalite", + slug: "guides/what-is-evalite", }, { label: "Quickstart", - slug: "quickstart", + slug: "guides/quickstart", + }, + { + label: "Scorers", + slug: "guides/scorers", + }, + { + label: "Configuration", + slug: "guides/configuration", }, ], }, { - label: "Guides", + label: "Tips", items: [ { - label: "Environment Variables", - slug: "guides/environment-variables", + label: "Only Run Certain Evals", + slug: "tips/only-run-certain-evals", }, { - label: "Scorers", - slug: "guides/scorers", + label: "Customize The UI", + slug: "tips/customize-the-ui", }, { - label: "Traces", - slug: "guides/traces", + label: "CI/CD", + slug: "tips/run-evals-on-ci-cd", }, { - label: "A/B Testing", - slug: "guides/variant-comparison", + label: "Adding Traces", + slug: "tips/adding-traces", }, { - label: "Multi-Modal", - slug: "guides/multi-modal", + label: "Vercel AI SDK", + slug: "tips/vercel-ai-sdk", }, { - label: "Configuration", - 
slug: "guides/configuration", + label: "Comparing Different Approaches", + slug: "tips/comparing-different-approaches", + }, + { + label: "Run Evals Programmatically", + slug: "tips/run-evals-programmatically", + }, + { + label: "Images And Media", + slug: "tips/images-and-media", + }, + { + label: "Run Same Eval Multiple Times", + slug: "tips/run-same-eval-multiple-times", + }, + { + label: "Watch Mode", + slug: "tips/watch-mode", }, { - label: "Streams", - slug: "guides/streams", + label: "Score Thresholds", + slug: "tips/score-thresholds", + }, + ], + }, + { + label: "Reference", + items: [ + { + label: "evalite()", + slug: "api/evalite", }, { label: "CLI", - slug: "guides/cli", + slug: "api/cli", }, { - label: "Running Programmatically", - slug: "guides/running-programmatically", + label: "defineConfig()", + slug: "api/define-config", }, { - label: "CI/CD", - slug: "guides/ci", + label: "createScorer()", + slug: "api/create-scorer", }, { - label: "Skipping Evals", - slug: "guides/skipping", + label: "EvaliteFile", + slug: "api/evalite-file", + }, + { + label: "Traces", + slug: "api/traces", }, { - label: "Customizing The UI", - slug: "guides/customizing-the-ui", + label: "runEvalite()", + slug: "api/run-evalite", + }, + { + label: "Storage", + slug: "api/storage", }, ], }, - { - label: "Integrations", - items: [{ label: "Vercel AI SDK", slug: "examples/ai-sdk" }], - }, ], }), ], diff --git a/apps/evalite-docs/src/content/docs/api/cli.mdx b/apps/evalite-docs/src/content/docs/api/cli.mdx new file mode 100644 index 00000000..24a51109 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/api/cli.mdx @@ -0,0 +1,177 @@ +--- +title: CLI +--- + +The `evalite` command-line interface for running evaluations. + +## Commands + +### `evalite` (default) + +Alias for `evalite run`. Runs evals once and exits. + +```bash +evalite +``` + +### `evalite run` + +Run evals once and exit. Default command when no subcommand specified. + +```bash +evalite run +evalite run path/to/eval.eval.ts +``` + +**Positional Arguments:** + +- `[path]` (optional) - Path filter to run specific eval files. If not provided, runs all `.eval.ts` files. + +**Flags:** + +- `--threshold ` - Fails the process if the score is below threshold. Specified as 0-100. Default is 100. +- `--outputPath ` - Path to write test results in JSON format after evaluation completes. +- `--hideTable` - Hides the detailed table output in the CLI. + +**Examples:** + +```bash +# Run all evals +evalite run + +# Run specific eval file +evalite run example.eval.ts + +# Fail if score drops below 80% +evalite run --threshold 80 + +# Export results to JSON +evalite run --outputPath results.json + +# Hide detailed table +evalite run --hideTable +``` + +### `evalite watch` + +Watch evals for file changes and re-run automatically. Starts the UI server at `http://localhost:3006`. + +```bash +evalite watch +evalite watch path/to/eval.eval.ts +``` + +**Positional Arguments:** + +- `[path]` (optional) - Path filter to watch specific eval files. + +**Flags:** + +- `--threshold ` - Fails the process if the score is below threshold. Specified as 0-100. Default is 100. +- `--hideTable` - Hides the detailed table output in the CLI. + +**Note:** `--outputPath` is not supported in watch mode. 
+ +**Examples:** + +```bash +# Watch all evals +evalite watch + +# Watch specific eval +evalite watch example.eval.ts + +# Watch with hidden table (useful for debugging with console.log) +evalite watch --hideTable +``` + +### `evalite serve` + +Run evals once and serve the UI without watching for changes. Useful when evals take a long time to run. + +```bash +evalite serve +evalite serve path/to/eval.eval.ts +``` + +**Positional Arguments:** + +- `[path]` (optional) - Path filter to run specific eval files. + +**Flags:** + +- `--threshold ` - Fails the process if the score is below threshold. Specified as 0-100. Default is 100. +- `--outputPath ` - Path to write test results in JSON format after evaluation completes. +- `--hideTable` - Hides the detailed table output in the CLI. + +**Examples:** + +```bash +# Run once and serve UI +evalite serve + +# Serve specific eval results +evalite serve example.eval.ts +``` + +### `evalite export` + +Export static UI bundle for CI artifacts. Exports a standalone HTML bundle that can be viewed offline or uploaded as a CI artifact. + +```bash +evalite export +``` + +**Flags:** + +- `--output ` - Output directory for static export. Default: `./evalite-export` +- `--runId ` - Specific run ID to export. Default: latest run + +**Examples:** + +```bash +# Export latest run to default directory +evalite export + +# Export to custom directory +evalite export --output ./my-export + +# Export specific run +evalite export --runId 123 + +# Export and specify both options +evalite export --output ./artifacts --runId 42 +``` + +**Note:** If no runs exist in storage, `evalite export` will automatically run evaluations first. + +## Global Flags + +All commands support these flags: + +- `--help` - Show help for the command +- `--version` - Show version information + +## Configuration + +CLI behavior can be configured via [evalite.config.ts](/api/define-config): + +```typescript +// evalite.config.ts +import { defineConfig } from "evalite/config"; + +export default defineConfig({ + scoreThreshold: 80, // Default threshold for all runs + hideTable: true, // Hide table by default + server: { + port: 3006, // UI server port + }, +}); +``` + +## See Also + +- [runEvalite()](/api/run-evalite) - Run evals programmatically from Node.js +- [defineConfig()](/api/define-config) - Configure Evalite behavior +- [Watch Mode](/tips/watch-mode) - Tips for using watch mode effectively +- [CI/CD](/tips/run-evals-on-ci-cd) - Running evals in continuous integration diff --git a/apps/evalite-docs/src/content/docs/api/create-scorer.mdx b/apps/evalite-docs/src/content/docs/api/create-scorer.mdx new file mode 100644 index 00000000..143429a4 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/api/create-scorer.mdx @@ -0,0 +1,219 @@ +--- +title: createScorer() +--- + +Create a reusable scorer function for evaluating LLM outputs. + +## Signature + +```typescript +createScorer(opts: { + name: string; + description?: string; + scorer: (input: { + input: TInput; + output: TOutput; + expected?: TExpected; + }) => Promise | number | { score: number; metadata?: unknown }; +}): Scorer +``` + +## Parameters + +### `opts.name` + +**Type:** `string` (required) + +The name of the scorer. Displayed in the UI and test output. + +```typescript +createScorer({ + name: "Exact Match", + scorer: ({ output, expected }) => (output === expected ? 1 : 0), +}); +``` + +### `opts.description` + +**Type:** `string` (optional) + +A description of what the scorer evaluates. Helps document scoring logic. 
+ +```typescript +createScorer({ + name: "Length Check", + description: "Checks if output is at least 10 characters", + scorer: ({ output }) => (output.length >= 10 ? 1 : 0), +}); +``` + +### `opts.scorer` + +**Type:** `(input: { input, output, expected }) => number | { score: number; metadata?: unknown }` + +The scoring function. Receives input, output, and expected values. Must return: + +- A number between 0 and 1, or +- An object with `score` (0-1) and optional `metadata` + +```typescript +createScorer({ + name: "Word Count", + scorer: ({ output }) => { + const wordCount = output.split(" ").length; + return { + score: wordCount >= 10 ? 1 : 0, + metadata: { wordCount }, + }; + }, +}); +``` + +## Return Value + +Returns a `Scorer` function that can be used in the `scorers` array of [evalite()](/api/evalite). + +## Usage + +### Basic Scorer + +```typescript +import { createScorer, evalite } from "evalite"; + +const exactMatch = createScorer({ + name: "Exact Match", + scorer: ({ output, expected }) => { + return output === expected ? 1 : 0; + }, +}); + +evalite("My Eval", { + data: [{ input: "Hello", expected: "Hi" }], + task: async (input) => callLLM(input), + scorers: [exactMatch], +}); +``` + +### Scorer with Metadata + +```typescript +const lengthChecker = createScorer({ + name: "Length Check", + description: "Validates output length is within acceptable range", + scorer: ({ output }) => { + const length = output.length; + const isValid = length >= 10 && length <= 100; + + return { + score: isValid ? 1 : 0, + metadata: { + length, + minLength: 10, + maxLength: 100, + }, + }; + }, +}); +``` + +### Async Scorer + +Scorers can be async for LLM-based evaluation: + +```typescript +const llmScorer = createScorer({ + name: "LLM Judge", + description: "Uses GPT-4 to evaluate output quality", + scorer: async ({ output, expected }) => { + const response = await openai.chat.completions.create({ + model: "gpt-4", + messages: [ + { + role: "system", + content: "Rate the output quality from 0 to 1.", + }, + { + role: "user", + content: `Output: ${output}\nExpected: ${expected}`, + }, + ], + }); + + const score = parseFloat(response.choices[0].message.content); + return score; + }, +}); +``` + +### Reusable Scorers + +Create a library of scorers to reuse across evals: + +```typescript +// scorers.ts +import { createScorer } from "evalite"; + +export const hasEmoji = createScorer({ + name: "Has Emoji", + scorer: ({ output }) => (/\p{Emoji}/u.test(output) ? 1 : 0), +}); + +export const containsKeyword = (keyword: string) => + createScorer({ + name: `Contains "${keyword}"`, + scorer: ({ output }) => (output.includes(keyword) ? 1 : 0), + }); + +// my-eval.eval.ts +import { evalite } from "evalite"; +import { hasEmoji, containsKeyword } from "./scorers"; + +evalite("My Eval", { + data: [{ input: "Hello" }], + task: async (input) => callLLM(input), + scorers: [hasEmoji, containsKeyword("greeting")], +}); +``` + +## Inline Scorers + +You can also define scorers inline without `createScorer()`: + +```typescript +evalite("My Eval", { + data: [{ input: "Hello", expected: "Hi" }], + task: async (input) => callLLM(input), + scorers: [ + // Inline scorer (same shape as createScorer opts) + { + name: "Exact Match", + scorer: ({ output, expected }) => (output === expected ? 1 : 0), + }, + ], +}); +``` + +Both approaches are equivalent. Use `createScorer()` when you want to reuse the scorer across multiple evals. 
+ +## Using Third-Party Scorers + +Evalite is compatible with scorers from [autoevals](https://github.com/braintrustdata/autoevals): + +```typescript +import { evalite } from "evalite"; +import { Levenshtein, Factuality } from "autoevals"; + +evalite("My Eval", { + data: [{ input: "Hello", expected: "Hi there!" }], + task: async (input) => callLLM(input), + scorers: [ + Levenshtein, // String similarity + Factuality, // Fact checking + ], +}); +``` + +## See Also + +- [Scorers Guide](/guides/scorers) - Overview of scoring strategies +- [evalite()](/api/evalite) - Using scorers in evals diff --git a/apps/evalite-docs/src/content/docs/api/define-config.mdx b/apps/evalite-docs/src/content/docs/api/define-config.mdx new file mode 100644 index 00000000..99b3f5a0 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/api/define-config.mdx @@ -0,0 +1,219 @@ +--- +title: defineConfig() +--- + +Type-safe helper for defining Evalite configuration in `evalite.config.ts`. + +## Signature + +```typescript +defineConfig(config: { + storage?: () => Evalite.Storage | Promise; + server?: { + port?: number; + }; + scoreThreshold?: number; + hideTable?: boolean; + testTimeout?: number; + maxConcurrency?: number; + trialCount?: number; + setupFiles?: string[]; +}): Evalite.Config +``` + +## Usage + +Create an `evalite.config.ts` file in your project root: + +```typescript +// evalite.config.ts +import { defineConfig } from "evalite/config"; + +export default defineConfig({ + testTimeout: 60000, + maxConcurrency: 100, + scoreThreshold: 80, +}); +``` + +## Options + +### `storage` + +**Type:** `() => Evalite.Storage | Promise` + +**Default:** In-memory storage (data lost after process exits) + +Factory function to create a custom storage backend. Use `createSqliteStorage()` for persistent storage. + +```typescript +import { defineConfig } from "evalite/config"; +import { createSqliteStorage } from "evalite/sqlite-storage"; + +export default defineConfig({ + storage: () => createSqliteStorage("./custom.db"), +}); +``` + +See [Storage](/api/storage) for more details. + +### `server.port` + +**Type:** `number` + +**Default:** `3006` + +Port for the Evalite UI server. + +```typescript +export default defineConfig({ + server: { + port: 8080, + }, +}); +``` + +### `scoreThreshold` + +**Type:** `number` (0-100) + +**Default:** `100` + +Minimum average score threshold. If the average score falls below this threshold, the process will exit with code 1. + +```typescript +export default defineConfig({ + scoreThreshold: 80, // Fail if average score < 80 +}); +``` + +Useful for CI/CD pipelines where you want to fail the build if evals don't meet a quality threshold. + +### `hideTable` + +**Type:** `boolean` + +**Default:** `false` + +Hide the detailed results table in terminal output. Keeps the score summary but removes the detailed table. + +```typescript +export default defineConfig({ + hideTable: true, +}); +``` + +Useful when debugging with `console.log` to see logs more clearly. + +### `testTimeout` + +**Type:** `number` (milliseconds) + +**Default:** `30000` (30 seconds) + +Maximum time a test can run before timing out. + +```typescript +export default defineConfig({ + testTimeout: 60000, // 60 seconds +}); +``` + +### `maxConcurrency` + +**Type:** `number` + +**Default:** `5` + +Maximum number of test cases to run in parallel. + +```typescript +export default defineConfig({ + maxConcurrency: 100, // Run up to 100 tests in parallel +}); +``` + +Useful for optimizing performance and managing API rate limits. 
+ +### `trialCount` + +**Type:** `number` + +**Default:** `1` + +Number of times to run each test case. Useful for measuring variance in non-deterministic evaluations. + +```typescript +export default defineConfig({ + trialCount: 3, // Run each test case 3 times +}); +``` + +Can also be set per-eval in the [evalite()](/api/evalite) function. + +### `setupFiles` + +**Type:** `string[]` + +**Default:** `[]` + +Array of file paths to run before tests. Useful for loading custom environment setup. + +```typescript +export default defineConfig({ + setupFiles: ["./custom-setup.ts"], +}); +``` + +**Note:** `.env` files are loaded automatically via `dotenv/config` - no need to configure them here. + +## Complete Example + +```typescript +// evalite.config.ts +import { defineConfig } from "evalite/config"; +import { createSqliteStorage } from "evalite/sqlite-storage"; + +export default defineConfig({ + // Persistent storage + storage: () => createSqliteStorage("./evalite.db"), + + // Server configuration + server: { + port: 3006, + }, + + // Quality threshold + scoreThreshold: 75, + + // Test execution + testTimeout: 60000, + maxConcurrency: 50, + trialCount: 1, + + // UI preferences + hideTable: false, + + // Setup + setupFiles: ["./test-setup.ts"], +}); +``` + +## Supported File Names + +Evalite will look for configuration in these files (in order): + +- `evalite.config.ts` +- `evalite.config.mts` +- `evalite.config.js` +- `evalite.config.mjs` + +## Vitest Integration + +Since Evalite is built on Vitest, you can also use `vitest.config.ts` for backward compatibility. However, `evalite.config.ts` is the recommended approach and takes precedence when both files exist. + +## See Also + +- [Configuration Guide](/guides/configuration) - Overview of configuration options +- [Storage](/api/storage) - Custom storage backends +- [CLI](/api/cli) - Command-line flags that override config diff --git a/apps/evalite-docs/src/content/docs/api/evalite-file.mdx b/apps/evalite-docs/src/content/docs/api/evalite-file.mdx new file mode 100644 index 00000000..2cc15a0e --- /dev/null +++ b/apps/evalite-docs/src/content/docs/api/evalite-file.mdx @@ -0,0 +1,246 @@ +--- +title: EvaliteFile +--- + +Utilities for working with images, audio, video, and other media files in evaluations. + +## Overview + +`EvaliteFile` provides methods for referencing files in your evals. Evalite automatically handles file storage and display in the UI. + +## Methods + +### `EvaliteFile.fromPath()` + +Reference a file on disk without loading it into memory. + +**Signature:** + +```typescript +EvaliteFile.fromPath(path: string): Evalite.File +``` + +**Parameters:** + +- `path` - File path relative to your project root or absolute path + +**Returns:** An `Evalite.File` object that can be used in data, task outputs, traces, or columns. + +**Example:** + +```typescript +import { evalite, EvaliteFile } from "evalite"; + +evalite("Image Analysis", { + data: [ + { + input: EvaliteFile.fromPath("./images/cat.jpg"), + expected: "A cat sitting on a couch", + }, + ], + task: async (input) => { + console.log(input.path); // "./images/cat.jpg" + + // Use the file path with your LLM + const response = await analyzeLLM(input.path); + return response; + }, + scorers: [], +}); +``` + +### `EvaliteFile.isEvaliteFile()` + +Check if a value is an `Evalite.File` object. 
+ +**Signature:** + +```typescript +EvaliteFile.isEvaliteFile(value: unknown): value is Evalite.File +``` + +**Example:** + +```typescript +import { EvaliteFile } from "evalite"; + +const file = EvaliteFile.fromPath("./image.jpg"); +console.log(EvaliteFile.isEvaliteFile(file)); // true +console.log(EvaliteFile.isEvaliteFile("./image.jpg")); // false +``` + +## Automatic File Detection + +Evalite automatically detects and handles `Uint8Array` (Buffer) objects without requiring `EvaliteFile`: + +```typescript +import { evalite } from "evalite"; +import { readFileSync } from "fs"; + +evalite("Image Eval", { + data: [ + { + // Evalite automatically handles Buffers + input: readFileSync("./image.jpg"), + expected: readFileSync("./expected.jpg"), + }, + ], + task: async (input) => { + // Return a Buffer - Evalite handles it automatically + return readFileSync("./output.jpg"); + }, + scorers: [], +}); +``` + +When Evalite detects a `Uint8Array`, it: + +1. Saves the file to `./node_modules/.evalite/files/` +2. References the cached file in the UI +3. Displays the file based on its type (image, audio, video, etc.) + +## Usage in Different Contexts + +### In Data (Input/Expected) + +```typescript +evalite("My Eval", { + data: [ + { + input: EvaliteFile.fromPath("./input.jpg"), + expected: EvaliteFile.fromPath("./expected.jpg"), + }, + ], + task: async (input) => { + // ... + }, +}); +``` + +### In Task Output + +```typescript +evalite("My Eval", { + data: [{ input: "Generate an image" }], + task: async (input) => { + const imageBuffer = await generateImage(input); + // Return Buffer or EvaliteFile + return imageBuffer; // Automatically handled + }, +}); +``` + +### In Traces + +```typescript +import { reportTrace } from "evalite/traces"; + +evalite("My Eval", { + data: [{ input: "Hello" }], + task: async (input) => { + const imageInput = readFileSync("./input.jpg"); + + reportTrace({ + input: imageInput, // File in trace + output: "Analysis complete", + }); + + return "Done"; + }, +}); +``` + +### In Columns + +```typescript +evalite("My Eval", { + data: [{ input: "Hello" }], + task: async (input) => { + return "Output"; + }, + columns: () => [ + { + label: "Debug Image", + value: EvaliteFile.fromPath("./debug.jpg"), + }, + ], +}); +``` + +## When to Use fromPath() vs Buffers + +**Use `EvaliteFile.fromPath()` when:** + +- File is already on disk +- Want to avoid loading large files into memory +- Need to reference the file path in your task + +**Use Buffers (automatic detection) when:** + +- File is generated in memory +- File comes from an API response +- Working with base64 or other in-memory formats + +## Complete Example + +```typescript +import { evalite, EvaliteFile } from "evalite"; +import { readFileSync } from "fs"; +import { reportTrace } from "evalite/traces"; + +evalite("Multi-Modal Analysis", { + data: async () => { + return [ + { + // Mix of file references and buffers + input: { + image: EvaliteFile.fromPath("./images/cat.jpg"), + audio: readFileSync("./audio/meow.mp3"), + }, + expected: "A cat meowing", + }, + ]; + }, + task: async (input) => { + // Trace with file + reportTrace({ + input: input.image, + output: "Processing...", + }); + + const result = await analyzeMultiModal(input); + + return result; + }, + columns: ({ output }) => [ + { + label: "Visualization", + value: readFileSync("./viz.png"), + }, + ], + scorers: [ + { + name: "Match", + scorer: ({ output, expected }) => { + return output === expected ? 
1 : 0; + }, + }, + ], +}); +``` + +## File Storage + +All files (whether from `EvaliteFile.fromPath()` or auto-detected Buffers) are stored in: + +``` +./node_modules/.evalite/files/ +``` + +This cache is gitignored by default. Files are referenced by content hash to avoid duplicates. + +## See Also + +- [Images and Media Guide](/tips/images-and-media) - Working with multi-modal data +- [evalite()](/api/evalite) - Main evaluation function +- [Traces](/api/traces) - Adding traces to track nested calls diff --git a/apps/evalite-docs/src/content/docs/api/evalite.mdx b/apps/evalite-docs/src/content/docs/api/evalite.mdx new file mode 100644 index 00000000..4fa8a819 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/api/evalite.mdx @@ -0,0 +1,230 @@ +--- +title: evalite() +--- + +The main function for defining evaluations in `.eval.ts` files. + +## Signature + +```typescript +evalite( + evalName: string, + opts: { + data: Array<{ input: TInput; expected?: TExpected; only?: boolean }> + | (() => Promise>); + task: (input: TInput) => Promise | TOutput; + scorers?: Array | ScorerOpts>; + columns?: (opts: { input: TInput; output: TOutput; expected?: TExpected }) => + Promise> | + Array<{ label: string; value: unknown }>; + trialCount?: number; + } +): void +``` + +## Parameters + +### `evalName` + +**Type:** `string` + +The name of your evaluation. This appears in the UI and test output. + +```typescript +evalite("Greeting Generator", { + // ... +}); +``` + +### `opts.data` + +**Type:** `Array<{ input: TInput; expected?: TExpected; only?: boolean }>` or `() => Promise>` + +The dataset for your evaluation. Each item becomes a separate test case. + +Can be an array or an async function that returns an array. + +```typescript +// Static array +evalite("My Eval", { + data: [ + { input: "Hello", expected: "Hi there!" }, + { input: "Goodbye", expected: "See you later!" }, + ], + // ... +}); + +// Async function +evalite("My Eval", { + data: async () => { + const dataset = await fetch("/api/dataset").then((r) => r.json()); + return dataset; + }, + // ... +}); +``` + +**`only` flag:** Mark specific data points to run exclusively during development: + +```typescript +evalite("My Eval", { + data: [ + { input: "test1", only: true }, // Only this will run + { input: "test2" }, + { input: "test3" }, + ], + // ... +}); +``` + +### `opts.task` + +**Type:** `(input: TInput) => Promise | TOutput` + +The function to test. Receives input from data, returns output to be scored. + +```typescript +evalite("My Eval", { + data: [{ input: "Hello" }], + task: async (input) => { + const response = await openai.chat.completions.create({ + model: "gpt-4", + messages: [{ role: "user", content: input }], + }); + return response.choices[0].message.content; + }, + // ... +}); +``` + +### `opts.scorers` + +**Type:** `Array` (optional) + +Functions that evaluate the output quality. Each scorer returns a score between 0 and 1. + +```typescript +evalite("My Eval", { + data: [{ input: "Hello", expected: "Hi" }], + task: async (input) => callLLM(input), + scorers: [ + // Inline scorer + { + name: "Exact Match", + scorer: ({ output, expected }) => { + return output === expected ? 1 : 0; + }, + }, + // Using createScorer + createScorer({ + name: "Length Check", + scorer: ({ output }) => { + return output.length > 10 ? 1 : 0; + }, + }), + ], +}); +``` + +See [createScorer()](/api/create-scorer) for more details. 
+ +### `opts.columns` + +**Type:** `(opts: { input, output, expected }) => Promise> | Array<{ label, value }>` (optional) + +Custom columns to display in the UI alongside input/output/expected. + +```typescript +evalite("My Eval", { + data: [{ input: "Hello" }], + task: async (input) => callLLM(input), + columns: ({ output }) => [ + { label: "Word Count", value: output.split(" ").length }, + { label: "Has Emoji", value: /\p{Emoji}/u.test(output) }, + ], +}); +``` + +### `opts.trialCount` + +**Type:** `number` (optional, default: `1`) + +Number of times to run each test case. Useful for measuring variance in non-deterministic evaluations. + +```typescript +evalite("My Eval", { + data: [{ input: "Hello" }], + task: async (input) => callLLM(input), + trialCount: 5, // Run each data point 5 times +}); +``` + +Can also be set globally in [defineConfig()](/api/define-config). + +## Methods + +### `evalite.skip()` + +Skip an entire evaluation. + +```typescript +evalite.skip("My Eval", { + data: [{ input: "Hello" }], + task: async (input) => callLLM(input), +}); +``` + +### `evalite.each()` + +Run the same evaluation with different variants (e.g., comparing models or prompts). + +```typescript +evalite.each([ + { name: "gpt-4", input: "gpt-4" }, + { name: "gpt-3.5-turbo", input: "gpt-3.5-turbo" }, +])("Model Comparison", { + data: [{ input: "Hello" }], + task: async (input, model) => { + const response = await openai.chat.completions.create({ + model, + messages: [{ role: "user", content: input }], + }); + return response.choices[0].message.content; + }, +}); +``` + +See [Comparing Different Approaches](/tips/comparing-different-approaches) for more details. + +## Example + +```typescript +// example.eval.ts +import { evalite } from "evalite"; +import { Levenshtein } from "autoevals"; + +evalite("Greeting Generator", { + data: async () => { + return [ + { input: "Hello", expected: "Hi there!" }, + { input: "Good morning", expected: "Good morning to you!" }, + { input: "Howdy", expected: "Howdy partner!" }, + ]; + }, + task: async (input) => { + const response = await openai.chat.completions.create({ + model: "gpt-4", + messages: [ + { + role: "system", + content: "Generate a friendly greeting response.", + }, + { role: "user", content: input }, + ], + }); + return response.choices[0].message.content; + }, + scorers: [Levenshtein], + columns: ({ output }) => [{ label: "Length", value: output.length }], +}); +``` diff --git a/apps/evalite-docs/src/content/docs/api/run-evalite.mdx b/apps/evalite-docs/src/content/docs/api/run-evalite.mdx new file mode 100644 index 00000000..c5d8f3fb --- /dev/null +++ b/apps/evalite-docs/src/content/docs/api/run-evalite.mdx @@ -0,0 +1,298 @@ +--- +title: runEvalite() +--- + +Run evaluations programmatically from Node.js scripts or custom tooling. + +## Signature + +```typescript +runEvalite(opts: { + mode: "run-once-and-exit" | "watch-for-file-changes" | "run-once-and-serve"; + path?: string; + cwd?: string; + scoreThreshold?: number; + outputPath?: string; + hideTable?: boolean; + storage?: Evalite.Storage; +}): Promise +``` + +## Parameters + +### `opts.mode` + +**Type:** `"run-once-and-exit" | "watch-for-file-changes" | "run-once-and-serve"` (required) + +The execution mode for running evals. + +**Modes:** + +- `"run-once-and-exit"` - Run evals once and exit. Ideal for CI/CD pipelines. +- `"watch-for-file-changes"` - Watch for file changes and re-run automatically. Starts the UI server. 
+- `"run-once-and-serve"` - Run evals once and serve the UI without watching for changes. + +```typescript +import { runEvalite } from "evalite/runner"; + +// CI/CD mode +await runEvalite({ + mode: "run-once-and-exit", +}); + +// Development mode with watch +await runEvalite({ + mode: "watch-for-file-changes", +}); + +// Run once and keep UI open +await runEvalite({ + mode: "run-once-and-serve", +}); +``` + +### `opts.path` + +**Type:** `string` (optional) + +Path filter to run specific eval files. If not provided, runs all `.eval.ts` files. + +```typescript +await runEvalite({ + mode: "run-once-and-exit", + path: "my-eval.eval.ts", +}); +``` + +### `opts.cwd` + +**Type:** `string` (optional) + +The working directory to run evals from. Defaults to `process.cwd()`. + +```typescript +await runEvalite({ + mode: "run-once-and-exit", + cwd: "/path/to/my/project", +}); +``` + +### `opts.scoreThreshold` + +**Type:** `number` (optional, 0-100) + +Minimum average score threshold. If the average score falls below this threshold, the process will exit with code 1. + +```typescript +await runEvalite({ + mode: "run-once-and-exit", + scoreThreshold: 80, // Fail if average score < 80 +}); +``` + +Useful for CI/CD pipelines where you want to fail the build if evals don't meet a quality threshold. + +### `opts.outputPath` + +**Type:** `string` (optional) + +Path to write test results in JSON format after evaluation completes. + +```typescript +await runEvalite({ + mode: "run-once-and-exit", + outputPath: "./results.json", +}); +``` + +The exported JSON contains the complete run data including all evals, results, scores, and traces. + +**Note:** Not supported in `watch-for-file-changes` mode. + +### `opts.hideTable` + +**Type:** `boolean` (optional, default: `false`) + +Hide the detailed results table in terminal output. Keeps the score summary but removes the detailed table. + +```typescript +await runEvalite({ + mode: "watch-for-file-changes", + hideTable: true, // Useful for debugging with console.log +}); +``` + +### `opts.storage` + +**Type:** `Evalite.Storage` (optional) + +Custom storage backend instance. If not provided, uses the storage from `evalite.config.ts` or defaults to in-memory storage. + +```typescript +import { runEvalite } from "evalite/runner"; +import { createSqliteStorage } from "evalite/sqlite-storage"; + +await runEvalite({ + mode: "run-once-and-exit", + storage: createSqliteStorage("./custom.db"), +}); +``` + +See [Storage](/api/storage) for more details. 
+ +## Usage Examples + +### Basic CI/CD Script + +```typescript +import { runEvalite } from "evalite/runner"; + +async function runTests() { + try { + await runEvalite({ + mode: "run-once-and-exit", + scoreThreshold: 75, + outputPath: "./results.json", + }); + console.log("All evals passed!"); + } catch (error) { + console.error("Evals failed:", error); + process.exit(1); + } +} + +runTests(); +``` + +### Development Script + +```typescript +import { runEvalite } from "evalite/runner"; + +// Run specific eval in watch mode +await runEvalite({ + mode: "watch-for-file-changes", + path: "chat.eval.ts", + hideTable: true, +}); +``` + +### Custom Storage + +```typescript +import { runEvalite } from "evalite/runner"; +import { createSqliteStorage } from "evalite/sqlite-storage"; + +const storage = createSqliteStorage("./evalite.db"); + +await runEvalite({ + mode: "run-once-and-exit", + storage, +}); +``` + +### Multi-Environment Testing + +```typescript +import { runEvalite } from "evalite/runner"; + +const environments = [ + { name: "staging", url: "https://staging.example.com" }, + { name: "production", url: "https://example.com" }, +]; + +for (const env of environments) { + console.log(`Running evals for ${env.name}...`); + + process.env.API_URL = env.url; + + await runEvalite({ + mode: "run-once-and-exit", + scoreThreshold: 80, + outputPath: `./results-${env.name}.json`, + }); +} +``` + +### Parallel Eval Execution + +```typescript +import { runEvalite } from "evalite/runner"; + +// Run multiple eval sets in parallel +await Promise.all([ + runEvalite({ + mode: "run-once-and-exit", + path: "chat.eval.ts", + }), + runEvalite({ + mode: "run-once-and-exit", + path: "completion.eval.ts", + }), +]); +``` + +## Configuration Priority + +Options merge in this order (highest to lowest priority): + +1. Function arguments (`opts`) +2. Config file (`evalite.config.ts`) +3. Defaults + +Example: + +```typescript +// evalite.config.ts +export default defineConfig({ + scoreThreshold: 70, + hideTable: true, +}); + +// script.ts +await runEvalite({ + mode: "run-once-and-exit", + scoreThreshold: 80, // Overrides config (80 used) + // hideTable not specified, uses config (true) +}); +``` + +## Error Handling + +The function throws an error if: + +- Evals fail to run +- Score threshold is not met +- Invalid options are provided + +```typescript +try { + await runEvalite({ + mode: "run-once-and-exit", + scoreThreshold: 90, + }); +} catch (error) { + console.error("Eval run failed:", error); + // Handle error or exit + process.exit(1); +} +``` + +## Return Value + +Returns a `Promise`. The function completes when: + +- `run-once-and-exit`: All evals finish +- `watch-for-file-changes`: Never (runs indefinitely) +- `run-once-and-serve`: All evals finish, but UI server keeps process alive + +## Deprecated Alias + +`runVitest()` is a deprecated alias for `runEvalite()`. Use `runEvalite()` instead. + +## See Also + +- [CLI](/api/cli) - Command-line interface +- [defineConfig()](/api/define-config) - Configuration file +- [Storage](/api/storage) - Custom storage backends +- [Run Evals Programmatically Guide](/tips/run-evals-programmatically) - More examples diff --git a/apps/evalite-docs/src/content/docs/api/storage.mdx b/apps/evalite-docs/src/content/docs/api/storage.mdx new file mode 100644 index 00000000..1d3b4462 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/api/storage.mdx @@ -0,0 +1,335 @@ +--- +title: Storage +--- + +Storage backends for persisting evaluation results. 
Evalite provides built-in SQLite and in-memory storage, plus a Storage interface for custom implementations. + +## Built-in Storage + +### `createSqliteStorage()` + +Create a SQLite storage backend for persistent storage. + +**Signature:** + +```typescript +createSqliteStorage(dbLocation: string): Promise +``` + +**Parameters:** + +- `dbLocation` - Path to the SQLite database file (e.g., `"./evalite.db"`) + +**Returns:** A Promise that resolves to a `SqliteStorage` instance implementing the `Evalite.Storage` interface. + +**Example:** + +```typescript +import { defineConfig } from "evalite/config"; +import { createSqliteStorage } from "evalite/sqlite-storage"; + +export default defineConfig({ + storage: () => createSqliteStorage("./evalite.db"), +}); +``` + +**Features:** + +- Persistent storage across runs +- Automatic schema management +- History tracking for comparing runs +- Used by default when no storage is configured + +### `createInMemoryStorage()` + +Create an in-memory storage backend. Data is lost when the process exits. + +**Signature:** + +```typescript +createInMemoryStorage(): InMemoryStorage +``` + +**Returns:** An `InMemoryStorage` instance implementing the `Evalite.Storage` interface. + +**Example:** + +```typescript +import { defineConfig } from "evalite/config"; +import { createInMemoryStorage } from "evalite/in-memory-storage"; + +export default defineConfig({ + storage: () => createInMemoryStorage(), +}); +``` + +**Features:** + +- Fast (no I/O operations) +- No persistence +- Useful for testing or ephemeral runs + +## Storage Interface + +The `Evalite.Storage` interface allows you to implement custom storage backends (e.g., PostgreSQL, Turso, cloud storage). + +### Interface Definition + +```typescript +interface Storage { + runs: { + create(opts: CreateOpts): Promise; + getMany(opts?: GetManyOpts): Promise; + }; + + suites: { + create(opts: CreateOpts): Promise; + update(opts: UpdateOpts): Promise; + getMany(opts?: GetManyOpts): Promise; + }; + + evals: { + create(opts: CreateOpts): Promise; + update(opts: UpdateOpts): Promise; + getMany(opts?: GetManyOpts): Promise; + }; + + scores: { + create(opts: CreateOpts): Promise; + getMany(opts?: GetManyOpts): Promise; + }; + + traces: { + create(opts: CreateOpts): Promise; + getMany(opts?: GetManyOpts): Promise; + }; + + close(): Promise; + [Symbol.asyncDispose](): Promise; +} +``` + +### Entity Types + +Storage backends must return these entity types: + +**Run:** + +```typescript +type Run = { + id: number; + runType: "full" | "partial"; + created_at: string; // ISO 8601 timestamp +}; +``` + +**Suite:** + +```typescript +type Suite = { + id: number; + run_id: number; + name: string; + status: "fail" | "success" | "running"; + filepath: string; + duration: number; // milliseconds + created_at: string; + variant_name?: string; + variant_group?: string; +}; +``` + +**Eval:** + +```typescript +type Eval = { + id: number; + suite_id: number; + duration: number; // milliseconds + input: unknown; + output: unknown; + expected?: unknown; + created_at: string; + col_order: number; + status: "fail" | "success" | "running"; + rendered_columns?: unknown; + trial_index?: number | null; +}; +``` + +**Score:** + +```typescript +type Score = { + id: number; + eval_id: number; + name: string; + score: number; // 0-1 + description?: string; + metadata?: unknown; + created_at: string; +}; +``` + +**Trace:** + +```typescript +type Trace = { + id: number; + eval_id: number; + input: unknown; + output: unknown; + usage?: { + inputTokens: 
number; + outputTokens: number; + totalTokens: number; + }; + start: number; // timestamp + end: number; // timestamp + created_at: string; +}; +``` + +## Implementing Custom Storage + +Create a class that implements the `Evalite.Storage` interface: + +```typescript +import type { Evalite } from "evalite/types"; + +export class PostgresStorage implements Evalite.Storage { + constructor(private connectionString: string) {} + + runs = { + async create(opts: Evalite.Storage.Runs.CreateOpts) { + // Insert run into Postgres + // Return Evalite.Storage.Entities.Run + }, + async getMany(opts?: Evalite.Storage.Runs.GetManyOpts) { + // Query runs from Postgres + // Return Evalite.Storage.Entities.Run[] + }, + }; + + suites = { + async create(opts: Evalite.Storage.Suites.CreateOpts) { + // ... + }, + async update(opts: Evalite.Storage.Suites.UpdateOpts) { + // ... + }, + async getMany(opts?: Evalite.Storage.Suites.GetManyOpts) { + // ... + }, + }; + + evals = { + async create(opts: Evalite.Storage.Evals.CreateOpts) { + // ... + }, + async update(opts: Evalite.Storage.Evals.UpdateOpts) { + // ... + }, + async getMany(opts?: Evalite.Storage.Evals.GetManyOpts) { + // ... + }, + }; + + scores = { + async create(opts: Evalite.Storage.Scores.CreateOpts) { + // ... + }, + async getMany(opts?: Evalite.Storage.Scores.GetManyOpts) { + // ... + }, + }; + + traces = { + async create(opts: Evalite.Storage.Traces.CreateOpts) { + // ... + }, + async getMany(opts?: Evalite.Storage.Traces.GetManyOpts) { + // ... + }, + }; + + async close() { + // Close database connection + } + + async [Symbol.asyncDispose]() { + await this.close(); + } +} + +// Factory function +export const createPostgresStorage = ( + connectionString: string +): PostgresStorage => { + return new PostgresStorage(connectionString); +}; +``` + +### Using Custom Storage + +```typescript +// evalite.config.ts +import { defineConfig } from "evalite/config"; +import { createPostgresStorage } from "./postgres-storage"; + +export default defineConfig({ + storage: () => createPostgresStorage(process.env.DATABASE_URL), +}); +``` + +## Storage Lifecycle + +Storage instances are managed using the `await using` syntax: + +```typescript +import { createSqliteStorage } from "evalite/sqlite-storage"; + +await using storage = createSqliteStorage("./evalite.db"); + +// Use storage... +// Automatically closed when leaving scope +``` + +Implement `[Symbol.asyncDispose]()` to ensure proper cleanup. + +## Query Options + +### Common Query Patterns + +**Get latest run:** + +```typescript +const runs = await storage.runs.getMany({ limit: 1 }); +const latestRun = runs[0]; +``` + +**Get suites for a run:** + +```typescript +const suites = await storage.suites.getMany({ run_id: runId }); +``` + +**Get evals with scores:** + +```typescript +const evals = await storage.evals.getMany({ suite_id: suiteId }); +const scores = await storage.scores.getMany({ eval_id: evalId }); +``` + +## Best Practices + +1. **Use SQLite for persistence** - Default and recommended for most use cases +2. **Use in-memory for CI** - Faster, no cleanup needed +3. **Implement proper cleanup** - Use `close()` and `[Symbol.asyncDispose]()` +4. **Handle JSON fields** - input/output/expected/metadata are stored as JSON +5. 
**Index appropriately** - Optimize queries for run_id, suite_id, eval_id lookups + +## See Also + +- [defineConfig()](/api/define-config) - Configure storage in config file +- [runEvalite()](/api/run-evalite) - Pass storage instance programmatically diff --git a/apps/evalite-docs/src/content/docs/api/traces.mdx b/apps/evalite-docs/src/content/docs/api/traces.mdx new file mode 100644 index 00000000..08867794 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/api/traces.mdx @@ -0,0 +1,296 @@ +--- +title: Traces +--- + +Track nested LLM calls and intermediate steps within your evaluations. + +## Overview + +Traces allow you to record individual LLM calls or processing steps that occur during task execution. They appear in the Evalite UI alongside your main input/output, helping you debug and understand the full execution flow. + +## Functions + +### `reportTrace()` + +Manually report a trace for custom LLM calls or processing steps. + +**Signature:** + +```typescript +reportTrace(trace: { + input: unknown; + output: unknown; + usage?: { + inputTokens: number; + outputTokens: number; + totalTokens: number; + }; + start?: number; + end?: number; +}): void +``` + +**Parameters:** + +- `input` - The input to the operation (e.g., prompt, messages) +- `output` - The output from the operation (e.g., LLM response) +- `usage` (optional) - Token usage statistics +- `start` (optional) - Start timestamp (milliseconds). Defaults to current time. +- `end` (optional) - End timestamp (milliseconds). Defaults to current time. + +**Example:** + +```typescript +import { evalite } from "evalite"; +import { reportTrace } from "evalite/traces"; + +evalite("Multi-Step Analysis", { + data: [{ input: "Analyze this text" }], + task: async (input) => { + // First LLM call + reportTrace({ + input: { prompt: "Summarize: " + input }, + output: { text: "Summary of the text" }, + usage: { + inputTokens: 50, + outputTokens: 20, + totalTokens: 70, + }, + }); + + // Second LLM call + reportTrace({ + input: { prompt: "Translate to Spanish: Summary of the text" }, + output: { text: "Resumen del texto" }, + usage: { + inputTokens: 30, + outputTokens: 15, + totalTokens: 45, + }, + }); + + return "Final result"; + }, + scorers: [], +}); +``` + +**Usage with timestamps:** + +```typescript +const start = performance.now(); +const result = await callLLM(input); +const end = performance.now(); + +reportTrace({ + input, + output: result, + start, + end, +}); +``` + +### `traceAISDKModel()` + +Automatically trace all calls made with a Vercel AI SDK model. + +**Signature:** + +```typescript +traceAISDKModel(model: LanguageModelV2): LanguageModelV2 +``` + +**Parameters:** + +- `model` - A Vercel AI SDK language model (from `@ai-sdk/openai`, etc.) + +**Returns:** A wrapped model that automatically reports traces. 
+ +**Example:** + +```typescript +import { evalite } from "evalite"; +import { traceAISDKModel } from "evalite/ai-sdk"; +import { openai } from "@ai-sdk/openai"; +import { generateText } from "ai"; + +// Wrap your model +const tracedModel = traceAISDKModel(openai("gpt-4")); + +evalite("AI SDK Eval", { + data: [{ input: "Hello" }], + task: async (input) => { + // All calls with this model are automatically traced + const result = await generateText({ + model: tracedModel, + prompt: input, + }); + + return result.text; + }, + scorers: [], +}); +``` + +**With streaming:** + +```typescript +import { streamText } from "ai"; + +const tracedModel = traceAISDKModel(openai("gpt-4")); + +evalite("Streaming Eval", { + data: [{ input: "Hello" }], + task: async (input) => { + const result = await streamText({ + model: tracedModel, + prompt: input, + }); + + // Process the stream before returning + const text = await result.text; + return text; + }, + scorers: [], +}); +``` + +## Enabling Traces + +Traces are only recorded when the `EVALITE_REPORT_TRACES` environment variable is set: + +```bash +EVALITE_REPORT_TRACES=true evalite watch +``` + +Or in your `.env` file: + +``` +EVALITE_REPORT_TRACES=true +``` + +This prevents unnecessary overhead when traces aren't needed. + +## What Gets Traced + +### With `reportTrace()` + +You control exactly what gets traced: + +```typescript +reportTrace({ + input: "Whatever you want to log", + output: { any: "data structure" }, +}); +``` + +### With `traceAISDKModel()` + +Automatically traces: + +- Full prompt/messages +- Model responses (text and tool calls) +- Token usage +- Timing information + +## Viewing Traces in the UI + +Traces appear in the Evalite UI under each test case: + +1. Navigate to an eval result +2. Click on a specific test case +3. View the "Traces" section to see all nested calls +4. Inspect input, output, and timing for each trace + +## Complete Example + +```typescript +import { evalite } from "evalite"; +import { reportTrace, traceAISDKModel } from "evalite/traces"; +import { openai } from "@ai-sdk/openai"; +import { generateText } from "ai"; + +const tracedModel = traceAISDKModel(openai("gpt-4")); + +evalite("Research Agent", { + data: [ + { + input: "What is the capital of France?", + expected: "Paris", + }, + ], + task: async (input) => { + // Step 1: Extract intent (manually traced) + const intent = await extractIntent(input); + reportTrace({ + input: { query: input }, + output: { intent }, + }); + + // Step 2: Generate response (automatically traced via AI SDK) + const result = await generateText({ + model: tracedModel, + prompt: `Answer this question: ${input}`, + }); + + // Step 3: Format result (manually traced) + const formatted = formatResponse(result.text); + reportTrace({ + input: { raw: result.text }, + output: { formatted }, + }); + + return formatted; + }, + scorers: [ + { + name: "Exact Match", + scorer: ({ output, expected }) => { + return output === expected ? 1 : 0; + }, + }, + ], +}); +``` + +## Best Practices + +1. **Use `traceAISDKModel()` for AI SDK calls** - Automatic tracing with rich context +2. **Use `reportTrace()` for custom logic** - Track non-LLM steps (parsing, validation, etc.) +3. **Include usage data when available** - Helps track costs and performance +4. **Keep trace data relevant** - Don't trace every small operation, focus on meaningful steps +5. 
**Enable only when needed** - Use `EVALITE_REPORT_TRACES=true` during development/debugging + +## Troubleshooting + +### Traces not appearing + +Make sure `EVALITE_REPORT_TRACES=true` is set: + +```bash +EVALITE_REPORT_TRACES=true evalite watch +``` + +### Error: "reportTrace must be called inside an evalite eval" + +`reportTrace()` can only be called within the `task` function of an eval: + +```typescript +// ✅ Correct +evalite("My Eval", { + data: [{ input: "test" }], + task: async (input) => { + reportTrace({ input, output: "result" }); // Works + return "result"; + }, +}); + +// ❌ Wrong +reportTrace({ input: "test", output: "result" }); // Outside eval +``` + +## See Also + +- [Adding Traces Guide](/tips/adding-traces) - Overview and examples +- [Vercel AI SDK Integration](/tips/vercel-ai-sdk) - Using AI SDK with Evalite +- [evalite()](/api/evalite) - Main evaluation function diff --git a/apps/evalite-docs/src/content/docs/guides/cli.mdx b/apps/evalite-docs/src/content/docs/guides/cli.mdx deleted file mode 100644 index 538d90ca..00000000 --- a/apps/evalite-docs/src/content/docs/guides/cli.mdx +++ /dev/null @@ -1,80 +0,0 @@ ---- -title: CLI ---- - -## Watch Mode - -You can run Evalite in watch mode by running `evalite watch`: - -```bash -evalite watch -``` - -This will watch for changes to your `.eval.ts` files and re-run the evals when they change. - -> [!IMPORTANT] -> -> I strongly recommend implementing a caching layer in your LLM calls when using watch mode. This will keep your evals running fast and avoid burning through your API credits. - -### Hiding the Table Output - -When debugging with `console.log`, the detailed table output can make it harder to see your logs. You can hide it with `--hideTable`: - -```bash -evalite watch --hideTable -``` - -This keeps the score summary but removes the detailed results table from the CLI output. - -## Serve Mode - -You can run evals once and serve the UI without re-running on file changes: - -```bash -evalite serve -``` - -This runs your evals once and keeps the UI server running at `http://localhost:3006`. Unlike watch mode, tests won't re-run when files change. - -Since evals can take a while to run, this can be a useful alternative to watch mode. - -To re-run evals after making changes, restart `evalite serve`. - -## Running Specific Files - -You can run specific files by passing them as arguments: - -```bash -evalite my-eval.eval.ts -``` - -This also works for `watch` and `serve` modes: - -```bash -evalite watch my-eval.eval.ts -evalite serve my-eval.eval.ts -``` - -## Threshold - -You can tell Evalite that your evals must pass a specific score by passing `--threshold`: - -```bash -evalite --threshold=50 # Score must be greater than or equal to 50 - -evalite watch --threshold=70 # Also works in watch mode -``` - -This is useful for running on CI. If the score threshold is not met, it will fail the process. - -## Export Command - -Export eval results as a static HTML bundle: - -```bash -evalite export -``` - -This exports the latest run to `./evalite-export` by default. - -See the [CI/CD guide](/guides/ci) for full documentation on exporting and viewing static UI bundles. 
diff --git a/apps/evalite-docs/src/content/docs/guides/configuration.mdx b/apps/evalite-docs/src/content/docs/guides/configuration.mdx index 2c4ca11f..046400e7 100644 --- a/apps/evalite-docs/src/content/docs/guides/configuration.mdx +++ b/apps/evalite-docs/src/content/docs/guides/configuration.mdx @@ -68,36 +68,3 @@ export default defineConfig({ testTimeout: 60000, // 60 seconds }); ``` - -### Running Evals Multiple Times - -Run each test case multiple times to measure variance in non-deterministic evaluations. - -Configure globally in `evalite.config.ts`: - -```ts -// evalite.config.ts -import { defineConfig } from "evalite/config"; - -export default defineConfig({ - trialCount: 3, // Run each test case 3 times -}); -``` - -Or override per-eval in the `evalite()` call: - -```ts -evalite("Non-deterministic eval", { - data: () => [{ input: "Alice", expected: "Alice" }], - task: async (input) => { - // Non-deterministic task - return getRandomGreeting(input); - }, - scorers: [ - /* ... */ - ], - trialCount: 5, // Override config: run 5 times -}); -``` - -Note: Per-eval `trialCount` overrides `evalite.config.ts` if both are present. diff --git a/apps/evalite-docs/src/content/docs/guides/environment-variables.mdx b/apps/evalite-docs/src/content/docs/guides/environment-variables.mdx deleted file mode 100644 index 24e15b49..00000000 --- a/apps/evalite-docs/src/content/docs/guides/environment-variables.mdx +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: Environment Variables ---- - -import { Steps } from "@astrojs/starlight/components"; - -To call your LLM from a third-party service, you'll likely need some environment variables to keep your API keys safe. - -## Setting Up Env Variables - - - -1. Create a `.env` file in the root of your project: - - ``` - // .env - OPENAI_API_KEY=your-api-key - ``` - -2. Add `.env` to your `.gitignore`, if it's not already there - - ``` - // .gitignore - .env - ``` - -3. Install `dotenv`: - - ```bash - pnpm add -D dotenv - ``` - -4. Add an `evalite.config.ts` file: - - ```ts - // evalite.config.ts - - import { defineConfig } from "evalite/config"; - - export default defineConfig({ - setupFiles: ["dotenv/config"], - }); - ``` - - - -Now, your environment variables will be available in your evals. diff --git a/apps/evalite-docs/src/content/docs/quickstart.mdx b/apps/evalite-docs/src/content/docs/guides/quickstart.mdx similarity index 100% rename from apps/evalite-docs/src/content/docs/quickstart.mdx rename to apps/evalite-docs/src/content/docs/guides/quickstart.mdx diff --git a/apps/evalite-docs/src/content/docs/guides/skipping.mdx b/apps/evalite-docs/src/content/docs/guides/skipping.mdx deleted file mode 100644 index a443da01..00000000 --- a/apps/evalite-docs/src/content/docs/guides/skipping.mdx +++ /dev/null @@ -1,35 +0,0 @@ ---- -title: Skipping Evals ---- - -## Skipping Entire Evals - -You can use `evalite.skip()` to skip an entire eval without running it. - -```ts -evalite.skip("My Eval", { - data: () => [], - task: () => {}, -}); -``` - -This is useful when you want to temporarily disable an eval during development or testing. - -## Focusing on Specific Evals - -You can use the `only` flag on data entries to focus on specific inputs during development. 
- -```ts -evalite("My Eval", { - data: () => [ - { input: "test1", expected: "output1" }, - { input: "test2", expected: "output2", only: true }, - { input: "test3", expected: "output3" }, - ], - task: async (input) => { - // Only runs for "test2" - }, -}); -``` - -When any data entry has `only: true`, only those evals will be run. diff --git a/apps/evalite-docs/src/content/docs/guides/streams.md b/apps/evalite-docs/src/content/docs/guides/streams.md deleted file mode 100644 index ce5fa063..00000000 --- a/apps/evalite-docs/src/content/docs/guides/streams.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Streams ---- - -You can handle streams in Evalite by returning any async iterable (including a `ReadableStream`) from your task. This means you can test functions like the AI SDK `streamText` function easily: - -```ts -import { evalite } from "evalite"; -import { streamText } from "ai"; -import { openai } from "@ai-sdk/openai"; -import { Factuality } from "autoevals"; - -evalite("My Eval", { - data: [{ input: "What is the capital of France?", expected: "Paris" }], - task: async (input) => { - const result = await streamText({ - model: openai("your-model"), - system: `Answer the question concisely.`, - prompt: input, - }); - - return result.textStream; - }, - scorers: [Factuality], -}); -``` diff --git a/apps/evalite-docs/src/content/docs/what-is-evalite.mdx b/apps/evalite-docs/src/content/docs/guides/what-is-evalite.mdx similarity index 100% rename from apps/evalite-docs/src/content/docs/what-is-evalite.mdx rename to apps/evalite-docs/src/content/docs/guides/what-is-evalite.mdx diff --git a/apps/evalite-docs/src/content/docs/guides/traces.mdx b/apps/evalite-docs/src/content/docs/tips/adding-traces.mdx similarity index 54% rename from apps/evalite-docs/src/content/docs/guides/traces.mdx rename to apps/evalite-docs/src/content/docs/tips/adding-traces.mdx index f62e79e6..bbb395a8 100644 --- a/apps/evalite-docs/src/content/docs/guides/traces.mdx +++ b/apps/evalite-docs/src/content/docs/tips/adding-traces.mdx @@ -1,14 +1,14 @@ --- -title: Traces +title: Adding Traces --- import { Aside } from "@astrojs/starlight/components"; -Traces are used to track the behaviour of each individual call to an LLM inside your task. +Track timing, token usage, and input/output for each LLM call within your task using traces. ## `reportTrace` -You can report a trace by calling `reportTrace` inside an `evalite` eval: +Report a trace by calling `reportTrace` inside an `evalite` eval: ```ts import { evalite, type Evalite } from "evalite"; @@ -54,27 +54,4 @@ evalite("My Eval", { -## `traceAISDKModel` - -If you're using the [Vercel AI SDK](https://sdk.vercel.ai/docs/introduction), you can automatically report traces by wrapping your model in `traceAISDKModel` function: - -```ts -import { traceAISDKModel } from "evalite/ai-sdk"; -import { generateText } from "ai"; -import { openai } from "@ai-sdk/openai"; - -// All calls to this model will be recorded in evalite! -const tracedModel = traceAISDKModel(openai("gpt-4o-mini")); - -const result = await generateText({ - model: tracedModel, - system: `Answer the question concisely.`, - prompt: `What is the capital of France?`, -}); -``` - - +If you're using the Vercel AI SDK, see the [Vercel AI SDK](/tips/vercel-ai-sdk) tip for automatic tracing with `traceAISDKModel`. 
diff --git a/apps/evalite-docs/src/content/docs/guides/variant-comparison.mdx b/apps/evalite-docs/src/content/docs/tips/comparing-different-approaches.mdx similarity index 80% rename from apps/evalite-docs/src/content/docs/guides/variant-comparison.mdx rename to apps/evalite-docs/src/content/docs/tips/comparing-different-approaches.mdx index 029e4142..5d8c806f 100644 --- a/apps/evalite-docs/src/content/docs/guides/variant-comparison.mdx +++ b/apps/evalite-docs/src/content/docs/tips/comparing-different-approaches.mdx @@ -1,17 +1,16 @@ --- -title: Variant Comparison -description: Compare multiple task variants using evalite.each() +title: Comparing Different Approaches --- import { Aside } from "@astrojs/starlight/components"; -## Overview +A/B test different models, prompts, or configurations on the same dataset using `evalite.each()`. -`evalite.each()` enables comparing multiple task variants (models, prompts, configs) within a single eval. This lets you: +## What You Can Compare -- Compare different models on the same dataset -- A/B test prompt strategies -- Test different config parameters (temperature, system prompts, etc.) +- Different models on the same dataset +- Prompt strategies (direct vs chain-of-thought vs few-shot) +- Config parameters (temperature, system prompts, etc.) ## Basic Usage diff --git a/apps/evalite-docs/src/content/docs/guides/customizing-the-ui.mdx b/apps/evalite-docs/src/content/docs/tips/customize-the-ui.mdx similarity index 85% rename from apps/evalite-docs/src/content/docs/guides/customizing-the-ui.mdx rename to apps/evalite-docs/src/content/docs/tips/customize-the-ui.mdx index b7f16b05..f79f08ad 100644 --- a/apps/evalite-docs/src/content/docs/guides/customizing-the-ui.mdx +++ b/apps/evalite-docs/src/content/docs/tips/customize-the-ui.mdx @@ -1,12 +1,14 @@ --- -title: Customizing The UI +title: Customize The UI --- import { Aside } from "@astrojs/starlight/components"; +Customize which columns appear in the Evalite UI to show only the data you care about. + ## Creating Custom Columns -By default, the Evalite UI renders the input, expected and output columns: +By default, Evalite renders input, expected and output columns: | Input | Expected | Output | | ------------------------ | --------------------------- | ---------------- | diff --git a/apps/evalite-docs/src/content/docs/guides/multi-modal.mdx b/apps/evalite-docs/src/content/docs/tips/images-and-media.mdx similarity index 95% rename from apps/evalite-docs/src/content/docs/guides/multi-modal.mdx rename to apps/evalite-docs/src/content/docs/tips/images-and-media.mdx index 8f747424..5e2cf3c5 100644 --- a/apps/evalite-docs/src/content/docs/guides/multi-modal.mdx +++ b/apps/evalite-docs/src/content/docs/tips/images-and-media.mdx @@ -1,10 +1,10 @@ --- -title: Multi-Modal +title: Images And Media --- import { Aside } from "@astrojs/starlight/components"; -Evalite can handle not just text responses, but media like images, audio, and video. +Test multi-modal LLM features by including images, audio, and video in your evals. ## Files In Memory diff --git a/apps/evalite-docs/src/content/docs/tips/only-run-certain-evals.mdx b/apps/evalite-docs/src/content/docs/tips/only-run-certain-evals.mdx new file mode 100644 index 00000000..0049c4e2 --- /dev/null +++ b/apps/evalite-docs/src/content/docs/tips/only-run-certain-evals.mdx @@ -0,0 +1,60 @@ +--- +title: Only Run Certain Evals +--- + +Skip entire evals or focus on specific test cases during development. 
+ +## Skip Entire Evals + +Use `evalite.skip()` to skip an entire eval without running it. + +```ts +evalite.skip("My Eval", { + data: () => [], + task: () => {}, +}); +``` + +This is useful when you want to temporarily disable an eval during development or testing. + +## Focus on Specific Test Cases + +Use the `only` flag on data entries to focus on specific inputs during development. + +```ts +evalite("My Eval", { + data: () => [ + { input: "test1", expected: "output1" }, + { input: "test2", expected: "output2", only: true }, + { input: "test3", expected: "output3" }, + ], + task: async (input) => { + // Only runs for "test2" + }, +}); +``` + +When any data entry has `only: true`, only those evals will be run. + +## Run Specific Files + +Run specific eval files instead of all evals by passing file paths as arguments. + +```bash +evalite my-eval.eval.ts +``` + +Run multiple files: + +```bash +evalite eval1.eval.ts eval2.eval.ts +``` + +Works with `watch` and `serve` modes: + +```bash +evalite watch my-eval.eval.ts +evalite serve my-eval.eval.ts +``` + +This is useful when working on a specific eval and you don't want to run the entire test suite. diff --git a/apps/evalite-docs/src/content/docs/guides/ci.mdx b/apps/evalite-docs/src/content/docs/tips/run-evals-on-ci-cd.mdx similarity index 94% rename from apps/evalite-docs/src/content/docs/guides/ci.mdx rename to apps/evalite-docs/src/content/docs/tips/run-evals-on-ci-cd.mdx index 37809b94..a56738c2 100644 --- a/apps/evalite-docs/src/content/docs/guides/ci.mdx +++ b/apps/evalite-docs/src/content/docs/tips/run-evals-on-ci-cd.mdx @@ -2,7 +2,7 @@ title: CI/CD --- -Evalite integrates seamlessly into CI/CD pipelines, allowing you to validate LLM-powered features as part of your automated testing workflow. +Integrate Evalite into CI/CD pipelines to validate LLM-powered features automatically. ## Static UI Export diff --git a/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx b/apps/evalite-docs/src/content/docs/tips/run-evals-programmatically.mdx similarity index 92% rename from apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx rename to apps/evalite-docs/src/content/docs/tips/run-evals-programmatically.mdx index 53252ea1..fce8c229 100644 --- a/apps/evalite-docs/src/content/docs/guides/running-programmatically.mdx +++ b/apps/evalite-docs/src/content/docs/tips/run-evals-programmatically.mdx @@ -1,8 +1,8 @@ --- -title: Running Programmatically +title: Run Evals Programmatically --- -You can run Evalite programmatically using the Node API. This is useful when you want to integrate Evalite into your own scripts, CI/CD pipelines, or custom tooling. +Use Evalite's Node API to integrate evals into custom scripts, CI/CD pipelines, or tooling. ## Basic Usage diff --git a/apps/evalite-docs/src/content/docs/tips/run-same-eval-multiple-times.mdx b/apps/evalite-docs/src/content/docs/tips/run-same-eval-multiple-times.mdx new file mode 100644 index 00000000..e0db2f1b --- /dev/null +++ b/apps/evalite-docs/src/content/docs/tips/run-same-eval-multiple-times.mdx @@ -0,0 +1,38 @@ +--- +title: Run Same Eval Multiple Times +--- + +Use `trialCount` to run each test case multiple times, measuring variance in non-deterministic evaluations. 
+ +## Global Configuration + +Set `trialCount` in `evalite.config.ts` to apply to all evals: + +```ts +// evalite.config.ts +import { defineConfig } from "evalite/config"; + +export default defineConfig({ + trialCount: 3, // Run each test case 3 times +}); +``` + +## Per-Eval Override + +Override the global setting for a specific eval: + +```ts +evalite("Non-deterministic eval", { + data: () => [{ input: "Alice", expected: "Alice" }], + task: async (input) => { + // Non-deterministic task + return getRandomGreeting(input); + }, + scorers: [ + /* ... */ + ], + trialCount: 5, // Override config: run 5 times +}); +``` + +Per-eval `trialCount` takes precedence over `evalite.config.ts`. diff --git a/apps/evalite-docs/src/content/docs/tips/score-thresholds.mdx b/apps/evalite-docs/src/content/docs/tips/score-thresholds.mdx new file mode 100644 index 00000000..ae1f9f6d --- /dev/null +++ b/apps/evalite-docs/src/content/docs/tips/score-thresholds.mdx @@ -0,0 +1,49 @@ +--- +title: Score Thresholds +--- + +Require evals to pass a minimum score threshold. Useful for CI/CD pipelines where failing evals should block deployments. + +## Using the Threshold Flag + +Pass `--threshold` to set a minimum required score: + +```bash +evalite --threshold=50 # Score must be >= 50 +``` + +If the average score falls below the threshold, the process exits with code 1. + +## Works with All Modes + +```bash +evalite watch --threshold=70 +evalite serve --threshold=80 +``` + +## Configuration File + +Alternatively, set `scoreThreshold` in `evalite.config.ts`: + +```ts +// evalite.config.ts +import { defineConfig } from "evalite/config"; + +export default defineConfig({ + scoreThreshold: 80, // Fail if average score < 80 +}); +``` + +CLI flag takes precedence over config file setting. + +## CI/CD Usage + +Typical CI workflow: + +```yaml +# .github/workflows/evals.yml +- name: Run evals + run: evalite --threshold=75 +``` + +Process exits with error code if threshold not met, failing the CI job. diff --git a/apps/evalite-docs/src/content/docs/examples/ai-sdk.md b/apps/evalite-docs/src/content/docs/tips/vercel-ai-sdk.mdx similarity index 83% rename from apps/evalite-docs/src/content/docs/examples/ai-sdk.md rename to apps/evalite-docs/src/content/docs/tips/vercel-ai-sdk.mdx index 215e5ea0..4fd6a8d1 100644 --- a/apps/evalite-docs/src/content/docs/examples/ai-sdk.md +++ b/apps/evalite-docs/src/content/docs/tips/vercel-ai-sdk.mdx @@ -1,14 +1,14 @@ --- -title: AI SDK +title: Vercel AI SDK --- -Vercel's [AI SDK](https://sdk.vercel.ai/docs/introduction) is a great way to get started with AI in your apps. +import { Aside } from "@astrojs/starlight/components"; -It abstracts away the differences between different AI providers, so you can **switch between them easily**. +Use Evalite with Vercel's [AI SDK](https://sdk.vercel.ai/docs/introduction) to trace LLM calls and test AI-powered features. -## Tracing +## Automatic Tracing with `traceAISDKModel` -You can use the `traceAISDKModel` function to trace the calls to the AI SDK: +Wrap your AI SDK model in `traceAISDKModel` to automatically track all LLM calls: ```ts // my-eval.eval.ts @@ -48,6 +48,12 @@ evalite("Test Capitals", { }); ``` + + ## Testing Whole Conversations You can also pass messages to the `input` property of the eval. To get autocomplete, you can pass the `CoreMessage` type to the `evalite` function as a type argument. 
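A sketch of that conversation pattern, assuming the `CoreMessage` type from the `ai` package and autoevals' `Factuality` scorer (the exact generic parameters accepted by `evalite` may differ between versions):

```ts
import { evalite } from "evalite";
import { traceAISDKModel } from "evalite/ai-sdk";
import { generateText, type CoreMessage } from "ai";
import { openai } from "@ai-sdk/openai";
import { Factuality } from "autoevals";

evalite<CoreMessage[]>("Multi-turn capitals", {
  data: () => [
    {
      input: [
        { role: "user", content: "What is the capital of France?" },
        { role: "assistant", content: "Paris." },
        { role: "user", content: "And Germany?" },
      ],
      expected: "Berlin",
    },
  ],
  task: async (messages) => {
    const result = await generateText({
      model: traceAISDKModel(openai("gpt-4o-mini")),
      system: "Answer the question concisely.",
      // Pass the whole conversation to the model.
      messages,
    });
    return result.text;
  },
  scorers: [Factuality],
});
```

Typing the input as `CoreMessage[]` gives you autocomplete on `role` and `content` when writing the dataset.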
diff --git a/apps/evalite-docs/src/content/docs/tips/watch-mode.mdx b/apps/evalite-docs/src/content/docs/tips/watch-mode.mdx new file mode 100644 index 00000000..06ca895c --- /dev/null +++ b/apps/evalite-docs/src/content/docs/tips/watch-mode.mdx @@ -0,0 +1,37 @@ +--- +title: Watch Mode +--- + +Watch mode re-runs evals when files change, making iteration fast during development. + +## Start Watch Mode + +```bash +evalite watch +``` + +This watches `.eval.ts` files and re-runs them on changes. + +## Hide Table Output + +When debugging with `console.log`, hide the detailed table to see logs clearly: + +```bash +evalite watch --hideTable +``` + +Keeps score summary but removes the detailed results table. + +## Serve Mode Alternative + +Run evals once and serve UI without re-running on file changes: + +```bash +evalite serve +``` + +Runs evals once, keeps UI at `http://localhost:3006`. Tests won't re-run on file changes. + +To re-run after changes, restart `evalite serve`. + +Useful when evals take a long time to run. diff --git a/apps/evalite-ui/.eslintrc.cjs b/apps/evalite-ui/.eslintrc.cjs deleted file mode 100644 index 4f6f59ee..00000000 --- a/apps/evalite-ui/.eslintrc.cjs +++ /dev/null @@ -1,84 +0,0 @@ -/** - * This is intended to be a basic starting point for linting in your app. - * It relies on recommended configs out of the box for simplicity, but you can - * and should modify this configuration to best suit your team's needs. - */ - -/** @type {import('eslint').Linter.Config} */ -module.exports = { - root: true, - parserOptions: { - ecmaVersion: "latest", - sourceType: "module", - ecmaFeatures: { - jsx: true, - }, - }, - env: { - browser: true, - commonjs: true, - es6: true, - }, - ignorePatterns: ["!**/.server", "!**/.client"], - - // Base config - extends: ["eslint:recommended"], - - overrides: [ - // React - { - files: ["**/*.{js,jsx,ts,tsx}"], - plugins: ["react", "jsx-a11y"], - extends: [ - "plugin:react/recommended", - "plugin:react/jsx-runtime", - "plugin:react-hooks/recommended", - "plugin:jsx-a11y/recommended", - ], - settings: { - react: { - version: "detect", - }, - formComponents: ["Form"], - linkComponents: [ - { name: "Link", linkAttribute: "to" }, - { name: "NavLink", linkAttribute: "to" }, - ], - "import/resolver": { - typescript: {}, - }, - }, - }, - - // Typescript - { - files: ["**/*.{ts,tsx}"], - plugins: ["@typescript-eslint", "import"], - parser: "@typescript-eslint/parser", - settings: { - "import/internal-regex": "^~/", - "import/resolver": { - node: { - extensions: [".ts", ".tsx"], - }, - typescript: { - alwaysTryTypes: true, - }, - }, - }, - extends: [ - "plugin:@typescript-eslint/recommended", - "plugin:import/recommended", - "plugin:import/typescript", - ], - }, - - // Node - { - files: [".eslintrc.cjs"], - env: { - node: true, - }, - }, - ], -}; diff --git a/apps/evalite-ui/app/components/display-input.tsx b/apps/evalite-ui/app/components/display-input.tsx index c2d781ab..ff979ea6 100644 --- a/apps/evalite-ui/app/components/display-input.tsx +++ b/apps/evalite-ui/app/components/display-input.tsx @@ -8,9 +8,8 @@ import { } from "lucide-react"; import React, { Fragment, useLayoutEffect, useRef, useState } from "react"; import { JSONTree } from "react-json-tree"; -import ReactMarkdown from "react-markdown"; -import remarkGfm from "remark-gfm"; import { downloadFile, serveFile } from "~/sdk"; +import { Response } from "./response"; import { Button } from "./ui/button"; // Helper function to find single string value in an object and its path @@ -84,12 +83,7 @@ 
const DisplayText = ({ overflow: "hidden", }} > - - {input} - + {input} {status === "showing-show-more-button" && shouldTruncateText && ( diff --git a/apps/evalite-ui/app/components/page-layout.tsx b/apps/evalite-ui/app/components/page-layout.tsx index 1691a48c..804c1dac 100644 --- a/apps/evalite-ui/app/components/page-layout.tsx +++ b/apps/evalite-ui/app/components/page-layout.tsx @@ -3,7 +3,6 @@ import { BreadcrumbItem, BreadcrumbLink, BreadcrumbList, - BreadcrumbPage, } from "./ui/breadcrumb"; import { Separator } from "./ui/separator"; import { SidebarTrigger } from "./ui/sidebar"; diff --git a/apps/evalite-ui/app/components/response.tsx b/apps/evalite-ui/app/components/response.tsx new file mode 100644 index 00000000..95aba757 --- /dev/null +++ b/apps/evalite-ui/app/components/response.tsx @@ -0,0 +1,21 @@ +import { cn } from "~/lib/utils"; +import { type ComponentProps, memo } from "react"; +import { Streamdown } from "streamdown"; + +type ResponseProps = ComponentProps; + +export const Response = memo( + ({ className, ...props }: ResponseProps) => ( + *:first-child]:mt-0 [&>*:last-child]:mb-0", + className + )} + {...props} + /> + ), + (prevProps, nextProps) => prevProps.children === nextProps.children +); + +Response.displayName = "Response"; diff --git a/apps/evalite-ui/app/components/ui/copy-button.tsx b/apps/evalite-ui/app/components/ui/copy-button.tsx index 54a16373..4c3d0eb8 100644 --- a/apps/evalite-ui/app/components/ui/copy-button.tsx +++ b/apps/evalite-ui/app/components/ui/copy-button.tsx @@ -1,12 +1,7 @@ +import { CheckIcon, CopyIcon } from "lucide-react"; import * as React from "react"; -import { CopyIcon, CheckIcon } from "lucide-react"; -import { cn } from "~/lib/utils"; import { Button } from "~/components/ui/button"; -import { - Tooltip, - TooltipContent, - TooltipTrigger, -} from "~/components/ui/tooltip"; +import { cn } from "~/lib/utils"; interface CopyButtonProps extends React.ButtonHTMLAttributes { diff --git a/apps/evalite-ui/app/components/ui/input-group.tsx b/apps/evalite-ui/app/components/ui/input-group.tsx new file mode 100644 index 00000000..8168f717 --- /dev/null +++ b/apps/evalite-ui/app/components/ui/input-group.tsx @@ -0,0 +1,168 @@ +import * as React from "react"; +import { cva, type VariantProps } from "class-variance-authority"; + +import { cn } from "~/lib/utils"; +import { Button } from "~/components/ui/button"; +import { Input } from "~/components/ui/input"; +import { Textarea } from "~/components/ui/textarea"; + +function InputGroup({ className, ...props }: React.ComponentProps<"div">) { + return ( +
textarea]:h-auto", + + // Variants based on alignment. + "has-[>[data-align=inline-start]]:[&>input]:pl-2", + "has-[>[data-align=inline-end]]:[&>input]:pr-2", + "has-[>[data-align=block-start]]:h-auto has-[>[data-align=block-start]]:flex-col has-[>[data-align=block-start]]:[&>input]:pb-3", + "has-[>[data-align=block-end]]:h-auto has-[>[data-align=block-end]]:flex-col has-[>[data-align=block-end]]:[&>input]:pt-3", + + // Focus state. + "has-[[data-slot=input-group-control]:focus-visible]:ring-ring has-[[data-slot=input-group-control]:focus-visible]:ring-1", + + // Error state. + "has-[[data-slot][aria-invalid=true]]:ring-destructive/20 has-[[data-slot][aria-invalid=true]]:border-destructive dark:has-[[data-slot][aria-invalid=true]]:ring-destructive/40", + + className + )} + {...props} + /> + ); +} + +const inputGroupAddonVariants = cva( + "text-muted-foreground flex h-auto cursor-text select-none items-center justify-center gap-2 py-1.5 text-sm font-medium group-data-[disabled=true]/input-group:opacity-50 [&>kbd]:rounded-[calc(var(--radius)-5px)] [&>svg:not([class*='size-'])]:size-4", + { + variants: { + align: { + "inline-start": + "order-first pl-3 has-[>button]:ml-[-0.45rem] has-[>kbd]:ml-[-0.35rem]", + "inline-end": + "order-last pr-3 has-[>button]:mr-[-0.4rem] has-[>kbd]:mr-[-0.35rem]", + "block-start": + "[.border-b]:pb-3 order-first w-full justify-start px-3 pt-3 group-has-[>input]/input-group:pt-2.5", + "block-end": + "[.border-t]:pt-3 order-last w-full justify-start px-3 pb-3 group-has-[>input]/input-group:pb-2.5", + }, + }, + defaultVariants: { + align: "inline-start", + }, + } +); + +function InputGroupAddon({ + className, + align = "inline-start", + ...props +}: React.ComponentProps<"div"> & VariantProps) { + return ( +
{ + if ((e.target as HTMLElement).closest("button")) { + return; + } + e.currentTarget.parentElement?.querySelector("input")?.focus(); + }} + {...props} + /> + ); +} + +const inputGroupButtonVariants = cva( + "flex items-center gap-2 text-sm shadow-none", + { + variants: { + size: { + xs: "h-6 gap-1 rounded-[calc(var(--radius)-5px)] px-2 has-[>svg]:px-2 [&>svg:not([class*='size-'])]:size-3.5", + sm: "h-8 gap-1.5 rounded-md px-2.5 has-[>svg]:px-2.5", + "icon-xs": + "size-6 rounded-[calc(var(--radius)-5px)] p-0 has-[>svg]:p-0", + "icon-sm": "size-8 p-0 has-[>svg]:p-0", + }, + }, + defaultVariants: { + size: "xs", + }, + } +); + +function InputGroupButton({ + className, + type = "button", + variant = "ghost", + size = "xs", + ...props +}: Omit, "size"> & + VariantProps) { + return ( +