Skip to content

Commit

Permalink
Log evals as JSON (#23)
Browse files Browse the repository at this point in the history
Also, test out evals tool in CI.
  • Loading branch information
markuswustenberg authored Jan 10, 2025
1 parent 59b057e commit 99660a1
Showing 7 changed files with 94 additions and 4 deletions.
25 changes: 24 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -60,8 +60,31 @@ jobs:
- name: Get dependencies
run: go mod download

- name: Install evals
run: go install maragu.dev/evals

- name: Evaluate
run: go test -json -run TestEval ./... | jq 'select(.Test != null and .Action == "output" and (.Output | contains("result"))) | del(.Action)'
run: go test -json -run TestEval ./... | evals | tee evals.txt >> $GITHUB_STEP_SUMMARY

- uses: actions/upload-artifact@v4
id: evalsdb
with:
name: evals.db
path: evals.db
if-no-files-found: error

- name: Add evals comment to PR
uses: actions/github-script@v7
with:
script: |
const fs = require('fs')
const table = fs.readFileSync('evals.txt', 'utf8')
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: `# Evals\n\n${table}\n\n[Download evals.db](${{ steps.evalsdb.outputs.artifact-url }})`
})
lint:
name: Lint
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
/cover.out
/.env*.local
/cover.out
evals.db
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@ cover:

# evaluate runs the tests matching TestEval across all packages with JSON output
# and pipes that output into a post-processing command.
# NOTE(review): this text comes from a rendered diff; the jq recipe looks like the
# removed line and the evals recipe like its replacement — confirm against the real Makefile.
.PHONY: evaluate
evaluate:
go test -json -run TestEval ./... | jq 'select(.Test != null and .Action == "output" and (.Output | contains("result"))) | del(.Action)'
@go test -json -run TestEval ./... | evals

.PHONY: lint
lint:
25 changes: 24 additions & 1 deletion eval/run.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package eval

import (
"encoding/json"
"os"
"strings"
"testing"
@@ -70,10 +71,32 @@ func (e *E) Score(s Sample, scorer Scorer) Result {
return r
}

// logLine is the payload that Log serializes to JSON for every logged sample.
// The fields carry no struct tags, so encoding/json uses the field names as keys.
type logLine struct {
// Sample is the sample that was passed to Log.
Sample Sample
// Result is the result that was passed to Log.
Result Result
// Duration is the time elapsed since the start recorded on E when Log was called.
Duration time.Duration
}

// Log a [Sample] and [Result] through the test logger e.T.
// The sample, the result, and the time elapsed since e.start are written as a single
// JSON-encoded logLine. The eval name is not part of that payload; presumably it is
// supplied by the test name that the test runner attaches to the output — verify.
// TODO include token information?
func (e *E) Log(s Sample, r Result) {
// Assuming e.T is a testing.T/TB: mark this method as a helper so the reported
// file and line belong to the caller rather than to Log.
e.T.Helper()
// NOTE(review): plain-text rendering of the same data. In this diff view it appears
// to be the line that the JSON output below replaces — confirm before relying on
// both lines being emitted.
e.T.Logf("sample=%+v result=%+v duration=%v", s, r, time.Since(e.start))

// Bundle everything into one value so it is emitted as one JSON object.
l := logLine{
Sample: s,
Result: r,
Duration: time.Since(e.start),
}

// mustJSON panics if the line cannot be marshaled.
e.T.Log(mustJSON(l))
}

// mustJSON encodes l with encoding/json and returns the encoding as a string.
// It panics with the marshal error if l cannot be encoded.
func mustJSON(l logLine) string {
	encoded, err := json.Marshal(l)
	if err == nil {
		return string(encoded)
	}

	panic(err)
}
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -9,6 +9,7 @@ require (
github.com/openai/openai-go v0.1.0-alpha.43
google.golang.org/api v0.214.0
maragu.dev/env v0.2.0
maragu.dev/evals v0.0.0-20250110091058-dfafc4ece961
maragu.dev/is v0.2.0
)

@@ -26,6 +27,8 @@ require (
github.com/google/uuid v1.6.0 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect
github.com/googleapis/gax-go/v2 v2.14.0 // indirect
github.com/jmoiron/sqlx v1.4.0 // indirect
github.com/mattn/go-sqlite3 v1.14.24 // indirect
github.com/tidwall/gjson v1.14.4 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
@@ -46,4 +49,6 @@ require (
google.golang.org/genproto/googleapis/rpc v0.0.0-20241209162323-e6fa225c2576 // indirect
google.golang.org/grpc v1.67.1 // indirect
google.golang.org/protobuf v1.35.2 // indirect
maragu.dev/errors v0.3.0 // indirect
maragu.dev/migrate v0.6.0 // indirect
)
35 changes: 35 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -10,6 +10,8 @@ cloud.google.com/go/compute/metadata v0.6.0 h1:A6hENjEsCDtC1k8byVsgwvVcioamEHvZ4
cloud.google.com/go/compute/metadata v0.6.0/go.mod h1:FjyFAW1MW0C203CEOMDTu3Dk1FlqW3Rga40jzHL4hfg=
cloud.google.com/go/longrunning v0.5.7 h1:WLbHekDbjK1fVFD3ibpFFVoyizlLRl73I7YKuAKilhU=
cloud.google.com/go/longrunning v0.5.7/go.mod h1:8GClkudohy1Fxm3owmBGid8W0pSgodEMwEAztp38Xng=
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/agnivade/levenshtein v1.2.0 h1:U9L4IOT0Y3i0TIlUIDJ7rVUziKi/zPbrJGaFrtYH3SY=
github.com/agnivade/levenshtein v1.2.0/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU=
github.com/anthropics/anthropic-sdk-go v0.2.0-alpha.8 h1:ss/c/eeyILgoK2sMsTJdcdLdhY3wZSt//+nanM41B9w=
@@ -27,6 +29,8 @@ github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y=
github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
github.com/google/generative-ai-go v0.19.0 h1:R71szggh8wHMCUlEMsW2A/3T+5LdEIkiaHSYgSpUgdg=
github.com/google/generative-ai-go v0.19.0/go.mod h1:JYolL13VG7j79kM5BtHz4qwONHkeJQzOCkKXnpqtS/E=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
@@ -39,6 +43,29 @@ github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gT
github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA=
github.com/googleapis/gax-go/v2 v2.14.0 h1:f+jMrjBPl+DL9nI4IQzLUxMq7XrAqFYB7hBPqMNIe8o=
github.com/googleapis/gax-go/v2 v2.14.0/go.mod h1:lhBCnjdLrWRaPvLWhmc8IS24m9mr07qSYnHncrgo+zk=
github.com/jackc/chunkreader/v2 v2.0.1 h1:i+RDz65UE+mmpjTfyz0MoVTnzeYxroil2G82ki7MGG8=
github.com/jackc/chunkreader/v2 v2.0.1/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk=
github.com/jackc/pgconn v1.14.3 h1:bVoTr12EGANZz66nZPkMInAV/KHD2TxH9npjXXgiB3w=
github.com/jackc/pgconn v1.14.3/go.mod h1:RZbme4uasqzybK2RK5c65VsHxoyaml09lx3tXOcO/VM=
github.com/jackc/pgio v1.0.0 h1:g12B9UwVnzGhueNavwioyEEpAmqMe1E/BN9ES+8ovkE=
github.com/jackc/pgio v1.0.0/go.mod h1:oP+2QK2wFfUWgr+gxjoBH9KGBb31Eio69xUb0w5bYf8=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgproto3/v2 v2.3.3 h1:1HLSx5H+tXR9pW3in3zaztoEwQYRC9SQaYUHjTSUOag=
github.com/jackc/pgproto3/v2 v2.3.3/go.mod h1:WfJCnwN3HIg9Ish/j3sgWXnAfK8A9Y0bwXYU5xKaEdA=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgtype v1.14.0 h1:y+xUdabmyMkJLyApYuPj38mW+aAIqCe5uuBB51rH3Vw=
github.com/jackc/pgtype v1.14.0/go.mod h1:LUMuVrfsFfdKGLw+AFFVv6KtHOFMwRgDDzBt76IqCA4=
github.com/jackc/pgx/v4 v4.18.2 h1:xVpYkNR5pk5bMCZGfClbO962UIqVABcAGt7ha1s/FeU=
github.com/jackc/pgx/v4 v4.18.2/go.mod h1:Ey4Oru5tH5sB6tV7hDmfWFahwF15Eb7DNXlRKx2CkVw=
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM=
github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/openai/openai-go v0.1.0-alpha.43 h1:6XWGUsrHSaPyh8U6ocs/XJGb/UX7jhQRK2bYefvTuAg=
github.com/openai/openai-go v0.1.0-alpha.43/go.mod h1:3SdE6BffOX9HPEQv8IL/fi3LYZ5TUpRYaqGQZbyk11A=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -93,5 +120,13 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
maragu.dev/env v0.2.0 h1:nQKitDEB65ArZsh6E7vxzodOqY9bxEVFdBg+tskS1ys=
maragu.dev/env v0.2.0/go.mod h1:t5CCbaEnjCM5mewiAVVzTS4N+oXTus2+SRnzKQbQVME=
maragu.dev/errors v0.3.0 h1:huI+n+ddMfVgQFD+cEqIPaozUlfz3TkfgpkssNip5G0=
maragu.dev/errors v0.3.0/go.mod h1:cygLiyNnq4ofF3whYscilo2ecUADCaUQXwvwFrMOhmM=
maragu.dev/evals v0.0.0-20250110090454-18cd820491c8 h1:04jcfKTCUUW0iU1v6leNl3nWmE/w3l/es1niEjX8JRs=
maragu.dev/evals v0.0.0-20250110090454-18cd820491c8/go.mod h1:Qyj56mYe9ApUvas0pVm5M7BbxH5KSyMmifSDFyaUSco=
maragu.dev/evals v0.0.0-20250110091058-dfafc4ece961 h1:H7iZNzGQ7RZ+Se/OqiKsj4bCFUPPWhauk8iY8HtjwhQ=
maragu.dev/evals v0.0.0-20250110091058-dfafc4ece961/go.mod h1:Qyj56mYe9ApUvas0pVm5M7BbxH5KSyMmifSDFyaUSco=
maragu.dev/is v0.2.0 h1:poeuVEA5GG3vrDpGmzo2KjWtIMZmqUyvGnOB0/pemig=
maragu.dev/is v0.2.0/go.mod h1:bviaM5S0fBshCw7wuumFGTju/izopZ/Yvq4g7Klc7y8=
maragu.dev/migrate v0.6.0 h1:gJLAIVaRh9z9sN55Q2sWwScpEH+JsT6N0L1DnzedXFE=
maragu.dev/migrate v0.6.0/go.mod h1:TdZBD5wRvBbzLocsSV08kyvLiLCn0Q6DvgYHmyygWVQ=
3 changes: 3 additions & 0 deletions tools.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package llm

import _ "maragu.dev/evals/tool"

0 comments on commit 99660a1

Please sign in to comment.