Skip to content

Commit 112246e

Browse files
Add semantic similarity score to TestEvalLLMs (#37)
1 parent 555e283 commit 112246e

File tree

5 files changed

+168
-4
lines changed

5 files changed

+168
-4
lines changed

.golangci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
run:
2+
timeout: 5m

README.md

Lines changed: 140 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@ Does your company depend on this project? [Contact me at markus@maragu.dk](mailt
1919

2020
## Usage
2121

22-
This test will only run with `go test -run TestEval ./...` and otherwise be skipped:
22+
Evals will only run with `go test -run TestEval ./...` and otherwise be skipped.
23+
24+
### Simple example
25+
26+
Eval a mocked LLM, construct a sample, score it with a lexical similarity scorer, and log the result.
2327

2428
```go
2529
package examples_test
@@ -66,3 +70,138 @@ func (l *powerfulLLM) Prompt(request string) string {
6670
return l.response
6771
}
6872
```
73+
74+
### Advanced example
75+
76+
This eval uses real LLMs (OpenAI GPT-4o mini, Google Gemini 1.5 Flash, Anthropic Claude 3.5 Haiku)
77+
and compares the response to an expected response using both lexical similarity (with Levenshtein distance)
78+
and semantic similarity (with an OpenAI embedding model and cosine similarity comparison).
79+
80+
```go
81+
package examples_test
82+
83+
import (
84+
"context"
85+
"errors"
86+
"fmt"
87+
"strings"
88+
"testing"
89+
90+
"github.com/anthropics/anthropic-sdk-go"
91+
"github.com/google/generative-ai-go/genai"
92+
"github.com/openai/openai-go"
93+
"github.com/openai/openai-go/shared"
94+
"maragu.dev/env"
95+
96+
"maragu.dev/llm"
97+
"maragu.dev/llm/eval"
98+
)
99+
100+
// TestEvalLLMs evaluates different LLMs with the same prompts.
101+
func TestEvalLLMs(t *testing.T) {
102+
_ = env.Load("../../.env.test.local")
103+
104+
tests := []struct {
105+
name string
106+
prompt func(prompt string) string
107+
expected string
108+
}{
109+
{
110+
name: "gpt-4o-mini",
111+
prompt: gpt4oMini,
112+
expected: "Hello! How can I assist you today?",
113+
},
114+
{
115+
name: "gemini-1.5-flash",
116+
prompt: gemini15Flash,
117+
expected: "Hi there! How can I help you today?",
118+
},
119+
{
120+
name: "claude-3.5-haiku",
121+
prompt: claude35Haiku,
122+
expected: "Hello! How are you doing today? Is there anything I can help you with?",
123+
},
124+
}
125+
126+
for _, test := range tests {
127+
eval.Run(t, test.name, func(e *eval.E) {
128+
input := "Hi!"
129+
output := test.prompt(input)
130+
131+
sample := eval.Sample{
132+
Input: input,
133+
Output: output,
134+
Expected: test.expected,
135+
}
136+
137+
result := e.Score(sample, eval.LexicalSimilarityScorer(eval.LevenshteinDistance))
138+
e.Log(sample, result)
139+
140+
result = e.Score(sample, eval.SemanticSimilarityScorer(&embeddingGetter{}, eval.CosineSimilarity))
141+
e.Log(sample, result)
142+
})
143+
}
144+
}
145+
146+
func gpt4oMini(prompt string) string {
147+
client := llm.NewOpenAIClient(llm.NewOpenAIClientOptions{Key: env.GetStringOrDefault("OPENAI_KEY", "")})
148+
res, err := client.Client.Chat.Completions.New(context.Background(), openai.ChatCompletionNewParams{
149+
Messages: openai.F([]openai.ChatCompletionMessageParamUnion{
150+
openai.UserMessage(prompt),
151+
}),
152+
Model: openai.F(openai.ChatModelGPT4oMini),
153+
Temperature: openai.F(0.0),
154+
})
155+
if err != nil {
156+
panic(err)
157+
}
158+
return res.Choices[0].Message.Content
159+
}
160+
161+
func gemini15Flash(prompt string) string {
162+
client := llm.NewGoogleClient(llm.NewGoogleClientOptions{Key: env.GetStringOrDefault("GOOGLE_KEY", "")})
163+
model := client.Client.GenerativeModel("models/gemini-1.5-flash-latest")
164+
var temperature float32 = 0
165+
model.Temperature = &temperature
166+
res, err := model.GenerateContent(context.Background(), genai.Text(prompt))
167+
if err != nil {
168+
panic(err)
169+
}
170+
return strings.TrimSpace(fmt.Sprint(res.Candidates[0].Content.Parts[0]))
171+
}
172+
173+
func claude35Haiku(prompt string) string {
174+
client := llm.NewAnthropicClient(llm.NewAnthropicClientOptions{Key: env.GetStringOrDefault("ANTHROPIC_KEY", "")})
175+
res, err := client.Client.Messages.New(context.Background(), anthropic.MessageNewParams{
176+
Messages: anthropic.F([]anthropic.MessageParam{
177+
anthropic.NewUserMessage(anthropic.NewTextBlock(prompt)),
178+
}),
179+
Model: anthropic.F(anthropic.ModelClaude3_5HaikuLatest),
180+
MaxTokens: anthropic.F(int64(1024)),
181+
Temperature: anthropic.F(0.0),
182+
})
183+
if err != nil {
184+
panic(err)
185+
}
186+
return fmt.Sprint(res.Content[0].Text)
187+
}
188+
189+
type embeddingGetter struct{}
190+
191+
func (e *embeddingGetter) GetEmbedding(v string) ([]float64, error) {
192+
client := llm.NewOpenAIClient(llm.NewOpenAIClientOptions{Key: env.GetStringOrDefault("OPENAI_KEY", "")})
193+
res, err := client.Client.Embeddings.New(context.Background(), openai.EmbeddingNewParams{
194+
Input: openai.F[openai.EmbeddingNewParamsInputUnion](shared.UnionString(v)),
195+
Model: openai.F(openai.EmbeddingModelTextEmbedding3Small),
196+
EncodingFormat: openai.F(openai.EmbeddingNewParamsEncodingFormatFloat),
197+
Dimensions: openai.F(int64(128)),
198+
})
199+
if err != nil {
200+
return nil, err
201+
}
202+
if len(res.Data) == 0 {
203+
return nil, errors.New("no embeddings returned")
204+
}
205+
return res.Data[0].Embedding, nil
206+
}
207+
```

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ require (
99
github.com/openai/openai-go v0.1.0-alpha.46
1010
google.golang.org/api v0.217.0
1111
maragu.dev/env v0.2.0
12-
maragu.dev/evals v0.0.0-20250114114008-6c73fea1551c
12+
maragu.dev/evals v0.0.0-20250121095818-455e49387b21
1313
maragu.dev/is v0.2.0
1414
)
1515

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,8 @@ maragu.dev/env v0.2.0 h1:nQKitDEB65ArZsh6E7vxzodOqY9bxEVFdBg+tskS1ys=
128128
maragu.dev/env v0.2.0/go.mod h1:t5CCbaEnjCM5mewiAVVzTS4N+oXTus2+SRnzKQbQVME=
129129
maragu.dev/errors v0.3.0 h1:huI+n+ddMfVgQFD+cEqIPaozUlfz3TkfgpkssNip5G0=
130130
maragu.dev/errors v0.3.0/go.mod h1:cygLiyNnq4ofF3whYscilo2ecUADCaUQXwvwFrMOhmM=
131-
maragu.dev/evals v0.0.0-20250114114008-6c73fea1551c h1:huPj1S5RhqgpbBAd3aCLfdVie3ZsU8Du7kepL2ZtDUQ=
132-
maragu.dev/evals v0.0.0-20250114114008-6c73fea1551c/go.mod h1:+2Y3dYZ6oANM+cL88kFxaPD1H7rq3FXOrI3NOeNKaZ8=
131+
maragu.dev/evals v0.0.0-20250121095818-455e49387b21 h1:Eg2DvonBz4eOPIhN+/aL1BXQAlNla4o+aBY+03e/6mA=
132+
maragu.dev/evals v0.0.0-20250121095818-455e49387b21/go.mod h1:uLfBl7/FhUJULS4PjmpMdNG+joMRYAxgMJbzGWhQhWE=
133133
maragu.dev/is v0.2.0 h1:poeuVEA5GG3vrDpGmzo2KjWtIMZmqUyvGnOB0/pemig=
134134
maragu.dev/is v0.2.0/go.mod h1:bviaM5S0fBshCw7wuumFGTju/izopZ/Yvq4g7Klc7y8=
135135
maragu.dev/migrate v0.6.0 h1:gJLAIVaRh9z9sN55Q2sWwScpEH+JsT6N0L1DnzedXFE=

internal/examples/hi_test.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@ package examples_test
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"strings"
78
"testing"
89

910
"github.com/anthropics/anthropic-sdk-go"
1011
"github.com/google/generative-ai-go/genai"
1112
"github.com/openai/openai-go"
13+
"github.com/openai/openai-go/shared"
1214
"maragu.dev/env"
1315

1416
"maragu.dev/llm"
@@ -53,7 +55,9 @@ func TestEvalLLMs(t *testing.T) {
5355
}
5456

5557
result := e.Score(sample, eval.LexicalSimilarityScorer(eval.LevenshteinDistance))
58+
e.Log(sample, result)
5659

60+
result = e.Score(sample, eval.SemanticSimilarityScorer(&embeddingGetter{}, eval.CosineSimilarity))
5761
e.Log(sample, result)
5862
})
5963
}
@@ -101,3 +105,22 @@ func claude35Haiku(prompt string) string {
101105
}
102106
return fmt.Sprint(res.Content[0].Text)
103107
}
108+
109+
type embeddingGetter struct{}
110+
111+
func (e *embeddingGetter) GetEmbedding(v string) ([]float64, error) {
112+
client := llm.NewOpenAIClient(llm.NewOpenAIClientOptions{Key: env.GetStringOrDefault("OPENAI_KEY", "")})
113+
res, err := client.Client.Embeddings.New(context.Background(), openai.EmbeddingNewParams{
114+
Input: openai.F[openai.EmbeddingNewParamsInputUnion](shared.UnionString(v)),
115+
Model: openai.F(openai.EmbeddingModelTextEmbedding3Small),
116+
EncodingFormat: openai.F(openai.EmbeddingNewParamsEncodingFormatFloat),
117+
Dimensions: openai.F(int64(128)),
118+
})
119+
if err != nil {
120+
return nil, err
121+
}
122+
if len(res.Data) == 0 {
123+
return nil, errors.New("no embeddings returned")
124+
}
125+
return res.Data[0].Embedding, nil
126+
}

0 commit comments

Comments
 (0)