diff --git a/README.md b/README.md index a2021daf..3a26fb41 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,19 @@ Stream GPT-5 chats with the Responses API, initiate Realtime WebRTC conversation - [Vision in a thread](#vision-in-a-thread) - [Runs involving function tools](#runs-involving-function-tools) - [Exploring chunks used in File Search](#exploring-chunks-used-in-file-search) + - [Evals](#evals) + - [Create an Eval](#create-an-eval) + - [Retrieve an Eval](#retrieve-an-eval) + - [List Evals](#list-evals) + - [Update an Eval](#update-an-eval) + - [Delete an Eval](#delete-an-eval) + - [Create an Eval Run](#create-an-eval-run) + - [List Eval Runs](#list-eval-runs) + - [Retrieve an Eval Run](#retrieve-an-eval-run) + - [Cancel an Eval Run](#cancel-an-eval-run) + - [Delete an Eval Run](#delete-an-eval-run) + - [List Output Items](#list-output-items) + - [Retrieve an Output Item](#retrieve-an-output-item) - [Image Generation](#image-generation) - [DALL·E 2](#dalle-2) - [DALL·E 3](#dalle-3) @@ -1669,6 +1682,267 @@ end.compact client.messages.list(thread_id: thread_id) ``` +### Evals + +The [Evals API](https://platform.openai.com/docs/api-reference/evals) allows you to systematically evaluate the quality and performance of your AI models. 
+ +**Supported Endpoints:** +- `POST /v1/evals` - Create an evaluation +- `GET /v1/evals/{id}` - Retrieve an evaluation +- `GET /v1/evals` - List evaluations +- `POST /v1/evals/{id}` - Update an evaluation +- `DELETE /v1/evals/{id}` - Delete an evaluation +- `POST /v1/evals/{id}/runs` - Create an evaluation run +- `GET /v1/evals/{id}/runs/{run_id}` - Retrieve an evaluation run +- `GET /v1/evals/{id}/runs` - List evaluation runs +- `POST /v1/evals/{id}/runs/{run_id}/cancel` - Cancel an evaluation run +- `DELETE /v1/evals/{id}/runs/{run_id}` - Delete an evaluation run +- `GET /v1/evals/{id}/runs/{run_id}/output_items` - List output items +- `GET /v1/evals/{id}/runs/{run_id}/output_items/{item_id}` - Retrieve an output item + +#### Create an Eval + +Create an evaluation with testing criteria to assess model outputs: + +```ruby +response = client.evals.create( + parameters: { + name: "Sentiment Analysis Eval", + data_source_config: { + type: "stored_completions", + metadata: { usecase: "chatbot" } + }, + testing_criteria: [ + { + type: "label_model", + model: "o3-mini", + input: [ + { + role: "developer", + content: "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + role: "user", + content: "Statement: {{item.input}}" + } + ], + passing_labels: ["positive"], + labels: ["positive", "neutral", "negative"], + name: "Sentiment grader" + } + ], + metadata: { team: "product", version: "1.0" } + } +) +puts response["id"] +# => "eval_abc123" +``` + +#### Retrieve an Eval + +Get details about a specific evaluation: + +```ruby +eval_id = "eval_abc123" +response = client.evals.retrieve(id: eval_id) +puts response["name"] +# => "Sentiment Analysis Eval" +``` + +#### List Evals + +List all evaluations with optional pagination: + +```ruby +# List all evals +response = client.evals.list + +# List with limit +response = client.evals.list(parameters: { limit: 10 }) + +# List with pagination +response = 
client.evals.list(parameters: { after: "eval_abc123", limit: 20 }) +``` + +#### Update an Eval + +Update an evaluation's metadata: + +```ruby +response = client.evals.update( + id: eval_id, + parameters: { + metadata: { version: "2.0", updated: "true" } + } +) +``` + +#### Delete an Eval + +Delete an evaluation: + +```ruby +response = client.evals.delete(id: eval_id) +puts response["deleted"] +# => true +``` + +#### Create an Eval Run + +Run an evaluation against a model with test data: + +```ruby +response = client.evals.runs.create( + eval_id: eval_id, + parameters: { + name: "gpt-4o-mini baseline", + data_source: { + type: "completions", + input_messages: { + type: "template", + template: [ + { + role: "system", + content: "You are a sentiment analyzer. Respond with only: positive, neutral, or negative." + }, + { + role: "user", + content: "{{item.input}}" + } + ] + }, + sampling_params: { + temperature: 0.7, + max_completion_tokens: 50, + top_p: 1.0 + }, + model: "gpt-4o-mini", + source: { + type: "file_content", + content: [ + { + item: { + input: "I absolutely love this product! Best purchase ever.", + ground_truth: "positive" + } + }, + { + item: { + input: "This is terrible. 
Very disappointed.", + ground_truth: "negative" + } + }, + { + item: { + input: "It's okay, nothing special.", + ground_truth: "neutral" + } + } + ] + } + }, + metadata: { experiment: "baseline", date: "2024-01-15" } + } +) +puts response["id"] +# => "evalrun_xyz789" +``` + +#### List Eval Runs + +List all runs for a specific evaluation: + +```ruby +# List all runs +response = client.evals.runs.list(eval_id: eval_id) + +# List with limit +response = client.evals.runs.list( + eval_id: eval_id, + parameters: { limit: 10 } +) + +# List with pagination +response = client.evals.runs.list( + eval_id: eval_id, + parameters: { after: "evalrun_abc123", limit: 20 } +) +``` + +#### Retrieve an Eval Run + +Get details about a specific evaluation run: + +```ruby +run_id = "evalrun_xyz789" +response = client.evals.runs.retrieve( + eval_id: eval_id, + id: run_id +) +puts response["status"] +# => "completed" +``` + +#### Cancel an Eval Run + +Cancel a running evaluation: + +```ruby +response = client.evals.runs.cancel( + eval_id: eval_id, + id: run_id +) +puts response["status"] +# => "canceled" +``` + +#### Delete an Eval Run + +Delete an evaluation run: + +```ruby +response = client.evals.runs.delete( + eval_id: eval_id, + id: run_id +) +puts response["deleted"] +# => true +``` + +#### List Output Items + +Retrieve the output items from an evaluation run: + +```ruby +# List all output items +response = client.evals.runs.output_items.list( + eval_id: eval_id, + run_id: run_id +) + +# List with pagination +response = client.evals.runs.output_items.list( + eval_id: eval_id, + run_id: run_id, + parameters: { limit: 10, after: "item_abc123" } +) +``` + +#### Retrieve an Output Item + +Get details about a specific output item: + +```ruby +output_item_id = "item_abc123" +response = client.evals.runs.output_items.retrieve( + eval_id: eval_id, + run_id: run_id, + id: output_item_id +) +puts response["status"] +# => "pass" +``` + ### Image Generation Generate images using DALL·E 2 or 
module OpenAI
  # Client-side wrapper for the Evals endpoints (`/evals` and everything nested
  # under it). Each method builds the request path and hands it to the matching
  # HTTP helper on the injected client (`json_post`, `post`, `get` or `delete`);
  # whatever that helper returns is passed back to the caller untouched.
  class Evals
    # Guard used by every method that interpolates an identifier into a path.
    #
    # A nil or empty identifier interpolates to an empty string, which would
    # turn a single-resource path such as "/evals/<id>" into "/evals/" and send
    # the request to the parent collection instead. Failing fast here keeps
    # that mistake from ever reaching the network.
    #
    # @api private
    # @param name [Symbol, String] keyword name, used in the error message
    # @param value [Object] the identifier supplied by the caller
    # @return [Object] +value+, unchanged, so valid paths are built exactly as before
    # @raise [ArgumentError] if +value+ is nil, empty or whitespace-only
    def self.ensure_id!(name, value)
      raise ArgumentError, "#{name} must not be blank" if value.to_s.strip.empty?

      value
    end

    # @param client [Object] object responding to `json_post`, `post`, `get` and `delete`
    def initialize(client:)
      @client = client
    end

    # Create an eval (JSON POST to "/evals").
    #
    # @param parameters [Hash] request body, forwarded as-is
    # @return [Object] the client's response
    def create(parameters: {})
      @client.json_post(path: "/evals", parameters: parameters)
    end

    # Fetch one eval (GET "/evals/<id>").
    #
    # @param id [String] eval identifier
    # @return [Object] the client's response
    # @raise [ArgumentError] if +id+ is blank
    def retrieve(id:)
      @client.get(path: "/evals/#{Evals.ensure_id!(:id, id)}")
    end

    # Update one eval (JSON POST to "/evals/<id>").
    #
    # @param id [String] eval identifier
    # @param parameters [Hash] request body, forwarded as-is
    # @return [Object] the client's response
    # @raise [ArgumentError] if +id+ is blank
    def update(id:, parameters: {})
      @client.json_post(path: "/evals/#{Evals.ensure_id!(:id, id)}", parameters: parameters)
    end

    # Delete one eval (DELETE "/evals/<id>").
    #
    # @param id [String] eval identifier
    # @return [Object] the client's response
    # @raise [ArgumentError] if +id+ is blank
    def delete(id:)
      @client.delete(path: "/evals/#{Evals.ensure_id!(:id, id)}")
    end

    # List evals (GET "/evals").
    #
    # @param parameters [Hash] forwarded to the client's `get` as `parameters:`
    # @return [Object] the client's response
    def list(parameters: {})
      @client.get(path: "/evals", parameters: parameters)
    end

    # Accessor for the run endpoints; the wrapper is built once and memoized.
    #
    # @return [OpenAI::Evals::Runs]
    def runs
      @runs ||= Runs.new(client: @client)
    end

    # Wrapper for the endpoints under "/evals/<eval_id>/runs".
    class Runs
      # @param client [Object] same client object the parent {Evals} was given
      def initialize(client:)
        @client = client
      end

      # Create a run for an eval (JSON POST to "/evals/<eval_id>/runs").
      #
      # @param eval_id [String] identifier of the parent eval
      # @param parameters [Hash] request body, forwarded as-is
      # @return [Object] the client's response
      # @raise [ArgumentError] if +eval_id+ is blank
      def create(eval_id:, parameters: {})
        @client.json_post(path: "#{runs_path(eval_id)}", parameters: parameters)
      end

      # Fetch one run (GET "/evals/<eval_id>/runs/<id>").
      #
      # @param eval_id [String] identifier of the parent eval
      # @param id [String] run identifier
      # @return [Object] the client's response
      # @raise [ArgumentError] if either identifier is blank
      def retrieve(eval_id:, id:)
        @client.get(path: run_path(eval_id, id))
      end

      # List the runs of an eval (GET "/evals/<eval_id>/runs").
      #
      # @param eval_id [String] identifier of the parent eval
      # @param parameters [Hash] forwarded to the client's `get` as `parameters:`
      # @return [Object] the client's response
      # @raise [ArgumentError] if +eval_id+ is blank
      def list(eval_id:, parameters: {})
        @client.get(path: runs_path(eval_id), parameters: parameters)
      end

      # Cancel a run (body-less POST to "/evals/<eval_id>/runs/<id>/cancel").
      #
      # @param eval_id [String] identifier of the parent eval
      # @param id [String] run identifier
      # @return [Object] the client's response
      # @raise [ArgumentError] if either identifier is blank
      def cancel(eval_id:, id:)
        @client.post(path: "#{run_path(eval_id, id)}/cancel")
      end

      # Delete a run (DELETE "/evals/<eval_id>/runs/<id>").
      #
      # @param eval_id [String] identifier of the parent eval
      # @param id [String] run identifier
      # @return [Object] the client's response
      # @raise [ArgumentError] if either identifier is blank
      def delete(eval_id:, id:)
        @client.delete(path: run_path(eval_id, id))
      end

      # Accessor for the output-item endpoints; built once and memoized.
      #
      # @return [OpenAI::Evals::Runs::OutputItems]
      def output_items
        @output_items ||= OutputItems.new(client: @client)
      end

      # Wrapper for the endpoints under "/evals/<eval_id>/runs/<run_id>/output_items".
      class OutputItems
        # @param client [Object] same client object the parent {Runs} was given
        def initialize(client:)
          @client = client
        end

        # List the output items of a run.
        #
        # @param eval_id [String] identifier of the parent eval
        # @param run_id [String] identifier of the parent run
        # @param parameters [Hash] forwarded to the client's `get` as `parameters:`
        # @return [Object] the client's response
        # @raise [ArgumentError] if either identifier is blank
        def list(eval_id:, run_id:, parameters: {})
          @client.get(path: items_path(eval_id, run_id), parameters: parameters)
        end

        # Fetch one output item of a run.
        #
        # @param eval_id [String] identifier of the parent eval
        # @param run_id [String] identifier of the parent run
        # @param id [String] output item identifier
        # @return [Object] the client's response
        # @raise [ArgumentError] if any identifier is blank
        def retrieve(eval_id:, run_id:, id:)
          @client.get(path: "#{items_path(eval_id, run_id)}/#{Evals.ensure_id!(:id, id)}")
        end

        private

        # Collection path for a run's output items, with both ids validated.
        def items_path(eval_id, run_id)
          "/evals/#{Evals.ensure_id!(:eval_id, eval_id)}" \
            "/runs/#{Evals.ensure_id!(:run_id, run_id)}/output_items"
        end
      end

      private

      # Collection path for an eval's runs, with the eval id validated.
      def runs_path(eval_id)
        "/evals/#{Evals.ensure_id!(:eval_id, eval_id)}/runs"
      end

      # Path of a single run, with both ids validated.
      def run_path(eval_id, id)
        "#{runs_path(eval_id)}/#{Evals.ensure_id!(:id, id)}"
      end
    end
  end
end
X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=G4GcoNWM5rT0kajORKFsYNgcXAz8HMsb5rZ8Zzkw_LI-1764265063-1.0.1.1-Dp9PBpRTcl2U.a5OAYevFIabxr3OC6hE7.9O3KIiWz6tB1_9SQ1VMKna9wP6_3b8KkVh3uQtSLsQ0_BXDzRXOfZyUuFUaoabN67UbMP4q64; + path=/; expires=Thu, 27-Nov-25 18:07:43 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=x0rOeErtaRsMGn4UZqdIWs3b9skWEly1bRr442iHDDE-1764265063034-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53651eddc83c84-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c66d7bc8191a3a478e4e5b174f7", + "object": "eval", + "created_at": 1764265062, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + 
"function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-8c5ece96-5e5d-4051-87a1-9af9a668be70", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:43 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_delete.yml b/spec/fixtures/cassettes/evals_delete.yml new file mode 100644 index 00000000..4760d4fd --- /dev/null +++ b/spec/fixtures/cassettes/evals_delete.yml @@ -0,0 +1,74 @@ +--- +http_interactions: +- request: + method: delete + uri: https://api.openai.com/v1/evals/eval_69288c753da081918baccc440dfb3cea + body: + encoding: US-ASCII + string: '' + headers: + 
Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:57 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_5b2c7c66080f297227416bf472b45866 + Openai-Processing-Ms: + - '267' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '269' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=FDivFwBSqSQaDjz.Sv2hSLAw1c.5lyvVl0nLrxBNXHo-1764265077-1.0.1.1-KkM8oLJRZq5PU2QFlnlZFTTWvrS77ZQkSbdDyJTCxO_5sOqJZ8VB8KXJw4lN0JxBa06O3AaCfEBncyzc78mA7scuegGDTEebpLCiXE4iF3U; + path=/; expires=Thu, 27-Nov-25 18:07:57 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=I7CzEHguum6GIZRgNj0yySn3bEm20mjveBzKgnyKSA0-1764265077911-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53657e4af94d65-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "eval.deleted", + "deleted": true, + "eval_id": "eval_69288c753da081918baccc440dfb3cea" + } + recorded_at: Thu, 27 Nov 2025 17:37:57 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_delete_setup.yml b/spec/fixtures/cassettes/evals_delete_setup.yml new file mode 100644 index 00000000..2afd5f32 --- /dev/null +++ b/spec/fixtures/cassettes/evals_delete_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment 
Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:57 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_a48649a5da6f373a7dd1c03ad7727a2d + Openai-Processing-Ms: + - '181' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '184' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=gaAv6K1dIpaWENMBQCC5oWFC7WYA15BUV5ufk1SWx_Q-1764265077-1.0.1.1-_Rc0A7giRZmUc2b5hF6pZCpHFLyZscP93icF_6Lp9NDWaJhiafMjpahB2ETzDa9c6vS1QBMbHTvgr04kIRJZpwg7DX8lHQ4Mwbm6d5z6S7E; + path=/; expires=Thu, 27-Nov-25 18:07:57 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=1rBJq8YPuzzVcpl.U_aK0HikFl_72LTQ.QmqttbpqPk-1764265077383-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53657b9e2fcc9e-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": 
"eval_69288c753da081918baccc440dfb3cea", + "object": "eval", + "created_at": 1764265077, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + 
] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-a4525dea-03ff-497b-bdd4-7d1d1bdbb3d1", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:57 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_list.yml b/spec/fixtures/cassettes/evals_list.yml new file mode 100644 index 00000000..40a5f998 --- /dev/null +++ b/spec/fixtures/cassettes/evals_list.yml @@ -0,0 +1,282 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 18:53:28 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_fb43ab73c5cedacfd3da4797f457f98f + Openai-Processing-Ms: + - '219' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '221' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - 
__cf_bm=KRMsUubnpJCqt86K8dPwv6XLqF3.4SbEgig4dkt7RRI-1764269608-1.0.1.1-_G07YEtDCfSxGTL9A8U8sbKS89nOibkLGjMQX4Jldo0PVN8nGeQ2Zwi4BbbkQZeADFqI8PULxKJ4FjeGEqUXLVe4Aydb4r.pit5qX.WyHpc; + path=/; expires=Thu, 27-Nov-25 19:23:28 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=5YZOeZ8X7F73apsyWvgmXwsk44AQOl0d3728FzPciXA-1764269608650-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53d41a0bc70764-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [ + { + "id": "eval_69289e005d008191bc07606b2ceb522c", + "object": "eval", + "created_at": 1764269568, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" 
+ ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "role": { + "enum": [ + "assistant" + ], + "type": "string" + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-88f5e332-d4f9-4843-aae1-918789dba587", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + ], + "first_id": "eval_69289e005d008191bc07606b2ceb522c", + "has_more": false, + "last_id": "eval_69289e005d008191bc07606b2ceb522c" + } + recorded_at: Thu, 27 Nov 2025 18:53:28 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_list_setup.yml b/spec/fixtures/cassettes/evals_list_setup.yml new file mode 100644 index 00000000..36331d54 --- /dev/null +++ b/spec/fixtures/cassettes/evals_list_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment 
Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 18:52:48 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_60d32089805131c0071ce9eccb04177f + Openai-Processing-Ms: + - '550' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '552' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=eFEINPHuc34CMCYL.SDiqLlyW4rJ.vafcOPwqC2BIDE-1764269568-1.0.1.1-CSW4VNVaa1sxqRqpfLPzzndPm4jSUTy_Rv8_rGTkh1X7cM1u91acPyUt_uboA44es5iyv.HJrprMsbi1okkD596sIKD189Iw3ijBcKj2i0A; + path=/; expires=Thu, 27-Nov-25 19:22:48 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=1N0rpQDL2dtXgQFGNlVf.SVMbsPNPJg58wihf2nOrvM-1764269568498-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53d31ebd220764-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": 
"eval_69289e005d008191bc07606b2ceb522c", + "object": "eval", + "created_at": 1764269568, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + 
] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-88f5e332-d4f9-4843-aae1-918789dba587", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 18:52:48 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_retrieve.yml b/spec/fixtures/cassettes/evals_retrieve.yml new file mode 100644 index 00000000..aef193a5 --- /dev/null +++ b/spec/fixtures/cassettes/evals_retrieve.yml @@ -0,0 +1,274 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c65203881918b2f062b30cd7aa9 + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:42 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_34dbc89ee9874a57aa011f2e89ff09c1 + Openai-Processing-Ms: + - '64' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '67' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - 
__cf_bm=u9ZriP.qwIAjZx8EdcGA_lIdQSy3JoUCEUCsrPXIkpA-1764265062-1.0.1.1-LNP5FQOOpKBdaIc3C2ccuYy.iL1pp_bMijvzoBpBvlWXNbP0g8wxMdoCOzPwpuOjSF8v3iva6oKWgOm89yNOz6qPnPaZSgHd9uPKbeJYKP4; + path=/; expires=Thu, 27-Nov-25 18:07:42 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=VCPQsuc9dpzChvitHps0wIlQBQgFbIHfaNT3JZ0f3XA-1764265062088-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365199dad35c5-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c65203881918b2f062b30cd7aa9", + "object": "eval", + "created_at": 1764265061, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + 
"name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-4e90c6d8-f26c-4cb4-a339-41b1b1bb0532", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:42 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_retrieve_setup.yml b/spec/fixtures/cassettes/evals_retrieve_setup.yml new file mode 100644 index 00000000..af74513e --- /dev/null +++ b/spec/fixtures/cassettes/evals_retrieve_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment 
Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:41 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_93497c36b5e84c57ad0a8e3d9de98b23 + Openai-Processing-Ms: + - '560' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '563' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=pLXAPhTauwRvA5Vu6GbOt1q0_W6MjmZflR3Trnvy8Z4-1764265061-1.0.1.1-N39qqUPxahcowOUZhs4ckWv3aC_u8dg2t9h8L492QtXIcewUsoTNAwIACQ6T8z8JFW6AWgAWg0AfaTZC5aEnahw312LxhY4ItvhkktWXlxM; + path=/; expires=Thu, 27-Nov-25 18:07:41 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=GzrCuuyexD.2XVBCWz_QtDE76Mqhxn7C3kVHuxbsHNY-1764265061277-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365124ca72216-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": 
"eval_69288c65203881918b2f062b30cd7aa9", + "object": "eval", + "created_at": 1764265061, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + 
] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-4e90c6d8-f26c-4cb4-a339-41b1b1bb0532", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:41 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_cancel.yml b/spec/fixtures/cassettes/evals_runs_cancel.yml new file mode 100644 index 00000000..7510d981 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_cancel.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c7132e48191ad7ff46c32cf1c46/runs/evalrun_69288c71b74c819183f7e7ed01b4d5ff/cancel + body: + encoding: UTF-8 + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Content-Length: + - '0' + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:54 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_9ce39fdd8e538dc9a6878bf4d0daa643 + Openai-Processing-Ms: + - '670' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '672' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - 
max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=4b4Q3YWwvFU9VANtkY3qjcqR_gDeiqYZCwJIyxsrEjU-1764265074-1.0.1.1-ymldn.MFe8F6uLfJ4vp5MhutNsXSjTy.gBbXOCzxLpiVbDIzmkPqtJS6g8.yD86fizxh3Qf.31nb7jxqc0v62ntUCa8wtPt2laRT4wtO7SE; + path=/; expires=Thu, 27-Nov-25 18:07:54 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=kVuspCEagYsUYNL.F9tTWd5XOiXGUoSC6cFPRSIZe.s-1764265074927-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365693b274f47-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c71b74c819183f7e7ed01b4d5ff", + "object": "eval.run", + "created_at": 1764265073, + "status": "canceled", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." 
+ }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c7132e48191ad7ff46c32cf1c46", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c7132e48191ad7ff46c32cf1c46?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c71b74c819183f7e7ed01b4d5ff", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:54 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_cancel_run_setup.yml b/spec/fixtures/cassettes/evals_runs_cancel_run_setup.yml new file mode 100644 index 00000000..dba503e1 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_cancel_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c7132e48191ad7ff46c32cf1c46/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:54 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + 
Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_0093cd2118771c8e402a60b5407cc55a + Openai-Processing-Ms: + - '406' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '408' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=uM0mV1hkYyLOKIK2GYHYsHFckJcIb2swTCF8MGpNWRQ-1764265074-1.0.1.1-MVHCmdcMYkqpRZoXUHf1Xaqx4uSf2gOzKUZef1C2cA11076.c5EnrNdYpkEC9Plv.aSuHmwDcU8ymp8rmkRfLcdBBpKEldY1LB4VhoKgS1A; + path=/; expires=Thu, 27-Nov-25 18:07:54 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=4CiPhk45QtL_dsOtL13kGKXPphdXg2Qyj1NKAWiyHTI-1764265074004-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536564fcdcb0fb-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c71b74c819183f7e7ed01b4d5ff", + "object": "eval.run", + "created_at": 1764265073, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." 
+ }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c7132e48191ad7ff46c32cf1c46", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c7132e48191ad7ff46c32cf1c46?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c71b74c819183f7e7ed01b4d5ff", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:53 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_cancel_setup.yml b/spec/fixtures/cassettes/evals_runs_cancel_setup.yml new file mode 100644 index 00000000..a25921ad --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_cancel_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 
17:37:53 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_95082c9c904d2480498f6f2999b250ac + Openai-Processing-Ms: + - '178' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '181' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=a7Te_D7MzgN1F9UQdvVgcdF66cnb72ZVkd7UpZRWGEE-1764265073-1.0.1.1-eYABxh1jILbTiLG6XrF.sWnR_vYqKaf7DPw91z9BFU_NJHkWacIIj5NEgutT3e0Rn.2VhJ1TXNP3vkFcu6rE1cqQsrOp_OwWQ6.JR4IOpdU; + path=/; expires=Thu, 27-Nov-25 18:07:53 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=.Ivd8N1lBfEG2ehGEYjU_vQM.irXnKU.iXYfQEgtkqU-1764265073337-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365625eb4a935-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c7132e48191ad7ff46c32cf1c46", + "object": "eval", + "created_at": 1764265073, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + 
"properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-27eb57ce-63cc-42fb-aa8b-6f3ab2cf73b8", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:53 GMT +recorded_with: VCR 6.3.1 diff --git 
a/spec/fixtures/cassettes/evals_runs_create.yml b/spec/fixtures/cassettes/evals_runs_create.yml new file mode 100644 index 00000000..906b6003 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_create.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c6ba2348191be05c2d02d0e0c0b/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:48 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_a227f8a26a2131d8473e1c92074eec46 + Openai-Processing-Ms: + - '760' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '763' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=9ZswAkI3q8s5c6ThS5HjOpifSgedQWKidh4TOo7iv3w-1764265068-1.0.1.1-5Yims_ZHAyjNs7xrqmoYvMK3buvL3vGW1YIhz5oCF.FUvl1TnFPMg4zlbLZ7ScgFbFNlFTrwtzN8lCVxXd3.SjsaGt3sI00xabiVGrQdZfE; + path=/; expires=Thu, 27-Nov-25 18:07:48 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=WZSrX35UqN8Z.mHJ69EsSjq5J3..4kRCvd91oQHmxCI-1764265068789-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + 
X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365422eff2210-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c6c25fc8191ac632b505a3ff1c9", + "object": "eval.run", + "created_at": 1764265068, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6ba2348191be05c2d02d0e0c0b", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6ba2348191be05c2d02d0e0c0b?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c6c25fc8191ac632b505a3ff1c9", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:48 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_create_setup.yml b/spec/fixtures/cassettes/evals_runs_create_setup.yml new file mode 100644 index 00000000..a866fb21 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_create_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment 
Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:47 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_96f911be32e1840672acd68fd0dea03f + Openai-Processing-Ms: + - '234' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '237' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=iSICdk.EdIDyle.RQysHEXmyES4qOD6qYZrmaELQLDA-1764265067-1.0.1.1-_E3nGyLYoebWy.Ed9MMjOgHi28ZepbGSTPvdl7w161O2ex_rTGDSlvf2p5c_uOnYMNQRWXYJ.HxGByHkU4miSAhurDxSKB.6hDN8ZIOX_mg; + path=/; expires=Thu, 27-Nov-25 18:07:47 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=vOcml4pSfdEN84KBG_RBgnPJ2nrh14amro7Q1K0d4xs-1764265067769-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53653f0c150c19-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": 
"eval_69288c6ba2348191be05c2d02d0e0c0b", + "object": "eval", + "created_at": 1764265067, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + 
] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-9474cccc-047e-4ddf-bb6f-b983ca6aef0d", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:47 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_delete.yml b/spec/fixtures/cassettes/evals_runs_delete.yml new file mode 100644 index 00000000..45012dc0 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_delete.yml @@ -0,0 +1,196 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288d369be48191a9dd469d6cb22c9f/runs/evalrun_69288d37912c81919e794ac84108363b/cancel + body: + encoding: UTF-8 + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Content-Length: + - '0' + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:41:13 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_0e5f5722ba2a4e819f366b5b08f03d05 + Openai-Processing-Ms: + - '555' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '558' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: 
+ - __cf_bm=HUQa2g4EML8HJ0fSYEyP8Y1LvFWAs0vMtx_2K2jXggA-1764265273-1.0.1.1-wLir27KcO3TJiEsJtTgjKpw02lSnGmQNyqlkZ4ljpvWaIsI5f4TrbxqT2Fi9klEzlfvGbpNyTaVJjcazBJ0Cu5Ezr_yl_Rns_F_9yxrGoFM; + path=/; expires=Thu, 27-Nov-25 18:11:13 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=VPa8xE7AHfpB9ZpDZZ8Tb0x8BiBplf_CE5ylsOX4FZE-1764265273069-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536a403929eb17-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288d37912c81919e794ac84108363b", + "object": "eval.run", + "created_at": 1764265271, + "status": "canceled", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." 
+ }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288d369be48191a9dd469d6cb22c9f", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288d369be48191a9dd469d6cb22c9f?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288d37912c81919e794ac84108363b", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:41:13 GMT +- request: + method: delete + uri: https://api.openai.com/v1/evals/eval_69288d369be48191a9dd469d6cb22c9f/runs/evalrun_69288d37912c81919e794ac84108363b + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:41:14 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_f31add9c41781b9ef7f694152f59dadf + Openai-Processing-Ms: + - '763' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '767' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=f9nvHlPi3UV7AK.TJumxczUKk1dIQDhExsuDFexiC.o-1764265274-1.0.1.1-GRqhzByoSCF1KKz1tZXP23mtv9XboeAZfYge3kRkZ2xpVGDqAuxUOpqK8oxTahqMEYXBH1NvIDolFuhNGFmlJsbU2s_qGdYlnjxe5odg_Ss; + path=/; expires=Thu, 27-Nov-25 18:11:14 GMT; domain=.api.openai.com; 
HttpOnly; + Secure; SameSite=None + - _cfuvid=kt_WqBTaaNSKPkhG35J4JsD8KAwC3rcdQEFaxw2b_do-1764265274323-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536a454a12b140-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "eval.run.deleted", + "deleted": true, + "run_id": "evalrun_69288d37912c81919e794ac84108363b" + } + recorded_at: Thu, 27 Nov 2025 17:41:14 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_delete_run_setup.yml b/spec/fixtures/cassettes/evals_runs_delete_run_setup.yml new file mode 100644 index 00000000..827634fd --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_delete_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288d369be48191a9dd469d6cb22c9f/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:41:12 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_992f53ec833344ac9d252a854242aad9 + Openai-Processing-Ms: + - '1074' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '1077' + 
X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=pDzZar7eZ1cWdnVwvzRNA5pYoyYTAiFvoayFkrLokkA-1764265272-1.0.1.1-kU3SDYb6dj0Qzau6KPvdiPo5Z47dqdwFaMyhjpDaOQTtoNYKjMiuP.KEH.2iKgH43P.beXjXno_HWnkeoNJl5zm4vVXQT15LoUKrnGq55YE; + path=/; expires=Thu, 27-Nov-25 18:11:12 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=ptb8p7NZ79WAj3ZgZcOYPNFxCtjgfJHstwiuvU1ZPrw-1764265272257-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536a36bf98a935-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288d37912c81919e794ac84108363b", + "object": "eval.run", + "created_at": 1764265271, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." 
+ }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288d369be48191a9dd469d6cb22c9f", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288d369be48191a9dd469d6cb22c9f?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288d37912c81919e794ac84108363b", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:41:12 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_delete_setup.yml b/spec/fixtures/cassettes/evals_runs_delete_setup.yml new file mode 100644 index 00000000..88d8195d --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_delete_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 
17:41:10 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_e6cd656f098a4c5383491d45df4c6bdc + Openai-Processing-Ms: + - '210' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '213' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=eoMCgwTjmGa8TpChmbY35gYtGNsdnvJDG9Oy_HucA3A-1764265270-1.0.1.1-nu7guHa3w50bOYH9x7yVao20MEVGw4A6WC6gnPKshlJDGsawMg9XLgkgmeG0jhE.kQIM6LLGI_Dsm56wwalEAdBPK5Y9RzVtmPsZ.3CjgDo; + path=/; expires=Thu, 27-Nov-25 18:11:10 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=wvlyhu5ZmXw1nXW0.HaQuloGBXKaYEUGpYRztDTf63U-1764265270748-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536a330a47b367-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288d369be48191a9dd469d6cb22c9f", + "object": "eval", + "created_at": 1764265270, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + 
"properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-44de3f73-0711-4426-8b59-cb26720e53f5", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:41:10 GMT +recorded_with: VCR 6.3.1 diff --git 
a/spec/fixtures/cassettes/evals_runs_list.yml b/spec/fixtures/cassettes/evals_runs_list.yml new file mode 100644 index 00000000..5fdc435f --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_list.yml @@ -0,0 +1,131 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_6928994e3c788191aa5575493ab58226/runs + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 18:32:48 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_4014e25ebde4430ae5ff17e80598937c + Openai-Processing-Ms: + - '310' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '313' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=RCxCb9hNoDNLsT8SSz.QWaZ16OE3wz0H2OOVgObF0Hg-1764268368-1.0.1.1-xbuw3j1YAdWfl2qFpzspmbbwU220pO9LL4W14d4GqWvMorzobwUPp373M6RGG4obrYtV.kSFHFeBERs2yiVxllmvySVXHgXIeF1WqLf99zc; + path=/; expires=Thu, 27-Nov-25 19:02:48 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=69e_irZTWWiJst8RToJInC_xiIGqs6FHMv9GIZeo2cA-1764268368400-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53b5d36820cd1a-LHR + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [ + { + "id": "evalrun_6928994f39d88191b9a47a69de5eda51", + "object": "eval.run", + "created_at": 1764268367, + "status": "in_progress", + "data_source": { + 
"type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_6928994e3c788191aa5575493ab58226", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_6928994e3c788191aa5575493ab58226?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_6928994f39d88191b9a47a69de5eda51", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + ], + "first_id": "evalrun_6928994f39d88191b9a47a69de5eda51", + "has_more": false, + "last_id": "evalrun_6928994f39d88191b9a47a69de5eda51" + } + recorded_at: Thu, 27 Nov 2025 18:32:48 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_list_run_setup.yml b/spec/fixtures/cassettes/evals_runs_list_run_setup.yml new file mode 100644 index 00000000..8e749f19 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_list_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_6928994e3c788191aa5575493ab58226/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this 
product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 18:32:47 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_21708b12e8268ffa47de3abd79ef2344 + Openai-Processing-Ms: + - '1031' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '1034' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=i_CIB19VZtHAv_rk8skB4htrkC04jLpYcMScJYKzV_E-1764268367-1.0.1.1-y91vztSj2kUhklMWvqJgYdGsGk3Y.SfmrPoB7FAykNzgLIYqeZaZo12Df_dARe4utNy7jWI0novwYwAHqlaiVppfxpCHqlTThEom5TV3eOY; + path=/; expires=Thu, 27-Nov-25 19:02:47 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=h_NO79XlA6.gBFloHWFMOow1Kg0nS3yu.4GAJNFY6A0-1764268367770-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53b5caee33dfb4-LHR + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_6928994f39d88191b9a47a69de5eda51", + "object": "eval.run", + "created_at": 1764268367, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." 
+ }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_6928994e3c788191aa5575493ab58226", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_6928994e3c788191aa5575493ab58226?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_6928994f39d88191b9a47a69de5eda51", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 18:32:47 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_list_setup.yml b/spec/fixtures/cassettes/evals_runs_list_setup.yml new file mode 100644 index 00000000..d58f89ff --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_list_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 18:32:46 
GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_7761586e9b3dc966280851030e885ca3 + Openai-Processing-Ms: + - '687' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '689' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=z3MpTJQQJDq8NpDy1Mgky8PcQ917uOEFzRSZzw_TX90-1764268366-1.0.1.1-2l4GXx9_Ul92nzE08ZGg4Re_dYImAYbYaL7O5z_qzmA067mtyGUhPYNQcLmINwRl76cW14HWWE9cMJheDF7xor28wYGbW1SjWY4D4_s0wxI; + path=/; expires=Thu, 27-Nov-25 19:02:46 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=qkT5usGvz0OmPbVRs3HPJ7EptZZee9PslFmbV9UvVac-1764268366427-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53b5c47d22f816-LHR + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_6928994e3c788191aa5575493ab58226", + "object": "eval", + "created_at": 1764268366, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + 
"properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-2e66f46a-0407-4cc5-bd42-a1c5cde44f6c", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 18:32:46 GMT +recorded_with: VCR 6.3.1 diff --git 
a/spec/fixtures/cassettes/evals_runs_output_items_list.yml b/spec/fixtures/cassettes/evals_runs_output_items_list.yml new file mode 100644 index 00000000..c0e38c0d --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_list.yml @@ -0,0 +1,76 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c6d54f48191bb18d415547ff09c/runs/evalrun_69288c6e2dac819181b017027cd1d2ba/output_items + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:51 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_044e8ae19b15a7101a63c51d97527437 + Openai-Processing-Ms: + - '380' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '382' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=Ij1_ZQwKnjbo57fg2wgjLL6wo6ykZnDO32Ca0ushv18-1764265071-1.0.1.1-b0Zelat_yRyQHGzvXuBH7A62cjICDnwr8.KXH1ZNQ8nhk9qYmI4U3xE.6e6CojNe5CAODZVFAWymziugL4BG96hgHnpMUmgZzkh.0aCpR9M; + path=/; expires=Thu, 27-Nov-25 18:07:51 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=5kFavy3PInCs8S45ZxHlLdbcTlGBpxX80ap0mbCmaGc-1764265071138-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365532c7ab128-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [], + "first_id": null, + "has_more": false, + "last_id": null + } 
+ recorded_at: Thu, 27 Nov 2025 17:37:51 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_list_output_item_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_list_output_item_setup.yml new file mode 100644 index 00000000..4c4d5055 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_list_output_item_setup.yml @@ -0,0 +1,76 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_692886c4402c8191b16daf0a13927d55/runs/evalrun_692886c4cb7c8191a6d061543b6c0224/output_items + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:13:41 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_03cd0ef2eb6148bb9db8a92286faa23a + Openai-Processing-Ms: + - '224' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '228' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=6MUOh_RSBEwcm8ZA61n6mZa9_Wlfc.sKgtkfiFWcBjA-1764263621-1.0.1.1-itCbxTpT_mHZQI1ZXkaSUhKMXrMSXzYfYyQQ6PmQ56wgvn7uZ862F8GJ9vlxbxvngqovWvP8hgIFeb_iO5RRWbraUfIQSd4DVNPiQtwzIVI; + path=/; expires=Thu, 27-Nov-25 17:43:41 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=jphYlrigl5iW9Plhyq3CwLH4hYQY1Qp8MYaUJmIgoqc-1764263621721-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5341f08cb6220e-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: 
+ encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [], + "first_id": null, + "has_more": false, + "last_id": null + } + recorded_at: Thu, 27 Nov 2025 17:13:41 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_list_run_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_list_run_setup.yml new file mode 100644 index 00000000..d75a849b --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_list_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c6d54f48191bb18d415547ff09c/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:50 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_88e93be9ef8d9e6a97cc9becd3e56a22 + Openai-Processing-Ms: + - '441' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '444' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - 
__cf_bm=yn01YpoTluz_g0dCfOJu029HxxIE0YxtsV_IDxPV0cA-1764265070-1.0.1.1-RGZeYvFOEMNrnyrsOeT2CRd0vOtSqedJVlTGy7CyAvS2Gc9xzTqqGHOlPIQnWUpK77v2rogZPxT1htz3zdV7c0ACVCmsva90aIUPOPh8sh0; + path=/; expires=Thu, 27-Nov-25 18:07:50 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=zo1jRkukrqcGsIqmCJTtvx_F..hdTyvBGAcHj4x.aEo-1764265070481-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53654ceffb2196-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c6e2dac819181b017027cd1d2ba", + "object": "eval.run", + "created_at": 1764265070, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." 
+ }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6d54f48191bb18d415547ff09c", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6d54f48191bb18d415547ff09c?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c6e2dac819181b017027cd1d2ba", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:50 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_list_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_list_setup.yml new file mode 100644 index 00000000..c1a0cf44 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_list_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + 
headers: + Date: + - Thu, 27 Nov 2025 17:37:49 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_900410bfa9674eb2bb47cbf185769adc + Openai-Processing-Ms: + - '184' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '187' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=HULhDfLfjW5mhRIsax.joyijyBBd7EmO2Q3O_IVs7sE-1764265069-1.0.1.1-U_UGCd9.7V5Vj1C.0tk_ra2ZeJ8wT6gajpUlS1MtflDyxwh.9EhYL1aVQCE7pMrSuLNCSyFtkGCltY2l.WqoKyBI0udBVF18gFbumUpDkZk; + path=/; expires=Thu, 27-Nov-25 18:07:49 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=xXc2dl6GdscUZ1VH5HMyd84ZVjDXG6GEdzqczQKDTQA-1764265069472-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365488bf0cca5-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c6d54f48191bb18d415547ff09c", + "object": "eval", + "created_at": 1764265069, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + 
], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-abb2f7c9-703e-4ea7-93fa-a96fa580dcd4", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:49 GMT +recorded_with: VCR 6.3.1 diff 
--git a/spec/fixtures/cassettes/evals_runs_output_items_retrieve.yml b/spec/fixtures/cassettes/evals_runs_output_items_retrieve.yml new file mode 100644 index 00000000..224a38d7 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_retrieve.yml @@ -0,0 +1,303 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c6f782c8191900a5cb6be3db61e/runs/evalrun_69288c70a03881919438f1de10070910/output_items + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:57:40 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_63f77af99d92f4ad3817b3439f12de1c + Openai-Processing-Ms: + - '386' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '388' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=3KuybbQ5ivkgrdoN8P1j2dPObVied6ifNmVh4kUsZbQ-1764266260-1.0.1.1-yfOh7gnUJL3FHVOt4IEgFaKUZldHA2wK6vJofU4dU.x11w9ng8sRVPOrbK4ASfW2RI3L86PErGFm7tv3qy9VUza6CNSyWLgJxcSynLbK9z0; + path=/; expires=Thu, 27-Nov-25 18:27:40 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=jA56LhejCounC.ZFpgofvJ2Nk9OTb5HVXFeLiqACL1U-1764266260747-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53825d6a22eb1d-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [ + { + "id": 
"outputitem_69288c7485688191b3f81ee02a17a2b8", + "object": "eval.run.output_item", + "created_at": 1764265076, + "status": "pass", + "_datasource_item_content_hash": "07bd0d39b771a2e3976c536264799dc3f2b6e5e943a8d68dc3058bac176de445", + "available_includes": [], + "datasource_item": { + "input": "I love this product!", + "ground_truth": "positive" + }, + "datasource_item_id": 0, + "eval_id": "eval_69288c6f782c8191900a5cb6be3db61e", + "results": [ + { + "name": "Sentiment grader-32bb5cf1-b6c0-4030-ba29-10a1dca004ac", + "score": 1.0, + "passed": true, + "sample": { + "input": [ + { + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "role": "user", + "content": "Statement: I love this product!" + } + ], + "output": [ + { + "role": "assistant", + "content": "{\n \"steps\": [\n {\n \"description\": \"The statement 'I love this product!' includes the word 'love,' indicating a strong positive emotion towards the product.\",\n \"conclusion\": \"The sentiment is positive.\"\n }\n ],\n \"result\": \"positive\"\n}" + } + ], + "finish_reason": "stop", + "model": "o3-mini-2025-01-31", + "usage": { + "total_tokens": 287, + "completion_tokens": 137, + "prompt_tokens": 150, + "cached_tokens": 0 + }, + "error": null, + "temperature": 1.0, + "top_p": 1.0 + } + } + ], + "run_id": "evalrun_69288c70a03881919438f1de10070910", + "sample": { + "input": [ + { + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "I love this product!" + } + ], + "output": [ + { + "role": "assistant", + "content": "That's great to hear! What product are you referring to? I'd love to know more about it and what you enjoy about it!" 
+ } + ], + "finish_reason": "stop", + "model": "gpt-4o-mini-2024-07-18", + "usage": { + "total_tokens": 48, + "completion_tokens": 26, + "prompt_tokens": 22, + "cached_tokens": 0 + }, + "error": null, + "temperature": 1.0, + "top_p": 1.0 + } + } + ], + "first_id": "outputitem_69288c7485688191b3f81ee02a17a2b8", + "has_more": false, + "last_id": "outputitem_69288c7485688191b3f81ee02a17a2b8" + } + recorded_at: Thu, 27 Nov 2025 17:57:40 GMT +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c6f782c8191900a5cb6be3db61e/runs/evalrun_69288c70a03881919438f1de10070910/output_items/outputitem_69288c7485688191b3f81ee02a17a2b8 + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:57:41 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_96ab283a68914e5aae9e838825b84daa + Openai-Processing-Ms: + - '275' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '277' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=MpuILeElqrljrDl2kgh.q2CvWdew2aULy1VsYpIJLNw-1764266261-1.0.1.1-FOnzEQiiVXTlMCOaCE7fFSD2AiAu_M8r__8x6H8TAdj1_u0cNrAVEidH2LWeOVx022TfA4Qojh2ARuZcSxvIArc6GC0PpDUH_lV_7NpqB34; + path=/; expires=Thu, 27-Nov-25 18:27:41 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=qAK71GovjGWZA0BgqVhkKve3h_zFGkBk0nJgN3qfOzY-1764266261401-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + 
- 9a5382624872b134-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "outputitem_69288c7485688191b3f81ee02a17a2b8", + "object": "eval.run.output_item", + "created_at": 1764265076, + "status": "pass", + "_datasource_item_content_hash": "07bd0d39b771a2e3976c536264799dc3f2b6e5e943a8d68dc3058bac176de445", + "available_includes": [], + "datasource_item": { + "input": "I love this product!", + "ground_truth": "positive" + }, + "datasource_item_id": 0, + "eval_id": "eval_69288c6f782c8191900a5cb6be3db61e", + "results": [ + { + "name": "Sentiment grader-32bb5cf1-b6c0-4030-ba29-10a1dca004ac", + "score": 1.0, + "passed": true, + "sample": { + "input": [ + { + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "role": "user", + "content": "Statement: I love this product!" + } + ], + "output": [ + { + "role": "assistant", + "content": "{\n \"steps\": [\n {\n \"description\": \"The statement 'I love this product!' includes the word 'love,' indicating a strong positive emotion towards the product.\",\n \"conclusion\": \"The sentiment is positive.\"\n }\n ],\n \"result\": \"positive\"\n}" + } + ], + "finish_reason": "stop", + "model": "o3-mini-2025-01-31", + "usage": { + "total_tokens": 287, + "completion_tokens": 137, + "prompt_tokens": 150, + "cached_tokens": 0 + }, + "error": null, + "temperature": 1.0, + "top_p": 1.0 + } + } + ], + "run_id": "evalrun_69288c70a03881919438f1de10070910", + "sample": { + "input": [ + { + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "I love this product!" + } + ], + "output": [ + { + "role": "assistant", + "content": "That's great to hear! What product are you referring to? I'd love to know more about it and what you enjoy about it!" 
+ } + ], + "finish_reason": "stop", + "model": "gpt-4o-mini-2024-07-18", + "usage": { + "total_tokens": 48, + "completion_tokens": 26, + "prompt_tokens": 22, + "cached_tokens": 0 + }, + "error": null, + "temperature": 1.0, + "top_p": 1.0 + } + } + recorded_at: Thu, 27 Nov 2025 17:57:41 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_retrieve_run_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_retrieve_run_setup.yml new file mode 100644 index 00000000..b0641740 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_retrieve_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c6f782c8191900a5cb6be3db61e/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:52 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_8024ef0cea996ec159db9512d79643bb + Openai-Processing-Ms: + - '888' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '891' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - 
__cf_bm=OG6k0pEHTzDFHWR2hZstktrhqxb5xEApbyEXWLX4J8w-1764265072-1.0.1.1-0aVXMbN5qlCvwfOOYv.im0ZQN7B3cb4W52hJd2LtyueNQyeuwczOl6AHFZ5gaPajOy28Kn.BYIMKh1RqUcZZm7mEq_1KRSOaywG2rSXPviQ; + path=/; expires=Thu, 27-Nov-25 18:07:52 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=zX24zGeI3V58S_T1gJDemI.15.Ut328syKiyJ8nXK54-1764265072902-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53655af947ba0d-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c70a03881919438f1de10070910", + "object": "eval.run", + "created_at": 1764265072, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." 
+ }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6f782c8191900a5cb6be3db61e", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6f782c8191900a5cb6be3db61e?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c70a03881919438f1de10070910", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:52 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_retrieve_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_retrieve_setup.yml new file mode 100644 index 00000000..ac064fd8 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_retrieve_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: 
Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:51 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_db025b54c4df6310390ca1dd18fd378a + Openai-Processing-Ms: + - '317' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '320' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=udxiDFrwu5WKkGQ9Wvj3P_U_AnoaYhAp9zKt1nDyASU-1764265071-1.0.1.1-r03vlkmGOoO_2v_HrjcqpRTyPZ6rGYjbwYzO.KNtyhCU1Ku4ZRHVSF8mczGOzNlJMakxHihZ3jobRlOUXanSLN18EwHvSiQlZ_b27rr5e.o; + path=/; expires=Thu, 27-Nov-25 18:07:51 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=qk9zb8mfB6JGfHeQNy2ogyookxBtkWpGrh_Uhe_b8zA-1764265071717-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365574b9eb3ae-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c6f782c8191900a5cb6be3db61e", + "object": "eval", + "created_at": 1764265071, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + 
"null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-32bb5cf1-b6c0-4030-ba29-10a1dca004ac", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:51 GMT +recorded_with: VCR 
6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_retrieve.yml b/spec/fixtures/cassettes/evals_runs_retrieve.yml new file mode 100644 index 00000000..2e930997 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_retrieve.yml @@ -0,0 +1,123 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c6a0c488191ac6adc60180c4d03/runs/evalrun_69288c6a90708191b86c7e82b893c846 + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:47 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_46323b933a8c92d4fb5a19503306f7e9 + Openai-Processing-Ms: + - '135' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '138' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=qweBroIdUWsc2kwidjmrBoBjh0AgfmD9HIlYa7dr9U8-1764265067-1.0.1.1-s4MB7.JR0s1NVmhpFZtvnXP.ZjY7LBRRIGftgzUzTdfSaoEYyOXOl5b46LNRRoz_RSSRyhL_aRcnfjC259bzh4VuzydeG3pQrpbPzJ4pImQ; + path=/; expires=Thu, 27-Nov-25 18:07:47 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=tN3KJHPGcEeisrKyUp1v_Qxgl0niwbz0WWiaLrsuvfI-1764265067259-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53653c9a130d8b-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c6a90708191b86c7e82b893c846", + "object": "eval.run", + "created_at": 1764265066, + "status": 
"in_progress", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6a0c488191ac6adc60180c4d03", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6a0c488191ac6adc60180c4d03?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c6a90708191b86c7e82b893c846", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:47 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_retrieve_run_setup.yml b/spec/fixtures/cassettes/evals_runs_retrieve_run_setup.yml new file mode 100644 index 00000000..82827f0e --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_retrieve_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c6a0c488191ac6adc60180c4d03/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: 
+ - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:46 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_7376dfd4e698e39760ab9f9e861b62ba + Openai-Processing-Ms: + - '430' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '433' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=Ob0N0Dhz8GBj8HxJrxf5mkfWFLAmmMZQhy1uzMJu0ho-1764265066-1.0.1.1-VrQ2qANnuSoglCiTa7t5h0DFmGk93wFj.RDjRQoZio10n7A0..tRymAKWzWbH0LhYlfiCIMRePofe78ZAi4rtW1D.Pno7zI6O3cB2MDW_2U; + path=/; expires=Thu, 27-Nov-25 18:07:46 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=Vtmm6kZeYPTgBS0vw15wIJboU1CXLC3c04o.l7r55W4-1764265066871-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365385d0e89d9-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c6a90708191b86c7e82b893c846", + "object": "eval.run", + "created_at": 1764265066, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." 
+ }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6a0c488191ac6adc60180c4d03", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6a0c488191ac6adc60180c4d03?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c6a90708191b86c7e82b893c846", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:46 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_retrieve_setup.yml b/spec/fixtures/cassettes/evals_runs_retrieve_setup.yml new file mode 100644 index 00000000..ef5954b9 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_retrieve_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 
2025 17:37:46 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_327229bac7ba039a82b7a1f8e5e9115a + Openai-Processing-Ms: + - '215' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '218' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=_zFDCgf3us1YhS..PbKumYZeV7ao_jczl7G5ViGrC70-1764265066-1.0.1.1-ugnL3IdDBgW7rjNvXjez6DPCmq0TSqhMfUqfmLGMSE.0rtQu9qKKpxPaGTFdthGabPTwXNlOsfcDSAPRKhUGODnxYZwkKwbdQiOu3xwKaDQ; + path=/; expires=Thu, 27-Nov-25 18:07:46 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=hU4FpeK1yL8DXLn7h76pYGgqJNJBbeYLu8_.K2R6ifw-1764265066186-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53653538ffeb1a-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c6a0c488191ac6adc60180c4d03", + "object": "eval", + "created_at": 1764265066, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": 
"object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-05bd0d3d-fee4-42ce-a2ad-6c9a8ad3d6a1", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:46 GMT +recorded_with: VCR 6.3.1 diff --git 
a/spec/fixtures/cassettes/evals_update.yml b/spec/fixtures/cassettes/evals_update.yml new file mode 100644 index 00000000..c44a3a0c --- /dev/null +++ b/spec/fixtures/cassettes/evals_update.yml @@ -0,0 +1,276 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c67f948819192f601708f5599d9 + body: + encoding: UTF-8 + string: '{"metadata":{"modified":"true"}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:44 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_76577432037fb0cc3a49b85a478dd4ea + Openai-Processing-Ms: + - '250' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '253' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=LquyUkXNYgQfoMPEMVOLWFRNC_qD2hsiye2hDetrVF8-1764265064-1.0.1.1-9VDp9tH4CN_s_mg8W6Vfs8hkXfGh1fQy4keP3IgG719WIFoAGCpfkiaqn4PbLZHYnPziTRgbhsWHEqDbHepqoAmFdKvAx4jPAb.6gBLYRHk; + path=/; expires=Thu, 27-Nov-25 18:07:44 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=lZ7wtHiLT.0eAO_IUzVM9XwnK4S.DoHF2kk1gOfdPaE-1764265064596-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53652b4a49d85e-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c67f948819192f601708f5599d9", + "object": "eval", + "created_at": 1764265063, + "data_source_config": { + "type": "custom", + "max_items": null, + 
"schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment 
grader-49de7e11-beb1-436c-92da-bc8611135e48", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": { + "modified": "true" + } + } + recorded_at: Thu, 27 Nov 2025 17:37:44 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_update_setup.yml b/spec/fixtures/cassettes/evals_update_setup.yml new file mode 100644 index 00000000..26fec873 --- /dev/null +++ b/spec/fixtures/cassettes/evals_update_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:44 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - 
keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_74c5589fb7844e62af40aaf6e3f72341 + Openai-Processing-Ms: + - '681' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '684' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=VzK9NnpeChQPlG5syc9jH_ssnhnA1FT8uXdbcVhOGBs-1764265064-1.0.1.1-fsILAJMqlR.D5mGnLKpZkjt7NNjmn4tH0JS_Ln1ekBktkhX1ALQtvJL0rP9KGDaqARVgN_dT8bW9IpRvpp1ItulJC16qsrGkuW.8wQKwiCw; + path=/; expires=Thu, 27-Nov-25 18:07:44 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=M8ILSssH1N4.yHGKflKGVTVsLIwgAvMibM.p7ZJROas-1764265064110-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365249faaa62f-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c67f948819192f601708f5599d9", + "object": "eval", + "created_at": 1764265063, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + 
"type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-49de7e11-beb1-436c-92da-bc8611135e48", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:44 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/openai/client/evals_spec.rb b/spec/openai/client/evals_spec.rb new file mode 100644 index 00000000..d79289c0 --- /dev/null 
# Exercises the Evals endpoints (evals, eval runs and run output items) through
# OpenAI::Client, replaying recorded HTTP traffic from VCR cassettes.
#
# Every example group names its cassette via `let(:cassette)`. Prerequisite
# resources are created lazily by `eval_id` / `run_id`, which wrap their own
# requests in separate "<cassette> setup" / "<cassette> run setup" cassettes.
RSpec.describe OpenAI::Client do
  describe "#evals" do
    let(:client) { OpenAI::Client.new }

    # Request body used to create the eval under test: a custom data source
    # with a single string `input` field, graded by a label_model criterion.
    let(:eval_params) do
      {
        name: "Sentiment Analysis",
        data_source_config: {
          type: "custom",
          item_schema: {
            type: "object",
            properties: {
              input: { type: "string" }
            },
            required: ["input"]
          },
          include_sample_schema: true
        },
        testing_criteria: [
          {
            type: "label_model",
            model: "o3-mini",
            input: [
              { role: "developer",
                content: "Classify the sentiment of the following statement " \
                         "as one of 'positive', 'neutral', or 'negative'" },
              { role: "user", content: "Statement: {{item.input}}" }
            ],
            passing_labels: ["positive"],
            labels: %w[positive neutral negative],
            name: "Sentiment grader"
          }
        ]
      }
    end

    # Creates an eval inside the "<cassette> setup" cassette and returns its id.
    # `fetch` raises KeyError if the response carries no "id", rather than
    # letting a nil id be passed on to the requests that follow.
    let(:eval_id) do
      VCR.use_cassette("#{cassette} setup") do
        client.evals.create(parameters: eval_params).fetch("id")
      end
    end

    # Request body used to create a run: one inline item sampled with
    # gpt-4o-mini through a two-message template.
    let(:run_params) do
      {
        name: "Run 1",
        data_source: {
          type: "completions",
          input_messages: {
            type: "template",
            template: [
              {
                role: "developer",
                content: "You are a helpful assistant."
              },
              {
                role: "user",
                content: "{{item.input}}"
              }
            ]
          },
          model: "gpt-4o-mini",
          source: {
            type: "file_content",
            content: [
              {
                item: {
                  input: "I love this product!",
                  ground_truth: "positive"
                }
              }
            ]
          }
        }
      }
    end

    # Creates a run for `eval_id` inside the "<cassette> run setup" cassette and
    # returns its id. `eval_id` is resolved inside this block, so when it has
    # not been evaluated yet its own setup cassette is opened nested in this one.
    let(:run_id) do
      VCR.use_cassette("#{cassette} run setup") do
        client.evals.runs.create(eval_id: eval_id, parameters: run_params).fetch("id")
      end
    end

    describe "#retrieve" do
      let(:cassette) { "evals retrieve" }
      let(:response) { client.evals.retrieve(id: eval_id) }

      it "succeeds" do
        VCR.use_cassette(cassette) do
          expect(response["object"]).to eq("eval")
          expect(response["id"]).to eq(eval_id)
        end
      end
    end

    describe "#create" do
      let(:cassette) { "evals create" }
      let(:response) { client.evals.create(parameters: eval_params) }

      it "succeeds" do
        VCR.use_cassette(cassette) do
          expect(response["object"]).to eq("eval")
          expect(response["name"]).to eq("Sentiment Analysis")
        end
      end
    end

    describe "#update" do
      let(:cassette) { "evals update" }
      let(:response) do
        client.evals.update(
          id: eval_id,
          parameters: { metadata: { modified: "true" } }
        )
      end

      it "succeeds" do
        VCR.use_cassette(cassette) do
          expect(response["object"]).to eq("eval")
        end
      end
    end

    describe "#list", :vcr do
      let(:cassette) { "evals list" }
      let(:response) { client.evals.list }

      # Create an eval (in its own setup cassette) before the list is requested.
      before { eval_id }

      it "succeeds" do
        VCR.use_cassette(cassette) do
          expect(response["object"]).to eq("list")
          expect(response["data"]).to be_an(Array)
          # NOTE(review): this check is skipped when the page is empty; since an
          # eval is created beforehand, consider asserting it unconditionally.
          expect(response.dig("data", 0, "object")).to eq("eval") if response["data"].any?
        end
      end
    end

    describe "#runs" do
      describe "#list", :vcr do
        let(:cassette) { "evals runs list" }
        let(:response) { client.evals.runs.list(eval_id: eval_id) }

        # Create a run (and, through it, an eval) before the list is requested.
        before { run_id }

        it "succeeds" do
          VCR.use_cassette(cassette) do
            expect(response["object"]).to eq("list")
            expect(response["data"]).to be_an(Array)
            # NOTE(review): skipped when the page is empty; see the eval list example.
            expect(response.dig("data", 0, "object")).to eq("eval.run") if response["data"].any?
          end
        end
      end

      describe "#retrieve" do
        let(:cassette) { "evals runs retrieve" }
        let(:response) do
          client.evals.runs.retrieve(
            eval_id: eval_id,
            id: run_id
          )
        end

        it "succeeds" do
          VCR.use_cassette(cassette) do
            expect(response["object"]).to eq("eval.run")
            expect(response["id"]).to eq(run_id)
            expect(response["eval_id"]).to eq(eval_id)
          end
        end
      end

      describe "#create" do
        let(:cassette) { "evals runs create" }
        let(:response) do
          client.evals.runs.create(
            eval_id: eval_id,
            parameters: run_params
          )
        end

        it "succeeds" do
          VCR.use_cassette(cassette) do
            expect(response["object"]).to eq("eval.run")
            expect(response["eval_id"]).to eq(eval_id)
            expect(response["name"]).to eq("Run 1")
          end
        end
      end

      describe "#output_items" do
        describe "#list", :vcr do
          let(:cassette) { "evals runs output_items list" }
          let(:response) do
            client.evals.runs.output_items.list(
              eval_id: eval_id,
              run_id: run_id
            )
          end

          it "succeeds" do
            VCR.use_cassette(cassette) do
              expect(response["object"]).to eq("list")
              expect(response["data"]).to be_an(Array)
            end
          end
        end

        describe "#retrieve" do
          let(:cassette) { "evals runs output_items retrieve" }

          # Id of the first output item listed for the run. The `fetch` chain
          # raises (KeyError / IndexError) when the list has no "data" or is
          # empty, instead of failing later with NoMethodError on nil.
          let(:output_item_id) do
            listing = client.evals.runs.output_items.list(
              eval_id: eval_id,
              run_id: run_id
            )
            listing.fetch("data").fetch(0).fetch("id")
          end
          let(:response) do
            client.evals.runs.output_items.retrieve(
              eval_id: eval_id,
              run_id: run_id,
              id: output_item_id
            )
          end

          it "succeeds" do
            VCR.use_cassette(cassette) do
              expect(response["object"]).to eq("eval.run.output_item")
              expect(response["id"]).to eq(output_item_id)
            end
          end
        end
      end

      describe "#cancel" do
        let(:cassette) { "evals runs cancel" }
        let(:response) do
          client.evals.runs.cancel(
            eval_id: eval_id,
            id: run_id
          )
        end

        it "succeeds" do
          VCR.use_cassette(cassette) do
            expect(response["object"]).to eq("eval.run")
            expect(response["status"]).to eq("canceled")
          end
        end
      end

      describe "#delete" do
        let(:cassette) { "evals runs delete" }
        let(:response) do
          # The run is cancelled before it is deleted; both requests happen
          # inside the example's cassette. NOTE(review): presumably the API
          # will not delete a run that is still in progress - confirm.
          client.evals.runs.cancel(
            eval_id: eval_id,
            id: run_id
          )

          client.evals.runs.delete(
            eval_id: eval_id,
            id: run_id
          )
        end

        it "succeeds" do
          VCR.use_cassette(cassette) do
            expect(response["object"]).to eq("eval.run.deleted")
          end
        end
      end
    end

    describe "#delete" do
      let(:cassette) { "evals delete" }
      let(:response) { client.evals.delete(id: eval_id) }

      it "succeeds" do
        VCR.use_cassette(cassette) do
          expect(response["object"]).to eq("eval.deleted")
        end
      end
    end
  end
end