From dc9dde78a9e1a322e4111fb5bcec5e334733f525 Mon Sep 17 00:00:00 2001 From: Juan Arboleda <35846576+alzeck@users.noreply.github.com> Date: Thu, 27 Nov 2025 18:04:56 +0000 Subject: [PATCH 1/5] add support for evals --- lib/openai.rb | 1 + lib/openai/client.rb | 4 + lib/openai/evals.rb | 71 + spec/fixtures/cassettes/evals_create.yml | 277 + spec/fixtures/cassettes/evals_delete.yml | 74 + .../fixtures/cassettes/evals_delete_setup.yml | 277 + spec/fixtures/cassettes/evals_list.yml | 5411 +++++++++++++++++ spec/fixtures/cassettes/evals_list_setup.yml | 277 + spec/fixtures/cassettes/evals_retrieve.yml | 274 + .../cassettes/evals_retrieve_setup.yml | 277 + spec/fixtures/cassettes/evals_runs_cancel.yml | 125 + .../cassettes/evals_runs_cancel_run_setup.yml | 125 + .../cassettes/evals_runs_cancel_setup.yml | 277 + spec/fixtures/cassettes/evals_runs_create.yml | 125 + .../cassettes/evals_runs_create_setup.yml | 277 + spec/fixtures/cassettes/evals_runs_delete.yml | 196 + .../cassettes/evals_runs_delete_run_setup.yml | 125 + .../cassettes/evals_runs_delete_setup.yml | 277 + .../evals_runs_output_items_list.yml | 76 + ...ns_output_items_list_output_item_setup.yml | 76 + ...evals_runs_output_items_list_run_setup.yml | 125 + .../evals_runs_output_items_list_setup.yml | 277 + .../evals_runs_output_items_retrieve.yml | 303 + ...s_runs_output_items_retrieve_run_setup.yml | 125 + ...evals_runs_output_items_retrieve_setup.yml | 277 + .../cassettes/evals_runs_retrieve.yml | 123 + .../evals_runs_retrieve_run_setup.yml | 125 + .../cassettes/evals_runs_retrieve_setup.yml | 277 + spec/fixtures/cassettes/evals_update.yml | 276 + .../fixtures/cassettes/evals_update_setup.yml | 277 + spec/openai/client/evals_spec.rb | 274 + 31 files changed, 11081 insertions(+) create mode 100644 lib/openai/evals.rb create mode 100644 spec/fixtures/cassettes/evals_create.yml create mode 100644 spec/fixtures/cassettes/evals_delete.yml create mode 100644 spec/fixtures/cassettes/evals_delete_setup.yml create mode 100644 spec/fixtures/cassettes/evals_list.yml create mode 100644 spec/fixtures/cassettes/evals_list_setup.yml create mode 100644 spec/fixtures/cassettes/evals_retrieve.yml create mode 100644 spec/fixtures/cassettes/evals_retrieve_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_cancel.yml create mode 100644 spec/fixtures/cassettes/evals_runs_cancel_run_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_cancel_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_create.yml create mode 100644 spec/fixtures/cassettes/evals_runs_create_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_delete.yml create mode 100644 spec/fixtures/cassettes/evals_runs_delete_run_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_delete_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_output_items_list.yml create mode 100644 spec/fixtures/cassettes/evals_runs_output_items_list_output_item_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_output_items_list_run_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_output_items_list_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_output_items_retrieve.yml create mode 100644 spec/fixtures/cassettes/evals_runs_output_items_retrieve_run_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_output_items_retrieve_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_retrieve.yml create mode 100644 spec/fixtures/cassettes/evals_runs_retrieve_run_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_retrieve_setup.yml create mode 100644 spec/fixtures/cassettes/evals_update.yml create mode 100644 spec/fixtures/cassettes/evals_update_setup.yml create mode 100644 spec/openai/client/evals_spec.rb diff --git a/lib/openai.rb b/lib/openai.rb index d5880c90..9128bfaf 100644 --- a/lib/openai.rb +++ b/lib/openai.rb @@ -22,6 +22,7 @@ require_relative "openai/batches" require_relative "openai/usage" require_relative "openai/conversations" +require_relative "openai/evals" module OpenAI class Error < StandardError; end diff --git a/lib/openai/client.rb b/lib/openai/client.rb index 6054af0f..c9127861 100644 --- a/lib/openai/client.rb +++ b/lib/openai/client.rb @@ -109,6 +109,10 @@ def conversations @conversations ||= OpenAI::Conversations.new(client: self) end + def evals + @evals ||= OpenAI::Evals.new(client: self) + end + def azure? @api_type&.to_sym == :azure end diff --git a/lib/openai/evals.rb b/lib/openai/evals.rb new file mode 100644 index 00000000..bbbbdb95 --- /dev/null +++ b/lib/openai/evals.rb @@ -0,0 +1,71 @@ +module OpenAI + class Evals + def initialize(client:) + @client = client + end + + def create(parameters: {}) + @client.json_post(path: "/evals", parameters: parameters) + end + + def retrieve(id:) + @client.get(path: "/evals/#{id}") + end + + def update(id:, parameters: {}) + @client.json_post(path: "/evals/#{id}", parameters: parameters) + end + + def delete(id:) + @client.delete(path: "/evals/#{id}") + end + + def list(parameters: {}) + @client.get(path: "/evals", parameters: parameters) + end + + def runs + @runs ||= Runs.new(client: @client) + end + + class Runs + def initialize(client:) + @client = client + end + + def create(eval_id:, parameters: {}) + @client.json_post(path: "/evals/#{eval_id}/runs", parameters: parameters) + end + + def retrieve(eval_id:, id:) + @client.get(path: "/evals/#{eval_id}/runs/#{id}") + end + + def cancel(eval_id:, id:) + @client.post(path: "/evals/#{eval_id}/runs/#{id}/cancel") + end + + def delete(eval_id:, id:) + @client.delete(path: "/evals/#{eval_id}/runs/#{id}") + end + + def output_items + @output_items ||= OutputItems.new(client: @client) + end + + class OutputItems + def initialize(client:) + @client = client + end + + def list(eval_id:, run_id:, parameters: {}) + @client.get(path: "/evals/#{eval_id}/runs/#{run_id}/output_items", parameters: parameters) + end + + def retrieve(eval_id:, run_id:, id:) + @client.get(path: "/evals/#{eval_id}/runs/#{run_id}/output_items/#{id}") + end + end + end + end +end diff --git a/spec/fixtures/cassettes/evals_create.yml b/spec/fixtures/cassettes/evals_create.yml new file mode 100644 index 00000000..919537ef --- /dev/null +++ b/spec/fixtures/cassettes/evals_create.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:43 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_ffb92a5fdd9ce2220567f011a43dba2c + Openai-Processing-Ms: + - '555' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '557' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=G4GcoNWM5rT0kajORKFsYNgcXAz8HMsb5rZ8Zzkw_LI-1764265063-1.0.1.1-Dp9PBpRTcl2U.a5OAYevFIabxr3OC6hE7.9O3KIiWz6tB1_9SQ1VMKna9wP6_3b8KkVh3uQtSLsQ0_BXDzRXOfZyUuFUaoabN67UbMP4q64; + path=/; expires=Thu, 27-Nov-25 18:07:43 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=x0rOeErtaRsMGn4UZqdIWs3b9skWEly1bRr442iHDDE-1764265063034-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53651eddc83c84-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c66d7bc8191a3a478e4e5b174f7", + "object": "eval", + "created_at": 1764265062, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-8c5ece96-5e5d-4051-87a1-9af9a668be70", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:43 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_delete.yml b/spec/fixtures/cassettes/evals_delete.yml new file mode 100644 index 00000000..4760d4fd --- /dev/null +++ b/spec/fixtures/cassettes/evals_delete.yml @@ -0,0 +1,74 @@ +--- +http_interactions: +- request: + method: delete + uri: https://api.openai.com/v1/evals/eval_69288c753da081918baccc440dfb3cea + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:57 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_5b2c7c66080f297227416bf472b45866 + Openai-Processing-Ms: + - '267' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '269' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=FDivFwBSqSQaDjz.Sv2hSLAw1c.5lyvVl0nLrxBNXHo-1764265077-1.0.1.1-KkM8oLJRZq5PU2QFlnlZFTTWvrS77ZQkSbdDyJTCxO_5sOqJZ8VB8KXJw4lN0JxBa06O3AaCfEBncyzc78mA7scuegGDTEebpLCiXE4iF3U; + path=/; expires=Thu, 27-Nov-25 18:07:57 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=I7CzEHguum6GIZRgNj0yySn3bEm20mjveBzKgnyKSA0-1764265077911-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53657e4af94d65-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "eval.deleted", + "deleted": true, + "eval_id": "eval_69288c753da081918baccc440dfb3cea" + } + recorded_at: Thu, 27 Nov 2025 17:37:57 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_delete_setup.yml b/spec/fixtures/cassettes/evals_delete_setup.yml new file mode 100644 index 00000000..2afd5f32 --- /dev/null +++ b/spec/fixtures/cassettes/evals_delete_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:57 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_a48649a5da6f373a7dd1c03ad7727a2d + Openai-Processing-Ms: + - '181' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '184' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=gaAv6K1dIpaWENMBQCC5oWFC7WYA15BUV5ufk1SWx_Q-1764265077-1.0.1.1-_Rc0A7giRZmUc2b5hF6pZCpHFLyZscP93icF_6Lp9NDWaJhiafMjpahB2ETzDa9c6vS1QBMbHTvgr04kIRJZpwg7DX8lHQ4Mwbm6d5z6S7E; + path=/; expires=Thu, 27-Nov-25 18:07:57 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=1rBJq8YPuzzVcpl.U_aK0HikFl_72LTQ.QmqttbpqPk-1764265077383-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53657b9e2fcc9e-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c753da081918baccc440dfb3cea", + "object": "eval", + "created_at": 1764265077, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-a4525dea-03ff-497b-bdd4-7d1d1bdbb3d1", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:57 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_list.yml b/spec/fixtures/cassettes/evals_list.yml new file mode 100644 index 00000000..54873fc8 --- /dev/null +++ b/spec/fixtures/cassettes/evals_list.yml @@ -0,0 +1,5411 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:45 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_09908134b92384fc64c2e1a044fc1b8f + Openai-Processing-Ms: + - '349' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '365' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=a9lHm_ooC54LUqxpoUst1vazPY71OQaHdrCBcnTLJ8Y-1764265065-1.0.1.1-PvPVksJlQfOKII3YuqWv76rdDKlmg3Af7t.kcEILylSWKihcvR2SUFc.At3ilkSNU3DxtN1PWnAFTzSeGiuIxObyz7ifqWs2aR6jOudJDM0; + path=/; expires=Thu, 27-Nov-25 18:07:45 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=7IH9dxNsKoIkP3ir_C4yjxfvA8TzZJyRidP6c.KKy7w-1764265065684-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365315dbb97f8-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [ + { + "id": "eval_692886ca71948191a94a46ef2866fa38", + "object": "eval", + "created_at": 1764263626, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-9ec23aee-5784-4532-b734-4eaa1441c1b4", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692886c81f0c81918301673fd48074e3", + "object": "eval", + "created_at": 1764263624, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-b605af03-d2b3-4219-aabc-bdbbd02864c3", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692886c607dc819198c3b71cbf375088", + "object": "eval", + "created_at": 1764263622, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-5d95bb83-b098-4c8c-9ccc-1f327b7c680f", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692886c4402c8191b16daf0a13927d55", + "object": "eval", + "created_at": 1764263620, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-a5f01088-9cd6-484c-b16c-4239b8ce247f", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692886c286c08191a59229de96f2519c", + "object": "eval", + "created_at": 1764263618, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-6ab333ba-8b63-4868-ba3d-8444b9f0b788", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692886c0c2e88191b175224e13418b17", + "object": "eval", + "created_at": 1764263616, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-ea13e537-3285-4116-87ea-d26232d4e433", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692886bf65fc8191991a4848d677e502", + "object": "eval", + "created_at": 1764263615, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-ecffb0f6-e04c-470b-a089-71512d3f7e2e", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692886be83748191aa1ecb05c5db958e", + "object": "eval", + "created_at": 1764263614, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-ec2a502e-e03d-4b4e-920f-c9996764023a", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": { + "modified": "true" + } + }, + { + "id": "eval_692886bd90d8819198391114178d4134", + "object": "eval", + "created_at": 1764263613, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-a8d9c70b-5062-40ad-b882-6d7d2b4835d7", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692886bc33dc81919ec1696f9e931ff4", + "object": "eval", + "created_at": 1764263612, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-1bbd590e-ef4f-462e-a050-094b3925482c", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_69288535779c8191a0bed3405c7a72ab", + "object": "eval", + "created_at": 1764263221, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-9f9e21ad-adcd-4f8e-b359-b9aa6a34a974", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692885332d10819180c74603e6462a4c", + "object": "eval", + "created_at": 1764263219, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-3b0cec65-bd94-41a5-a67c-894300044c4a", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_69288530792c819181f87315e2cf7a98", + "object": "eval", + "created_at": 1764263216, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-8d9f1ee8-2ca7-4ee7-9775-84fd81f188ac", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_6928852eacfc8191939c094b7bf768a3", + "object": "eval", + "created_at": 1764263214, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-261fabf9-7bb5-497b-8103-a989a3d5780a", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_6928852cfc9c81918d7e5fee3762e782", + "object": "eval", + "created_at": 1764263212, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-6b2951b4-82b6-4471-b494-2f362485c8ba", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_6928852ae6588191801b3c819b2ec864", + "object": "eval", + "created_at": 1764263210, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-27b6b470-b897-47a2-84a5-bae598fe8475", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_692885294e408191883f13c67b96cc77", + "object": "eval", + "created_at": 1764263209, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-724f7a30-b74b-49d5-8fbc-abe93f386347", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_69288527f2548191a4a1550aa8ae4962", + "object": "eval", + "created_at": 1764263207, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-0abe0176-b62b-4dd6-96a1-3f30623a53f4", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": { + "modified": "true" + } + }, + { + "id": "eval_692885254d4c8191b44f0366019b69c4", + "object": "eval", + "created_at": 1764263205, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input" + ], + "properties": { + "input": { + "type": "string" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-d2a3c749-e56a-425d-8adf-8ec9ae2d2b45", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + }, + { + "id": "eval_69288313e4408191922bf1863f2ba432", + "object": "eval", + "created_at": 1764262675, + "data_source_config": { + "type": "logs", + "max_items": null, + "schema": { + "required": [ + "item", + "sample" + ], + "properties": { + "item": { + "required": [ + "input", + "output" + ], + "title": "LogsItemSchema", + "properties": { + "output": { + "items": { + "required": [ + "model", + "output" + ], + "title": "ResponseInputSample", + "properties": { + "model": { + "title": "Model", + "type": "string" + }, + "output": { + "items": { + "required": [ + "role", + "content" + ], + "title": "ChatMessage", + "properties": { + "tool_call_id": { + "default": null, + "title": "Tool Call Id", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "finish_reason": { + "default": null, + "title": "Finish Reason", + "anyOf": [ + { + "enum": [ + "stop", + "length", + "tool_calls", + "content_filter", + "function_call" + ], + "type": "string" + }, + { + "type": "null" + } + ] + }, + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "required": [ + "text", + "type" + ], + "additionalProperties": true, + "title": "ResponseInputText", + "properties": { + "text": { + "title": "Text", + "type": "string" + }, + "type": { + "const": "input_text", + "title": "Type", + "type": "string" + } + }, + "type": "object" + }, + { + "required": [ + "detail", + "type" + ], + "additionalProperties": true, + "title": "ResponseInputImage", + "properties": { + "file_id": { + "default": null, + "title": "File Id", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "detail": { + "title": "Detail", + "enum": [ + "low", + "high", + "auto" + ], + "type": "string" + }, + "type": { + "const": "input_image", + "title": "Type", + "type": "string" + }, + "image_url": { + "default": null, + "title": "Image Url", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + } + }, + "type": "object" + }, + { + "required": [ + "annotations", + "text", + "type" + ], + "additionalProperties": true, + "title": "ResponseOutputText", + "properties": { + "text": { + "title": "Text", + "type": "string" + }, + "type": { + "const": "output_text", + "title": "Type", + "type": "string" + }, + "logprobs": { + "default": null, + "title": "Logprobs", + "anyOf": [ + { + "items": { + "required": [ + "token", + "bytes", + "logprob", + "top_logprobs" + ], + "additionalProperties": true, + "title": "Logprob", + "properties": { + "bytes": { + "items": { + "type": "integer" + }, + "title": "Bytes", + "type": "array" + }, + "token": { + "title": "Token", + "type": "string" + }, + "top_logprobs": { + "items": { + "required": [ + "token", + "bytes", + "logprob" + ], + "additionalProperties": true, + "title": "LogprobTopLogprob", + "properties": { + "bytes": { + "items": { + "type": "integer" + }, + "title": "Bytes", + "type": "array" + }, + "token": { + "title": "Token", + "type": "string" + }, + "logprob": { + "title": "Logprob", + "type": "number" + } + }, + "type": "object" + }, + "title": "Top Logprobs", + "type": "array" + }, + "logprob": { + "title": "Logprob", + "type": "number" + } + }, + "type": "object" + }, + "type": "array" + }, + { + "type": "null" + } + ] + }, + "annotations": { + "items": { + "anyOf": [ + { + "required": [ + "file_id", + "filename", + "index", + "type" + ], + "additionalProperties": true, + "title": "AnnotationFileCitation", + "properties": { + "file_id": { + "title": "File Id", + "type": "string" + }, + "index": { + "title": "Index", + "type": "integer" + }, + "type": { + "const": "file_citation", + "title": "Type", + "type": "string" + }, + "filename": { + "title": "Filename", + "type": "string" + } + }, + "type": "object" + }, + { + "required": [ + "end_index", + "start_index", + "title", + "type", + "url" + ], + "additionalProperties": true, + "title": "AnnotationURLCitation", + "properties": { + "start_index": { + "title": "Start Index", + "type": "integer" + }, + "end_index": { + "title": "End Index", + "type": "integer" + }, + "title": { + "title": "Title", + "type": "string" + }, + "type": { + "const": "url_citation", + "title": "Type", + "type": "string" + }, + "url": { + "title": "Url", + "type": "string" + } + }, + "type": "object" + }, + { + "required": [ + "container_id", + "end_index", + "file_id", + "filename", + "start_index", + "type" + ], + "additionalProperties": true, + "title": "AnnotationContainerFileCitation", + "properties": { + "start_index": { + "title": "Start Index", + "type": "integer" + }, + "end_index": { + "title": "End Index", + "type": "integer" + }, + "type": { + "const": "container_file_citation", + "title": "Type", + "type": "string" + }, + "filename": { + "title": "Filename", + "type": "string" + }, + "file_id": { + "title": "File Id", + "type": "string" + }, + "container_id": { + "title": "Container Id", + "type": "string" + } + }, + "type": "object" + }, + { + "required": [ + "file_id", + "index", + "type" + ], + "additionalProperties": true, + "title": "AnnotationFilePath", + "properties": { + "file_id": { + "title": "File Id", + "type": "string" + }, + "index": { + "title": "Index", + "type": "integer" + }, + "type": { + "const": "file_path", + "title": "Type", + "type": "string" + } + }, + "type": "object" + } + ] + }, + "title": "Annotations", + "type": "array" + } + }, + "type": "object" + }, + { + "required": [ + "type", + "input_audio" + ], + "title": "ResponseInputAudio", + "properties": { + "type": { + "const": "input_audio", + "title": "Type", + "type": "string" + }, + "input_audio": { + "required": [ + "data" + ], + "title": "AudioData", + "properties": { + "data": { + "title": "Data", + "type": "string" + }, + "format": { + "default": "wav", + "title": "Format", + "enum": [ + "wav", + "mp3" + ], + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + }, + { + "required": [ + "type", + "output_audio" + ], + "title": "ResponseOutputAudio", + "properties": { + "audio_transcript": { + "default": null, + "title": "Audio Transcript", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "output_audio": { + "required": [ + "data" + ], + "title": "AudioData", + "properties": { + "data": { + "title": "Data", + "type": "string" + }, + "format": { + "default": "wav", + "title": "Format", + "enum": [ + "wav", + "mp3" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": { + "const": "output_audio", + "title": "Type", + "type": "string" + } + }, + "type": "object" + } + ] + }, + "type": "array" + } + ] + }, + "role": { + "title": "Role", + "enum": [ + "system", + "user", + "assistant", + "developer", + "tool", + "function" + ], + "type": "string" + }, + "tool_calls": { + "default": null, + "title": "Tool Calls", + "anyOf": [ + { + "items": { + "required": [ + "type", + "function", + "id" + ], + "title": "FunctionCall", + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "title": "Function", + "properties": { + "return_value": { + "default": null, + "title": "Return Value", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "name": { + "title": "Name", + "type": "string" + }, + "arguments": { + "title": "Arguments", + "type": "string" + } + }, + "type": "object" + }, + "id": { + "title": "Id", + "type": "string" + }, + "type": { + "const": "function", + "title": "Type", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + { + "type": "null" + } + ] + }, + "reasoning_summary": { + "default": null, + "title": "Reasoning Summary", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "name": { + "default": null, + "title": "Name", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "function_call": { + "default": null, + "anyOf": [ + { + "required": [ + "name", + "arguments" + ], + "title": "Function", + "properties": { + "return_value": { + "default": null, + "title": "Return Value", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "name": { + "title": "Name", + "type": "string" + }, + "arguments": { + "title": "Arguments", + "type": "string" + } + }, + "type": "object" + }, + { + "type": "null" + } + ] + }, + "refusal": { + "default": null, + "title": "Refusal", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "trace_id": { + "default": null, + "title": "Trace Id", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + } + }, + "type": "object" + }, + "title": "Output", + "type": "array" + } + }, + "type": "object" + }, + "title": "Output", + "type": "array" + }, + "input": { + "items": { + "required": [ + "role", + "content" + ], + "title": "ChatMessage", + "properties": { + "tool_call_id": { + "default": null, + "title": "Tool Call Id", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "finish_reason": { + "default": null, + "title": "Finish Reason", + "anyOf": [ + { + "enum": [ + "stop", + "length", + "tool_calls", + "content_filter", + "function_call" + ], + "type": "string" + }, + { + "type": "null" + } + ] + }, + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "required": [ + "text", + "type" + ], + "additionalProperties": true, + "title": "ResponseInputText", + "properties": { + "text": { + "title": "Text", + "type": "string" + }, + "type": { + "const": "input_text", + "title": "Type", + "type": "string" + } + }, + "type": "object" + }, + { + "required": [ + "detail", + "type" + ], + "additionalProperties": true, + "title": "ResponseInputImage", + "properties": { + "file_id": { + "default": null, + "title": "File Id", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "detail": { + "title": "Detail", + "enum": [ + "low", + "high", + "auto" + ], + "type": "string" + }, + "type": { + "const": "input_image", + "title": "Type", + "type": "string" + }, + "image_url": { + "default": null, + "title": "Image Url", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + } + }, + "type": "object" + }, + { + "required": [ + "annotations", + "text", + "type" + ], + "additionalProperties": true, + "title": "ResponseOutputText", + "properties": { + "text": { + "title": "Text", + "type": "string" + }, + "type": { + "const": "output_text", + "title": "Type", + "type": "string" + }, + "logprobs": { + "default": null, + "title": "Logprobs", + "anyOf": [ + { + "items": { + "required": [ + "token", + "bytes", + "logprob", + "top_logprobs" + ], + "additionalProperties": true, + "title": "Logprob", + "properties": { + "bytes": { + "items": { + "type": "integer" + }, + "title": "Bytes", + "type": "array" + }, + "token": { + "title": "Token", + "type": "string" + }, + "top_logprobs": { + "items": { + "required": [ + "token", + "bytes", + "logprob" + ], + "additionalProperties": true, + "title": "LogprobTopLogprob", + "properties": { + "bytes": { + "items": { + "type": "integer" + }, + "title": "Bytes", + "type": "array" + }, + "token": { + "title": "Token", + "type": "string" + }, + "logprob": { + "title": "Logprob", + "type": "number" + } + }, + "type": "object" + }, + "title": "Top Logprobs", + "type": "array" + }, + "logprob": { + "title": "Logprob", + "type": "number" + } + }, + "type": "object" + }, + "type": "array" + }, + { + "type": "null" + } + ] + }, + "annotations": { + "items": { + "anyOf": [ + { + "required": [ + "file_id", + "filename", + "index", + "type" + ], + "additionalProperties": true, + "title": "AnnotationFileCitation", + "properties": { + "file_id": { + "title": "File Id", + "type": "string" + }, + "index": { + "title": "Index", + "type": "integer" + }, + "type": { + "const": "file_citation", + "title": "Type", + "type": "string" + }, + "filename": { + "title": "Filename", + "type": "string" + } + }, + "type": "object" + }, + { + "required": [ + "end_index", + "start_index", + "title", + "type", + "url" + ], + "additionalProperties": true, + "title": "AnnotationURLCitation", + "properties": { + "start_index": { + "title": "Start Index", + "type": "integer" + }, + "end_index": { + "title": "End Index", + "type": "integer" + }, + "title": { + "title": "Title", + "type": "string" + }, + "type": { + "const": "url_citation", + "title": "Type", + "type": "string" + }, + "url": { + "title": "Url", + "type": "string" + } + }, + "type": "object" + }, + { + "required": [ + "container_id", + "end_index", + "file_id", + "filename", + "start_index", + "type" + ], + "additionalProperties": true, + "title": "AnnotationContainerFileCitation", + "properties": { + "start_index": { + "title": "Start Index", + "type": "integer" + }, + "end_index": { + "title": "End Index", + "type": "integer" + }, + "type": { + "const": "container_file_citation", + "title": "Type", + "type": "string" + }, + "filename": { + "title": "Filename", + "type": "string" + }, + "file_id": { + "title": "File Id", + "type": "string" + }, + "container_id": { + "title": "Container Id", + "type": "string" + } + }, + "type": "object" + }, + { + "required": [ + "file_id", + "index", + "type" + ], + "additionalProperties": true, + "title": "AnnotationFilePath", + "properties": { + "file_id": { + "title": "File Id", + "type": "string" + }, + "index": { + "title": "Index", + "type": "integer" + }, + "type": { + "const": "file_path", + "title": "Type", + "type": "string" + } + }, + "type": "object" + } + ] + }, + "title": "Annotations", + "type": "array" + } + }, + "type": "object" + }, + { + "required": [ + "type", + "input_audio" + ], + "title": "ResponseInputAudio", + "properties": { + "type": { + "const": "input_audio", + "title": "Type", + "type": "string" + }, + "input_audio": { + "required": [ + "data" + ], + "title": "AudioData", + "properties": { + "data": { + "title": "Data", + "type": "string" + }, + "format": { + "default": "wav", + "title": "Format", + "enum": [ + "wav", + "mp3" + ], + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + }, + { + "required": [ + "type", + "output_audio" + ], + "title": "ResponseOutputAudio", + "properties": { + "audio_transcript": { + "default": null, + "title": "Audio Transcript", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "output_audio": { + "required": [ + "data" + ], + "title": "AudioData", + "properties": { + "data": { + "title": "Data", + "type": "string" + }, + "format": { + "default": "wav", + "title": "Format", + "enum": [ + "wav", + "mp3" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": { + "const": "output_audio", + "title": "Type", + "type": "string" + } + }, + "type": "object" + } + ] + }, + "type": "array" + } + ] + }, + "role": { + "title": "Role", + "enum": [ + "system", + "user", + "assistant", + "developer", + "tool", + "function" + ], + "type": "string" + }, + "tool_calls": { + "default": null, + "title": "Tool Calls", + "anyOf": [ + { + "items": { + "required": [ + "type", + "function", + "id" + ], + "title": "FunctionCall", + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "title": "Function", + "properties": { + "return_value": { + "default": null, + "title": "Return Value", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "name": { + "title": "Name", + "type": "string" + }, + "arguments": { + "title": "Arguments", + "type": "string" + } + }, + "type": "object" + }, + "id": { + "title": "Id", + "type": "string" + }, + "type": { + "const": "function", + "title": "Type", + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + { + "type": "null" + } + ] + }, + "reasoning_summary": { + "default": null, + "title": "Reasoning Summary", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "name": { + "default": null, + "title": "Name", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "function_call": { + "default": null, + "anyOf": [ + { + "required": [ + "name", + "arguments" + ], + "title": "Function", + "properties": { + "return_value": { + "default": null, + "title": "Return Value", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "name": { + "title": "Name", + "type": "string" + }, + "arguments": { + "title": "Arguments", + "type": "string" + } + }, + "type": "object" + }, + { + "type": "null" + } + ] + }, + "refusal": { + "default": null, + "title": "Refusal", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "trace_id": { + "default": null, + "title": "Trace Id", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ] + } + }, + "type": "object" + }, + "title": "Input", + "type": "array" + } + }, + "type": "object" + }, + "sample": { + "required": [ + "model", + "choices" + ], + "properties": { + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "output_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "model": { + "type": "string" + }, + "input_tools": { + "items": { + "type": "object" + }, + "type": "array" + }, + "output_json": { + "type": "object" + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "choices": { + "items": { + "required": [ + "index", + "message", + "finish_reason" + ], + "properties": { + "message": { + "required": [ + "role" + ], + "properties": { + "role": { + "enum": [ + "assistant" + ], + "type": "string" + }, + "function_call": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": [ + "object", + "null" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "tool_calls": { + "items": { + "required": [ + "type", + "function", + "id" + ], + "properties": { + "function": { + "required": [ + "name", + "arguments" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "type": "object" + }, + "id": { + "type": "string" + }, + "type": { + "enum": [ + "function" + ], + "type": "string" + } + }, + "type": "object" + }, + "type": [ + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + } + }, + "type": "object" + }, + "finish_reason": { + "type": "string" + } + }, + "type": "object" + }, + "type": "array" + }, + "output_text": { + "type": "string" + } + }, + "type": "object" + } + }, + "type": "object" + }, + "metadata": { + "usecase": "chatbot" + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-8dee6df0-c9c3-4d4f-b500-c21b54aab4c9", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + ], + "first_id": "eval_692886ca71948191a94a46ef2866fa38", + "has_more": true, + "last_id": "eval_69288313e4408191922bf1863f2ba432" + } + recorded_at: Thu, 27 Nov 2025 17:37:45 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_list_setup.yml b/spec/fixtures/cassettes/evals_list_setup.yml new file mode 100644 index 00000000..8d4d6769 --- /dev/null +++ b/spec/fixtures/cassettes/evals_list_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:45 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_ad59a4421109d03bf16f0dd5d568cb43 + Openai-Processing-Ms: + - '187' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '189' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=2URRA0QzprNNuNe_uY_bAL3M.w.SeU2zUU2uu7Rgmz0-1764265065-1.0.1.1-vCfd9v8Qx1oKdvy8E2piuOxlGUK3rPotuNjTxl6IQYJ5WVaFqosWj4aLGxWLszSIoI0n04TnlJXIP.7QK4mv0e4P5vPOsuKUcXtA8Li09Yc; + path=/; expires=Thu, 27-Nov-25 18:07:45 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=YlC8_GZoyq1p_SeriLDaxuZ8sWYRFXIDpqQJzx6Rz.4-1764265065064-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53652e5cc9b11c-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c68ed50819185ed5845facf0be6", + "object": "eval", + "created_at": 1764265064, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-cd6a1097-7691-4502-a5f0-49c7e0043428", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:45 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_retrieve.yml b/spec/fixtures/cassettes/evals_retrieve.yml new file mode 100644 index 00000000..aef193a5 --- /dev/null +++ b/spec/fixtures/cassettes/evals_retrieve.yml @@ -0,0 +1,274 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c65203881918b2f062b30cd7aa9 + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:42 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_34dbc89ee9874a57aa011f2e89ff09c1 + Openai-Processing-Ms: + - '64' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '67' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=u9ZriP.qwIAjZx8EdcGA_lIdQSy3JoUCEUCsrPXIkpA-1764265062-1.0.1.1-LNP5FQOOpKBdaIc3C2ccuYy.iL1pp_bMijvzoBpBvlWXNbP0g8wxMdoCOzPwpuOjSF8v3iva6oKWgOm89yNOz6qPnPaZSgHd9uPKbeJYKP4; + path=/; expires=Thu, 27-Nov-25 18:07:42 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=VCPQsuc9dpzChvitHps0wIlQBQgFbIHfaNT3JZ0f3XA-1764265062088-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365199dad35c5-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c65203881918b2f062b30cd7aa9", + "object": "eval", + "created_at": 1764265061, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-4e90c6d8-f26c-4cb4-a339-41b1b1bb0532", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:42 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_retrieve_setup.yml b/spec/fixtures/cassettes/evals_retrieve_setup.yml new file mode 100644 index 00000000..af74513e --- /dev/null +++ b/spec/fixtures/cassettes/evals_retrieve_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:41 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_93497c36b5e84c57ad0a8e3d9de98b23 + Openai-Processing-Ms: + - '560' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '563' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=pLXAPhTauwRvA5Vu6GbOt1q0_W6MjmZflR3Trnvy8Z4-1764265061-1.0.1.1-N39qqUPxahcowOUZhs4ckWv3aC_u8dg2t9h8L492QtXIcewUsoTNAwIACQ6T8z8JFW6AWgAWg0AfaTZC5aEnahw312LxhY4ItvhkktWXlxM; + path=/; expires=Thu, 27-Nov-25 18:07:41 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=GzrCuuyexD.2XVBCWz_QtDE76Mqhxn7C3kVHuxbsHNY-1764265061277-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365124ca72216-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c65203881918b2f062b30cd7aa9", + "object": "eval", + "created_at": 1764265061, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-4e90c6d8-f26c-4cb4-a339-41b1b1bb0532", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:41 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_cancel.yml b/spec/fixtures/cassettes/evals_runs_cancel.yml new file mode 100644 index 00000000..7510d981 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_cancel.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c7132e48191ad7ff46c32cf1c46/runs/evalrun_69288c71b74c819183f7e7ed01b4d5ff/cancel + body: + encoding: UTF-8 + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Content-Length: + - '0' + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:54 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_9ce39fdd8e538dc9a6878bf4d0daa643 + Openai-Processing-Ms: + - '670' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '672' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=4b4Q3YWwvFU9VANtkY3qjcqR_gDeiqYZCwJIyxsrEjU-1764265074-1.0.1.1-ymldn.MFe8F6uLfJ4vp5MhutNsXSjTy.gBbXOCzxLpiVbDIzmkPqtJS6g8.yD86fizxh3Qf.31nb7jxqc0v62ntUCa8wtPt2laRT4wtO7SE; + path=/; expires=Thu, 27-Nov-25 18:07:54 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=kVuspCEagYsUYNL.F9tTWd5XOiXGUoSC6cFPRSIZe.s-1764265074927-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365693b274f47-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c71b74c819183f7e7ed01b4d5ff", + "object": "eval.run", + "created_at": 1764265073, + "status": "canceled", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c7132e48191ad7ff46c32cf1c46", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c7132e48191ad7ff46c32cf1c46?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c71b74c819183f7e7ed01b4d5ff", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:54 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_cancel_run_setup.yml b/spec/fixtures/cassettes/evals_runs_cancel_run_setup.yml new file mode 100644 index 00000000..dba503e1 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_cancel_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c7132e48191ad7ff46c32cf1c46/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:54 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_0093cd2118771c8e402a60b5407cc55a + Openai-Processing-Ms: + - '406' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '408' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=uM0mV1hkYyLOKIK2GYHYsHFckJcIb2swTCF8MGpNWRQ-1764265074-1.0.1.1-MVHCmdcMYkqpRZoXUHf1Xaqx4uSf2gOzKUZef1C2cA11076.c5EnrNdYpkEC9Plv.aSuHmwDcU8ymp8rmkRfLcdBBpKEldY1LB4VhoKgS1A; + path=/; expires=Thu, 27-Nov-25 18:07:54 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=4CiPhk45QtL_dsOtL13kGKXPphdXg2Qyj1NKAWiyHTI-1764265074004-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536564fcdcb0fb-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c71b74c819183f7e7ed01b4d5ff", + "object": "eval.run", + "created_at": 1764265073, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c7132e48191ad7ff46c32cf1c46", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c7132e48191ad7ff46c32cf1c46?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c71b74c819183f7e7ed01b4d5ff", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:53 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_cancel_setup.yml b/spec/fixtures/cassettes/evals_runs_cancel_setup.yml new file mode 100644 index 00000000..a25921ad --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_cancel_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:53 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_95082c9c904d2480498f6f2999b250ac + Openai-Processing-Ms: + - '178' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '181' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=a7Te_D7MzgN1F9UQdvVgcdF66cnb72ZVkd7UpZRWGEE-1764265073-1.0.1.1-eYABxh1jILbTiLG6XrF.sWnR_vYqKaf7DPw91z9BFU_NJHkWacIIj5NEgutT3e0Rn.2VhJ1TXNP3vkFcu6rE1cqQsrOp_OwWQ6.JR4IOpdU; + path=/; expires=Thu, 27-Nov-25 18:07:53 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=.Ivd8N1lBfEG2ehGEYjU_vQM.irXnKU.iXYfQEgtkqU-1764265073337-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365625eb4a935-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c7132e48191ad7ff46c32cf1c46", + "object": "eval", + "created_at": 1764265073, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-27eb57ce-63cc-42fb-aa8b-6f3ab2cf73b8", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:53 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_create.yml b/spec/fixtures/cassettes/evals_runs_create.yml new file mode 100644 index 00000000..906b6003 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_create.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c6ba2348191be05c2d02d0e0c0b/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:48 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_a227f8a26a2131d8473e1c92074eec46 + Openai-Processing-Ms: + - '760' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '763' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=9ZswAkI3q8s5c6ThS5HjOpifSgedQWKidh4TOo7iv3w-1764265068-1.0.1.1-5Yims_ZHAyjNs7xrqmoYvMK3buvL3vGW1YIhz5oCF.FUvl1TnFPMg4zlbLZ7ScgFbFNlFTrwtzN8lCVxXd3.SjsaGt3sI00xabiVGrQdZfE; + path=/; expires=Thu, 27-Nov-25 18:07:48 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=WZSrX35UqN8Z.mHJ69EsSjq5J3..4kRCvd91oQHmxCI-1764265068789-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365422eff2210-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c6c25fc8191ac632b505a3ff1c9", + "object": "eval.run", + "created_at": 1764265068, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6ba2348191be05c2d02d0e0c0b", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6ba2348191be05c2d02d0e0c0b?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c6c25fc8191ac632b505a3ff1c9", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:48 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_create_setup.yml b/spec/fixtures/cassettes/evals_runs_create_setup.yml new file mode 100644 index 00000000..a866fb21 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_create_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:47 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_96f911be32e1840672acd68fd0dea03f + Openai-Processing-Ms: + - '234' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '237' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=iSICdk.EdIDyle.RQysHEXmyES4qOD6qYZrmaELQLDA-1764265067-1.0.1.1-_E3nGyLYoebWy.Ed9MMjOgHi28ZepbGSTPvdl7w161O2ex_rTGDSlvf2p5c_uOnYMNQRWXYJ.HxGByHkU4miSAhurDxSKB.6hDN8ZIOX_mg; + path=/; expires=Thu, 27-Nov-25 18:07:47 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=vOcml4pSfdEN84KBG_RBgnPJ2nrh14amro7Q1K0d4xs-1764265067769-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53653f0c150c19-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c6ba2348191be05c2d02d0e0c0b", + "object": "eval", + "created_at": 1764265067, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-9474cccc-047e-4ddf-bb6f-b983ca6aef0d", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:47 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_delete.yml b/spec/fixtures/cassettes/evals_runs_delete.yml new file mode 100644 index 00000000..45012dc0 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_delete.yml @@ -0,0 +1,196 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288d369be48191a9dd469d6cb22c9f/runs/evalrun_69288d37912c81919e794ac84108363b/cancel + body: + encoding: UTF-8 + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Content-Length: + - '0' + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:41:13 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_0e5f5722ba2a4e819f366b5b08f03d05 + Openai-Processing-Ms: + - '555' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '558' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=HUQa2g4EML8HJ0fSYEyP8Y1LvFWAs0vMtx_2K2jXggA-1764265273-1.0.1.1-wLir27KcO3TJiEsJtTgjKpw02lSnGmQNyqlkZ4ljpvWaIsI5f4TrbxqT2Fi9klEzlfvGbpNyTaVJjcazBJ0Cu5Ezr_yl_Rns_F_9yxrGoFM; + path=/; expires=Thu, 27-Nov-25 18:11:13 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=VPa8xE7AHfpB9ZpDZZ8Tb0x8BiBplf_CE5ylsOX4FZE-1764265273069-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536a403929eb17-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288d37912c81919e794ac84108363b", + "object": "eval.run", + "created_at": 1764265271, + "status": "canceled", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288d369be48191a9dd469d6cb22c9f", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288d369be48191a9dd469d6cb22c9f?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288d37912c81919e794ac84108363b", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:41:13 GMT +- request: + method: delete + uri: https://api.openai.com/v1/evals/eval_69288d369be48191a9dd469d6cb22c9f/runs/evalrun_69288d37912c81919e794ac84108363b + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:41:14 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_f31add9c41781b9ef7f694152f59dadf + Openai-Processing-Ms: + - '763' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '767' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=f9nvHlPi3UV7AK.TJumxczUKk1dIQDhExsuDFexiC.o-1764265274-1.0.1.1-GRqhzByoSCF1KKz1tZXP23mtv9XboeAZfYge3kRkZ2xpVGDqAuxUOpqK8oxTahqMEYXBH1NvIDolFuhNGFmlJsbU2s_qGdYlnjxe5odg_Ss; + path=/; expires=Thu, 27-Nov-25 18:11:14 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=kt_WqBTaaNSKPkhG35J4JsD8KAwC3rcdQEFaxw2b_do-1764265274323-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536a454a12b140-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "eval.run.deleted", + "deleted": true, + "run_id": "evalrun_69288d37912c81919e794ac84108363b" + } + recorded_at: Thu, 27 Nov 2025 17:41:14 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_delete_run_setup.yml b/spec/fixtures/cassettes/evals_runs_delete_run_setup.yml new file mode 100644 index 00000000..827634fd --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_delete_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288d369be48191a9dd469d6cb22c9f/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:41:12 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_992f53ec833344ac9d252a854242aad9 + Openai-Processing-Ms: + - '1074' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '1077' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=pDzZar7eZ1cWdnVwvzRNA5pYoyYTAiFvoayFkrLokkA-1764265272-1.0.1.1-kU3SDYb6dj0Qzau6KPvdiPo5Z47dqdwFaMyhjpDaOQTtoNYKjMiuP.KEH.2iKgH43P.beXjXno_HWnkeoNJl5zm4vVXQT15LoUKrnGq55YE; + path=/; expires=Thu, 27-Nov-25 18:11:12 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=ptb8p7NZ79WAj3ZgZcOYPNFxCtjgfJHstwiuvU1ZPrw-1764265272257-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536a36bf98a935-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288d37912c81919e794ac84108363b", + "object": "eval.run", + "created_at": 1764265271, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288d369be48191a9dd469d6cb22c9f", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288d369be48191a9dd469d6cb22c9f?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288d37912c81919e794ac84108363b", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:41:12 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_delete_setup.yml b/spec/fixtures/cassettes/evals_runs_delete_setup.yml new file mode 100644 index 00000000..88d8195d --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_delete_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:41:10 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_e6cd656f098a4c5383491d45df4c6bdc + Openai-Processing-Ms: + - '210' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '213' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=eoMCgwTjmGa8TpChmbY35gYtGNsdnvJDG9Oy_HucA3A-1764265270-1.0.1.1-nu7guHa3w50bOYH9x7yVao20MEVGw4A6WC6gnPKshlJDGsawMg9XLgkgmeG0jhE.kQIM6LLGI_Dsm56wwalEAdBPK5Y9RzVtmPsZ.3CjgDo; + path=/; expires=Thu, 27-Nov-25 18:11:10 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=wvlyhu5ZmXw1nXW0.HaQuloGBXKaYEUGpYRztDTf63U-1764265270748-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a536a330a47b367-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288d369be48191a9dd469d6cb22c9f", + "object": "eval", + "created_at": 1764265270, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-44de3f73-0711-4426-8b59-cb26720e53f5", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:41:10 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_list.yml b/spec/fixtures/cassettes/evals_runs_output_items_list.yml new file mode 100644 index 00000000..c0e38c0d --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_list.yml @@ -0,0 +1,76 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c6d54f48191bb18d415547ff09c/runs/evalrun_69288c6e2dac819181b017027cd1d2ba/output_items + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:51 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_044e8ae19b15a7101a63c51d97527437 + Openai-Processing-Ms: + - '380' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '382' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=Ij1_ZQwKnjbo57fg2wgjLL6wo6ykZnDO32Ca0ushv18-1764265071-1.0.1.1-b0Zelat_yRyQHGzvXuBH7A62cjICDnwr8.KXH1ZNQ8nhk9qYmI4U3xE.6e6CojNe5CAODZVFAWymziugL4BG96hgHnpMUmgZzkh.0aCpR9M; + path=/; expires=Thu, 27-Nov-25 18:07:51 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=5kFavy3PInCs8S45ZxHlLdbcTlGBpxX80ap0mbCmaGc-1764265071138-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365532c7ab128-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [], + "first_id": null, + "has_more": false, + "last_id": null + } + recorded_at: Thu, 27 Nov 2025 17:37:51 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_list_output_item_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_list_output_item_setup.yml new file mode 100644 index 00000000..4c4d5055 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_list_output_item_setup.yml @@ -0,0 +1,76 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_692886c4402c8191b16daf0a13927d55/runs/evalrun_692886c4cb7c8191a6d061543b6c0224/output_items + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:13:41 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_03cd0ef2eb6148bb9db8a92286faa23a + Openai-Processing-Ms: + - '224' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '228' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=6MUOh_RSBEwcm8ZA61n6mZa9_Wlfc.sKgtkfiFWcBjA-1764263621-1.0.1.1-itCbxTpT_mHZQI1ZXkaSUhKMXrMSXzYfYyQQ6PmQ56wgvn7uZ862F8GJ9vlxbxvngqovWvP8hgIFeb_iO5RRWbraUfIQSd4DVNPiQtwzIVI; + path=/; expires=Thu, 27-Nov-25 17:43:41 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=jphYlrigl5iW9Plhyq3CwLH4hYQY1Qp8MYaUJmIgoqc-1764263621721-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5341f08cb6220e-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [], + "first_id": null, + "has_more": false, + "last_id": null + } + recorded_at: Thu, 27 Nov 2025 17:13:41 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_list_run_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_list_run_setup.yml new file mode 100644 index 00000000..d75a849b --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_list_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c6d54f48191bb18d415547ff09c/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:50 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_88e93be9ef8d9e6a97cc9becd3e56a22 + Openai-Processing-Ms: + - '441' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '444' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=yn01YpoTluz_g0dCfOJu029HxxIE0YxtsV_IDxPV0cA-1764265070-1.0.1.1-RGZeYvFOEMNrnyrsOeT2CRd0vOtSqedJVlTGy7CyAvS2Gc9xzTqqGHOlPIQnWUpK77v2rogZPxT1htz3zdV7c0ACVCmsva90aIUPOPh8sh0; + path=/; expires=Thu, 27-Nov-25 18:07:50 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=zo1jRkukrqcGsIqmCJTtvx_F..hdTyvBGAcHj4x.aEo-1764265070481-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53654ceffb2196-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c6e2dac819181b017027cd1d2ba", + "object": "eval.run", + "created_at": 1764265070, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6d54f48191bb18d415547ff09c", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6d54f48191bb18d415547ff09c?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c6e2dac819181b017027cd1d2ba", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:50 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_list_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_list_setup.yml new file mode 100644 index 00000000..c1a0cf44 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_list_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:49 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_900410bfa9674eb2bb47cbf185769adc + Openai-Processing-Ms: + - '184' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '187' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=HULhDfLfjW5mhRIsax.joyijyBBd7EmO2Q3O_IVs7sE-1764265069-1.0.1.1-U_UGCd9.7V5Vj1C.0tk_ra2ZeJ8wT6gajpUlS1MtflDyxwh.9EhYL1aVQCE7pMrSuLNCSyFtkGCltY2l.WqoKyBI0udBVF18gFbumUpDkZk; + path=/; expires=Thu, 27-Nov-25 18:07:49 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=xXc2dl6GdscUZ1VH5HMyd84ZVjDXG6GEdzqczQKDTQA-1764265069472-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365488bf0cca5-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c6d54f48191bb18d415547ff09c", + "object": "eval", + "created_at": 1764265069, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-abb2f7c9-703e-4ea7-93fa-a96fa580dcd4", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:49 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_retrieve.yml b/spec/fixtures/cassettes/evals_runs_output_items_retrieve.yml new file mode 100644 index 00000000..224a38d7 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_retrieve.yml @@ -0,0 +1,303 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c6f782c8191900a5cb6be3db61e/runs/evalrun_69288c70a03881919438f1de10070910/output_items + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:57:40 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_63f77af99d92f4ad3817b3439f12de1c + Openai-Processing-Ms: + - '386' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '388' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=3KuybbQ5ivkgrdoN8P1j2dPObVied6ifNmVh4kUsZbQ-1764266260-1.0.1.1-yfOh7gnUJL3FHVOt4IEgFaKUZldHA2wK6vJofU4dU.x11w9ng8sRVPOrbK4ASfW2RI3L86PErGFm7tv3qy9VUza6CNSyWLgJxcSynLbK9z0; + path=/; expires=Thu, 27-Nov-25 18:27:40 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=jA56LhejCounC.ZFpgofvJ2Nk9OTb5HVXFeLiqACL1U-1764266260747-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53825d6a22eb1d-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [ + { + "id": "outputitem_69288c7485688191b3f81ee02a17a2b8", + "object": "eval.run.output_item", + "created_at": 1764265076, + "status": "pass", + "_datasource_item_content_hash": "07bd0d39b771a2e3976c536264799dc3f2b6e5e943a8d68dc3058bac176de445", + "available_includes": [], + "datasource_item": { + "input": "I love this product!", + "ground_truth": "positive" + }, + "datasource_item_id": 0, + "eval_id": "eval_69288c6f782c8191900a5cb6be3db61e", + "results": [ + { + "name": "Sentiment grader-32bb5cf1-b6c0-4030-ba29-10a1dca004ac", + "score": 1.0, + "passed": true, + "sample": { + "input": [ + { + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "role": "user", + "content": "Statement: I love this product!" + } + ], + "output": [ + { + "role": "assistant", + "content": "{\n \"steps\": [\n {\n \"description\": \"The statement 'I love this product!' includes the word 'love,' indicating a strong positive emotion towards the product.\",\n \"conclusion\": \"The sentiment is positive.\"\n }\n ],\n \"result\": \"positive\"\n}" + } + ], + "finish_reason": "stop", + "model": "o3-mini-2025-01-31", + "usage": { + "total_tokens": 287, + "completion_tokens": 137, + "prompt_tokens": 150, + "cached_tokens": 0 + }, + "error": null, + "temperature": 1.0, + "top_p": 1.0 + } + } + ], + "run_id": "evalrun_69288c70a03881919438f1de10070910", + "sample": { + "input": [ + { + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "I love this product!" + } + ], + "output": [ + { + "role": "assistant", + "content": "That's great to hear! What product are you referring to? I'd love to know more about it and what you enjoy about it!" + } + ], + "finish_reason": "stop", + "model": "gpt-4o-mini-2024-07-18", + "usage": { + "total_tokens": 48, + "completion_tokens": 26, + "prompt_tokens": 22, + "cached_tokens": 0 + }, + "error": null, + "temperature": 1.0, + "top_p": 1.0 + } + } + ], + "first_id": "outputitem_69288c7485688191b3f81ee02a17a2b8", + "has_more": false, + "last_id": "outputitem_69288c7485688191b3f81ee02a17a2b8" + } + recorded_at: Thu, 27 Nov 2025 17:57:40 GMT +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c6f782c8191900a5cb6be3db61e/runs/evalrun_69288c70a03881919438f1de10070910/output_items/outputitem_69288c7485688191b3f81ee02a17a2b8 + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:57:41 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_96ab283a68914e5aae9e838825b84daa + Openai-Processing-Ms: + - '275' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '277' + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=MpuILeElqrljrDl2kgh.q2CvWdew2aULy1VsYpIJLNw-1764266261-1.0.1.1-FOnzEQiiVXTlMCOaCE7fFSD2AiAu_M8r__8x6H8TAdj1_u0cNrAVEidH2LWeOVx022TfA4Qojh2ARuZcSxvIArc6GC0PpDUH_lV_7NpqB34; + path=/; expires=Thu, 27-Nov-25 18:27:41 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=qAK71GovjGWZA0BgqVhkKve3h_zFGkBk0nJgN3qfOzY-1764266261401-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5382624872b134-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "outputitem_69288c7485688191b3f81ee02a17a2b8", + "object": "eval.run.output_item", + "created_at": 1764265076, + "status": "pass", + "_datasource_item_content_hash": "07bd0d39b771a2e3976c536264799dc3f2b6e5e943a8d68dc3058bac176de445", + "available_includes": [], + "datasource_item": { + "input": "I love this product!", + "ground_truth": "positive" + }, + "datasource_item_id": 0, + "eval_id": "eval_69288c6f782c8191900a5cb6be3db61e", + "results": [ + { + "name": "Sentiment grader-32bb5cf1-b6c0-4030-ba29-10a1dca004ac", + "score": 1.0, + "passed": true, + "sample": { + "input": [ + { + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "role": "user", + "content": "Statement: I love this product!" + } + ], + "output": [ + { + "role": "assistant", + "content": "{\n \"steps\": [\n {\n \"description\": \"The statement 'I love this product!' includes the word 'love,' indicating a strong positive emotion towards the product.\",\n \"conclusion\": \"The sentiment is positive.\"\n }\n ],\n \"result\": \"positive\"\n}" + } + ], + "finish_reason": "stop", + "model": "o3-mini-2025-01-31", + "usage": { + "total_tokens": 287, + "completion_tokens": 137, + "prompt_tokens": 150, + "cached_tokens": 0 + }, + "error": null, + "temperature": 1.0, + "top_p": 1.0 + } + } + ], + "run_id": "evalrun_69288c70a03881919438f1de10070910", + "sample": { + "input": [ + { + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "I love this product!" + } + ], + "output": [ + { + "role": "assistant", + "content": "That's great to hear! What product are you referring to? I'd love to know more about it and what you enjoy about it!" + } + ], + "finish_reason": "stop", + "model": "gpt-4o-mini-2024-07-18", + "usage": { + "total_tokens": 48, + "completion_tokens": 26, + "prompt_tokens": 22, + "cached_tokens": 0 + }, + "error": null, + "temperature": 1.0, + "top_p": 1.0 + } + } + recorded_at: Thu, 27 Nov 2025 17:57:41 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_retrieve_run_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_retrieve_run_setup.yml new file mode 100644 index 00000000..b0641740 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_retrieve_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c6f782c8191900a5cb6be3db61e/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:52 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_8024ef0cea996ec159db9512d79643bb + Openai-Processing-Ms: + - '888' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '891' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=OG6k0pEHTzDFHWR2hZstktrhqxb5xEApbyEXWLX4J8w-1764265072-1.0.1.1-0aVXMbN5qlCvwfOOYv.im0ZQN7B3cb4W52hJd2LtyueNQyeuwczOl6AHFZ5gaPajOy28Kn.BYIMKh1RqUcZZm7mEq_1KRSOaywG2rSXPviQ; + path=/; expires=Thu, 27-Nov-25 18:07:52 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=zX24zGeI3V58S_T1gJDemI.15.Ut328syKiyJ8nXK54-1764265072902-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53655af947ba0d-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c70a03881919438f1de10070910", + "object": "eval.run", + "created_at": 1764265072, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6f782c8191900a5cb6be3db61e", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6f782c8191900a5cb6be3db61e?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c70a03881919438f1de10070910", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:52 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_output_items_retrieve_setup.yml b/spec/fixtures/cassettes/evals_runs_output_items_retrieve_setup.yml new file mode 100644 index 00000000..ac064fd8 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_output_items_retrieve_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:51 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_db025b54c4df6310390ca1dd18fd378a + Openai-Processing-Ms: + - '317' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '320' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=udxiDFrwu5WKkGQ9Wvj3P_U_AnoaYhAp9zKt1nDyASU-1764265071-1.0.1.1-r03vlkmGOoO_2v_HrjcqpRTyPZ6rGYjbwYzO.KNtyhCU1Ku4ZRHVSF8mczGOzNlJMakxHihZ3jobRlOUXanSLN18EwHvSiQlZ_b27rr5e.o; + path=/; expires=Thu, 27-Nov-25 18:07:51 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=qk9zb8mfB6JGfHeQNy2ogyookxBtkWpGrh_Uhe_b8zA-1764265071717-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365574b9eb3ae-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c6f782c8191900a5cb6be3db61e", + "object": "eval", + "created_at": 1764265071, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-32bb5cf1-b6c0-4030-ba29-10a1dca004ac", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:51 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_retrieve.yml b/spec/fixtures/cassettes/evals_runs_retrieve.yml new file mode 100644 index 00000000..2e930997 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_retrieve.yml @@ -0,0 +1,123 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_69288c6a0c488191ac6adc60180c4d03/runs/evalrun_69288c6a90708191b86c7e82b893c846 + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:47 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_46323b933a8c92d4fb5a19503306f7e9 + Openai-Processing-Ms: + - '135' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '138' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=qweBroIdUWsc2kwidjmrBoBjh0AgfmD9HIlYa7dr9U8-1764265067-1.0.1.1-s4MB7.JR0s1NVmhpFZtvnXP.ZjY7LBRRIGftgzUzTdfSaoEYyOXOl5b46LNRRoz_RSSRyhL_aRcnfjC259bzh4VuzydeG3pQrpbPzJ4pImQ; + path=/; expires=Thu, 27-Nov-25 18:07:47 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=tN3KJHPGcEeisrKyUp1v_Qxgl0niwbz0WWiaLrsuvfI-1764265067259-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53653c9a130d8b-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c6a90708191b86c7e82b893c846", + "object": "eval.run", + "created_at": 1764265066, + "status": "in_progress", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6a0c488191ac6adc60180c4d03", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6a0c488191ac6adc60180c4d03?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c6a90708191b86c7e82b893c846", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:47 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_retrieve_run_setup.yml b/spec/fixtures/cassettes/evals_runs_retrieve_run_setup.yml new file mode 100644 index 00000000..82827f0e --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_retrieve_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c6a0c488191ac6adc60180c4d03/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:46 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_7376dfd4e698e39760ab9f9e861b62ba + Openai-Processing-Ms: + - '430' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '433' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=Ob0N0Dhz8GBj8HxJrxf5mkfWFLAmmMZQhy1uzMJu0ho-1764265066-1.0.1.1-VrQ2qANnuSoglCiTa7t5h0DFmGk93wFj.RDjRQoZio10n7A0..tRymAKWzWbH0LhYlfiCIMRePofe78ZAi4rtW1D.Pno7zI6O3cB2MDW_2U; + path=/; expires=Thu, 27-Nov-25 18:07:46 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=Vtmm6kZeYPTgBS0vw15wIJboU1CXLC3c04o.l7r55W4-1764265066871-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365385d0e89d9-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_69288c6a90708191b86c7e82b893c846", + "object": "eval.run", + "created_at": 1764265066, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_69288c6a0c488191ac6adc60180c4d03", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_69288c6a0c488191ac6adc60180c4d03?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_69288c6a90708191b86c7e82b893c846", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:46 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_retrieve_setup.yml b/spec/fixtures/cassettes/evals_runs_retrieve_setup.yml new file mode 100644 index 00000000..ef5954b9 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_retrieve_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:46 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_327229bac7ba039a82b7a1f8e5e9115a + Openai-Processing-Ms: + - '215' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '218' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=_zFDCgf3us1YhS..PbKumYZeV7ao_jczl7G5ViGrC70-1764265066-1.0.1.1-ugnL3IdDBgW7rjNvXjez6DPCmq0TSqhMfUqfmLGMSE.0rtQu9qKKpxPaGTFdthGabPTwXNlOsfcDSAPRKhUGODnxYZwkKwbdQiOu3xwKaDQ; + path=/; expires=Thu, 27-Nov-25 18:07:46 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=hU4FpeK1yL8DXLn7h76pYGgqJNJBbeYLu8_.K2R6ifw-1764265066186-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53653538ffeb1a-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c6a0c488191ac6adc60180c4d03", + "object": "eval", + "created_at": 1764265066, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-05bd0d3d-fee4-42ce-a2ad-6c9a8ad3d6a1", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:46 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_update.yml b/spec/fixtures/cassettes/evals_update.yml new file mode 100644 index 00000000..c44a3a0c --- /dev/null +++ b/spec/fixtures/cassettes/evals_update.yml @@ -0,0 +1,276 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_69288c67f948819192f601708f5599d9 + body: + encoding: UTF-8 + string: '{"metadata":{"modified":"true"}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 17:37:44 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_76577432037fb0cc3a49b85a478dd4ea + Openai-Processing-Ms: + - '250' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '253' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=LquyUkXNYgQfoMPEMVOLWFRNC_qD2hsiye2hDetrVF8-1764265064-1.0.1.1-9VDp9tH4CN_s_mg8W6Vfs8hkXfGh1fQy4keP3IgG719WIFoAGCpfkiaqn4PbLZHYnPziTRgbhsWHEqDbHepqoAmFdKvAx4jPAb.6gBLYRHk; + path=/; expires=Thu, 27-Nov-25 18:07:44 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=lZ7wtHiLT.0eAO_IUzVM9XwnK4S.DoHF2kk1gOfdPaE-1764265064596-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53652b4a49d85e-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c67f948819192f601708f5599d9", + "object": "eval", + "created_at": 1764265063, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-49de7e11-beb1-436c-92da-bc8611135e48", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": { + "modified": "true" + } + } + recorded_at: Thu, 27 Nov 2025 17:37:44 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_update_setup.yml b/spec/fixtures/cassettes/evals_update_setup.yml new file mode 100644 index 00000000..26fec873 --- /dev/null +++ b/spec/fixtures/cassettes/evals_update_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 17:37:44 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_74c5589fb7844e62af40aaf6e3f72341 + Openai-Processing-Ms: + - '681' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '684' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=VzK9NnpeChQPlG5syc9jH_ssnhnA1FT8uXdbcVhOGBs-1764265064-1.0.1.1-fsILAJMqlR.D5mGnLKpZkjt7NNjmn4tH0JS_Ln1ekBktkhX1ALQtvJL0rP9KGDaqARVgN_dT8bW9IpRvpp1ItulJC16qsrGkuW.8wQKwiCw; + path=/; expires=Thu, 27-Nov-25 18:07:44 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=M8ILSssH1N4.yHGKflKGVTVsLIwgAvMibM.p7ZJROas-1764265064110-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a5365249faaa62f-MAN + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_69288c67f948819192f601708f5599d9", + "object": "eval", + "created_at": 1764265063, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-49de7e11-beb1-436c-92da-bc8611135e48", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 17:37:44 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/openai/client/evals_spec.rb b/spec/openai/client/evals_spec.rb new file mode 100644 index 00000000..fde0a383 --- /dev/null +++ b/spec/openai/client/evals_spec.rb @@ -0,0 +1,274 @@ +RSpec.describe OpenAI::Client do + describe "#evals" do + let(:eval_params) do + { + name: "Sentiment Analysis", + data_source_config: { + type: "custom", + item_schema: { + type: "object", + properties: { + input: { type: "string" } + }, + required: ["input"] + }, + include_sample_schema: true + }, + testing_criteria: [ + { + type: "label_model", + model: "o3-mini", + input: [ + { role: "developer", + content: "Classify the sentiment of the following statement " \ + "as one of 'positive', 'neutral', or 'negative'" }, + { role: "user", content: "Statement: {{item.input}}" } + ], + passing_labels: ["positive"], + labels: %w[positive neutral negative], + name: "Sentiment grader" + } + ] + } + end + let(:eval_id) do + VCR.use_cassette("#{cassette} setup") do + OpenAI::Client.new.evals.create( + parameters: eval_params + )["id"] + end + end + + let(:run_params) do + { + name: "Run 1", + data_source: { + type: "completions", + input_messages: { + type: "template", + template: [ + { + role: "developer", + content: "You are a helpful assistant." + }, + { + role: "user", + content: "{{item.input}}" + } + ] + }, + model: "gpt-4o-mini", + source: { + type: "file_content", + content: [ + { + item: { + input: "I love this product!", + ground_truth: "positive" + } + } + ] + } + } + } + end + + let(:run_id) do + VCR.use_cassette("#{cassette} run setup") do + OpenAI::Client.new.evals.runs.create( + eval_id: eval_id, + parameters: run_params + )["id"] + end + end + + describe "#retrieve" do + let(:cassette) { "evals retrieve" } + let(:response) { OpenAI::Client.new.evals.retrieve(id: eval_id) } + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("eval") + expect(response["id"]).to eq(eval_id) + end + end + end + + describe "#create" do + let(:cassette) { "evals create" } + let(:response) do + OpenAI::Client.new.evals.create( + parameters: eval_params + ) + end + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("eval") + expect(response["name"]).to eq("Sentiment Analysis") + end + end + end + + describe "#update" do + let(:cassette) { "evals update" } + let(:response) do + OpenAI::Client.new.evals.update( + id: eval_id, + parameters: { metadata: { modified: "true" } } + ) + end + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("eval") + end + end + end + describe "#list", :vcr do + let(:cassette) { "evals list" } + let(:response) { OpenAI::Client.new.evals.list } + + before { eval_id } + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("list") + expect(response["data"]).to be_an(Array) + expect(response.dig("data", 0, "object")).to eq("eval") if response["data"].any? + end + end + end + + describe "#runs" do + describe "#retrieve" do + let(:cassette) { "evals runs retrieve" } + let(:response) do + OpenAI::Client.new.evals.runs.retrieve( + eval_id: eval_id, + id: run_id + ) + end + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("eval.run") + expect(response["id"]).to eq(run_id) + expect(response["eval_id"]).to eq(eval_id) + end + end + end + + describe "#create" do + let(:cassette) { "evals runs create" } + let(:response) do + OpenAI::Client.new.evals.runs.create( + eval_id: eval_id, + parameters: run_params + ) + end + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("eval.run") + expect(response["eval_id"]).to eq(eval_id) + expect(response["name"]).to eq("Run 1") + end + end + end + + describe "#output_items" do + describe "#list", :vcr do + let(:cassette) { "evals runs output_items list" } + let(:response) do + OpenAI::Client.new.evals.runs.output_items.list( + eval_id: eval_id, + run_id: run_id + ) + end + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("list") + expect(response["data"]).to be_an(Array) + end + end + end + + describe "#retrieve" do + let(:cassette) { "evals runs output_items retrieve" } + let(:output_item_id) do + OpenAI::Client.new.evals.runs.output_items.list( + eval_id: eval_id, + run_id: run_id + )["data"].first["id"] + end + let(:response) do + OpenAI::Client.new.evals.runs.output_items.retrieve( + eval_id: eval_id, + run_id: run_id, + id: output_item_id + ) + end + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("eval.run.output_item") + expect(response["id"]).to eq(output_item_id) + end + end + end + end + + describe "#cancel" do + let(:cassette) { "evals runs cancel" } + let(:response) do + OpenAI::Client.new.evals.runs.cancel( + eval_id: eval_id, + id: run_id + ) + end + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("eval.run") + expect(response["status"]).to eq("canceled") + end + end + end + + describe "#delete" do + let(:cassette) { "evals runs delete" } + let(:response) do + OpenAI::Client.new.evals.runs.cancel( + eval_id: eval_id, + id: run_id + ) + + OpenAI::Client.new.evals.runs.delete( + eval_id: eval_id, + id: run_id + ) + end + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("eval.run.deleted") + end + end + end + end + + describe "#delete" do + let(:cassette) { "evals delete" } + let(:response) do + OpenAI::Client.new.evals.delete(id: eval_id) + end + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("eval.deleted") + end + end + end + end +end From 9d2f21cf28b81dc48bb22c83f5b9d4c4957931af Mon Sep 17 00:00:00 2001 From: Juan Arboleda <35846576+alzeck@users.noreply.github.com> Date: Thu, 27 Nov 2025 18:40:19 +0000 Subject: [PATCH 2/5] add list runs and docs --- README.md | 260 ++++++++++++++++ lib/openai/evals.rb | 4 + spec/fixtures/cassettes/evals_runs_list.yml | 131 +++++++++ .../cassettes/evals_runs_list_run_setup.yml | 125 ++++++++ .../cassettes/evals_runs_list_setup.yml | 277 ++++++++++++++++++ spec/openai/client/evals_spec.rb | 15 + 6 files changed, 812 insertions(+) create mode 100644 spec/fixtures/cassettes/evals_runs_list.yml create mode 100644 spec/fixtures/cassettes/evals_runs_list_run_setup.yml create mode 100644 spec/fixtures/cassettes/evals_runs_list_setup.yml diff --git a/README.md b/README.md index a2021daf..570e8e36 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,19 @@ Stream GPT-5 chats with the Responses API, initiate Realtime WebRTC conversation - [Vision in a thread](#vision-in-a-thread) - [Runs involving function tools](#runs-involving-function-tools) - [Exploring chunks used in File Search](#exploring-chunks-used-in-file-search) + - [Evals](#evals) + - [Create an Eval](#create-an-eval) + - [Retrieve an Eval](#retrieve-an-eval) + - [List Evals](#list-evals) + - [Update an Eval](#update-an-eval) + - [Delete an Eval](#delete-an-eval) + - [Create an Eval Run](#create-an-eval-run) + - [List Eval Runs](#list-eval-runs) + - [Retrieve an Eval Run](#retrieve-an-eval-run) + - [Cancel an Eval Run](#cancel-an-eval-run) + - [Delete an Eval Run](#delete-an-eval-run) + - [List Output Items](#list-output-items) + - [Retrieve an Output Item](#retrieve-an-output-item) - [Image Generation](#image-generation) - [DALL·E 2](#dalle-2) - [DALL·E 3](#dalle-3) @@ -1669,6 +1682,253 @@ end.compact client.messages.list(thread_id: thread_id) ``` +### Evals + +Evals allow you to systematically evaluate the quality and performance of your AI models. You can create evaluations with specific testing criteria, run them against your models, and analyze the results. + +#### Create an Eval + +Create an evaluation with testing criteria to assess model outputs: + +```ruby +response = client.evals.create( + parameters: { + name: "Sentiment Analysis Eval", + data_source_config: { + type: "stored_completions", + metadata: { usecase: "chatbot" } + }, + testing_criteria: [ + { + type: "label_model", + model: "o3-mini", + input: [ + { + role: "developer", + content: "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + role: "user", + content: "Statement: {{item.input}}" + } + ], + passing_labels: ["positive"], + labels: ["positive", "neutral", "negative"], + name: "Sentiment grader" + } + ], + metadata: { team: "product", version: "1.0" } + } +) +puts response["id"] +# => "eval_abc123" +``` + +#### Retrieve an Eval + +Get details about a specific evaluation: + +```ruby +eval_id = "eval_abc123" +response = client.evals.retrieve(id: eval_id) +puts response["name"] +# => "Sentiment Analysis Eval" +``` + +#### List Evals + +List all evaluations with optional pagination: + +```ruby +# List all evals +response = client.evals.list + +# List with limit +response = client.evals.list(parameters: { limit: 10 }) + +# List with pagination +response = client.evals.list(parameters: { after: "eval_abc123", limit: 20 }) +``` + +#### Update an Eval + +Update an evaluation's metadata: + +```ruby +response = client.evals.update( + id: eval_id, + parameters: { + metadata: { version: "2.0", updated: "true" } + } +) +``` + +#### Delete an Eval + +Delete an evaluation: + +```ruby +response = client.evals.delete(id: eval_id) +puts response["deleted"] +# => true +``` + +#### Create an Eval Run + +Run an evaluation against a model with test data: + +```ruby +response = client.evals.runs.create( + eval_id: eval_id, + parameters: { + name: "gpt-4o-mini baseline", + data_source: { + type: "completions", + input_messages: { + type: "template", + template: [ + { + role: "system", + content: "You are a sentiment analyzer. Respond with only: positive, neutral, or negative." + }, + { + role: "user", + content: "{{item.input}}" + } + ] + }, + sampling_params: { + temperature: 0.7, + max_completion_tokens: 50, + top_p: 1.0 + }, + model: "gpt-4o-mini", + source: { + type: "file_content", + content: [ + { + item: { + input: "I absolutely love this product! Best purchase ever.", + ground_truth: "positive" + } + }, + { + item: { + input: "This is terrible. Very disappointed.", + ground_truth: "negative" + } + }, + { + item: { + input: "It's okay, nothing special.", + ground_truth: "neutral" + } + } + ] + } + }, + metadata: { experiment: "baseline", date: "2024-01-15" } + } +) +puts response["id"] +# => "evalrun_xyz789" +``` + +#### List Eval Runs + +List all runs for a specific evaluation: + +```ruby +# List all runs +response = client.evals.runs.list(eval_id: eval_id) + +# List with limit +response = client.evals.runs.list( + eval_id: eval_id, + parameters: { limit: 10 } +) + +# List with pagination +response = client.evals.runs.list( + eval_id: eval_id, + parameters: { after: "evalrun_abc123", limit: 20 } +) +``` + +#### Retrieve an Eval Run + +Get details about a specific evaluation run: + +```ruby +run_id = "evalrun_xyz789" +response = client.evals.runs.retrieve( + eval_id: eval_id, + id: run_id +) +puts response["status"] +# => "completed" +``` + +#### Cancel an Eval Run + +Cancel a running evaluation: + +```ruby +response = client.evals.runs.cancel( + eval_id: eval_id, + id: run_id +) +puts response["status"] +# => "canceled" +``` + +#### Delete an Eval Run + +Delete an evaluation run: + +```ruby +response = client.evals.runs.delete( + eval_id: eval_id, + id: run_id +) +puts response["deleted"] +# => true +``` + +#### List Output Items + +Retrieve the output items from an evaluation run: + +```ruby +# List all output items +response = client.evals.runs.output_items.list( + eval_id: eval_id, + run_id: run_id +) + +# List with pagination +response = client.evals.runs.output_items.list( + eval_id: eval_id, + run_id: run_id, + parameters: { limit: 10, after: "item_abc123" } +) +``` + +#### Retrieve an Output Item + +Get details about a specific output item: + +```ruby +output_item_id = "item_abc123" +response = client.evals.runs.output_items.retrieve( + eval_id: eval_id, + run_id: run_id, + id: output_item_id +) +puts response["status"] +# => "pass" +``` + ### Image Generation Generate images using DALL·E 2 or DALL·E 3! diff --git a/lib/openai/evals.rb b/lib/openai/evals.rb index bbbbdb95..cb6927e0 100644 --- a/lib/openai/evals.rb +++ b/lib/openai/evals.rb @@ -41,6 +41,10 @@ def retrieve(eval_id:, id:) @client.get(path: "/evals/#{eval_id}/runs/#{id}") end + def list(eval_id:, parameters: {}) + @client.get(path: "/evals/#{eval_id}/runs", parameters: parameters) + end + def cancel(eval_id:, id:) @client.post(path: "/evals/#{eval_id}/runs/#{id}/cancel") end diff --git a/spec/fixtures/cassettes/evals_runs_list.yml b/spec/fixtures/cassettes/evals_runs_list.yml new file mode 100644 index 00000000..5fdc435f --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_list.yml @@ -0,0 +1,131 @@ +--- +http_interactions: +- request: + method: get + uri: https://api.openai.com/v1/evals/eval_6928994e3c788191aa5575493ab58226/runs + body: + encoding: US-ASCII + string: '' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 200 + message: OK + headers: + Date: + - Thu, 27 Nov 2025 18:32:48 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_4014e25ebde4430ae5ff17e80598937c + Openai-Processing-Ms: + - '310' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '313' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=RCxCb9hNoDNLsT8SSz.QWaZ16OE3wz0H2OOVgObF0Hg-1764268368-1.0.1.1-xbuw3j1YAdWfl2qFpzspmbbwU220pO9LL4W14d4GqWvMorzobwUPp373M6RGG4obrYtV.kSFHFeBERs2yiVxllmvySVXHgXIeF1WqLf99zc; + path=/; expires=Thu, 27-Nov-25 19:02:48 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=69e_irZTWWiJst8RToJInC_xiIGqs6FHMv9GIZeo2cA-1764268368400-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53b5d36820cd1a-LHR + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "object": "list", + "data": [ + { + "id": "evalrun_6928994f39d88191b9a47a69de5eda51", + "object": "eval.run", + "created_at": 1764268367, + "status": "in_progress", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_6928994e3c788191aa5575493ab58226", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_6928994e3c788191aa5575493ab58226?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_6928994f39d88191b9a47a69de5eda51", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + ], + "first_id": "evalrun_6928994f39d88191b9a47a69de5eda51", + "has_more": false, + "last_id": "evalrun_6928994f39d88191b9a47a69de5eda51" + } + recorded_at: Thu, 27 Nov 2025 18:32:48 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_list_run_setup.yml b/spec/fixtures/cassettes/evals_runs_list_run_setup.yml new file mode 100644 index 00000000..8e749f19 --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_list_run_setup.yml @@ -0,0 +1,125 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals/eval_6928994e3c788191aa5575493ab58226/runs + body: + encoding: UTF-8 + string: '{"name":"Run 1","data_source":{"type":"completions","input_messages":{"type":"template","template":[{"role":"developer","content":"You + are a helpful assistant."},{"role":"user","content":"{{item.input}}"}]},"model":"gpt-4o-mini","source":{"type":"file_content","content":[{"item":{"input":"I + love this product!","ground_truth":"positive"}}]}}}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 18:32:47 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_21708b12e8268ffa47de3abd79ef2344 + Openai-Processing-Ms: + - '1031' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '1034' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=i_CIB19VZtHAv_rk8skB4htrkC04jLpYcMScJYKzV_E-1764268367-1.0.1.1-y91vztSj2kUhklMWvqJgYdGsGk3Y.SfmrPoB7FAykNzgLIYqeZaZo12Df_dARe4utNy7jWI0novwYwAHqlaiVppfxpCHqlTThEom5TV3eOY; + path=/; expires=Thu, 27-Nov-25 19:02:47 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=h_NO79XlA6.gBFloHWFMOow1Kg0nS3yu.4GAJNFY6A0-1764268367770-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53b5caee33dfb4-LHR + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "evalrun_6928994f39d88191b9a47a69de5eda51", + "object": "eval.run", + "created_at": 1764268367, + "status": "queued", + "data_source": { + "type": "completions", + "source": { + "type": "file_content", + "content": [ + { + "item": { + "input": "I love this product!", + "ground_truth": "positive" + } + } + ] + }, + "input_messages": { + "type": "template", + "template": [ + { + "type": "message", + "role": "developer", + "content": "You are a helpful assistant." + }, + { + "type": "message", + "role": "user", + "content": "{{item.input}}" + } + ] + }, + "model": "gpt-4o-mini", + "provider_credentials": null, + "modalities": null, + "sampling_params": null + }, + "error": null, + "eval_id": "eval_6928994e3c788191aa5575493ab58226", + "model": "gpt-4o-mini", + "name": "Run 1", + "per_model_usage": null, + "per_testing_criteria_results": null, + "report_url": "https://platform.openai.com/evaluations/eval_6928994e3c788191aa5575493ab58226?project_id=proj_0h5pObirNvBYj1ZWydWLSS04&run_id=evalrun_6928994f39d88191b9a47a69de5eda51", + "result_counts": { + "errored": 0, + "failed": 0, + "passed": 0, + "total": 0 + }, + "shared_with_openai": false, + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 18:32:47 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_runs_list_setup.yml b/spec/fixtures/cassettes/evals_runs_list_setup.yml new file mode 100644 index 00000000..d58f89ff --- /dev/null +++ b/spec/fixtures/cassettes/evals_runs_list_setup.yml @@ -0,0 +1,277 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/evals + body: + encoding: UTF-8 + string: '{"name":"Sentiment Analysis","data_source_config":{"type":"custom","item_schema":{"type":"object","properties":{"input":{"type":"string"}},"required":["input"]},"include_sample_schema":true},"testing_criteria":[{"type":"label_model","model":"o3-mini","input":[{"role":"developer","content":"Classify + the sentiment of the following statement as one of ''positive'', ''neutral'', + or ''negative''"},{"role":"user","content":"Statement: {{item.input}}"}],"passing_labels":["positive"],"labels":["positive","neutral","negative"],"name":"Sentiment + grader"}]}' + headers: + Content-Type: + - application/json + Authorization: + - Bearer + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + response: + status: + code: 201 + message: Created + headers: + Date: + - Thu, 27 Nov 2025 18:32:46 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Openai-Version: + - '2020-10-01' + Openai-Organization: + - user-jxm65ijkzc1qrfhc0ij8moic + X-Request-Id: + - req_7761586e9b3dc966280851030e885ca3 + Openai-Processing-Ms: + - '687' + Vary: + - Accept-Encoding + X-Envoy-Upstream-Service-Time: + - '689' + X-Openai-Proxy-Wasm: + - v0.1 + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - __cf_bm=z3MpTJQQJDq8NpDy1Mgky8PcQ917uOEFzRSZzw_TX90-1764268366-1.0.1.1-2l4GXx9_Ul92nzE08ZGg4Re_dYImAYbYaL7O5z_qzmA067mtyGUhPYNQcLmINwRl76cW14HWWE9cMJheDF7xor28wYGbW1SjWY4D4_s0wxI; + path=/; expires=Thu, 27-Nov-25 19:02:46 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=qkT5usGvz0OmPbVRs3HPJ7EptZZee9PslFmbV9UvVac-1764268366427-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - 9a53b5c47d22f816-LHR + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: |- + { + "id": "eval_6928994e3c788191aa5575493ab58226", + "object": "eval", + "created_at": 1764268366, + "data_source_config": { + "type": "custom", + "max_items": null, + "schema": { + "type": "object", + "properties": { + "item": { + "type": "object", + "properties": { + "input": { + "type": "string" + } + }, + "required": [ + "input" + ] + }, + "sample": { + "type": "object", + "properties": { + "model": { + "type": "string" + }, + "choices": { + "type": "array", + "items": { + "type": "object", + "properties": { + "message": { + "type": "object", + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ] + }, + "content": { + "type": [ + "string", + "array", + "null" + ] + }, + "refusal": { + "type": [ + "boolean", + "null" + ] + }, + "tool_calls": { + "type": [ + "array", + "null" + ], + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ] + }, + "function": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + }, + "id": { + "type": "string" + } + }, + "required": [ + "type", + "function", + "id" + ] + } + }, + "function_call": { + "type": [ + "object", + "null" + ], + "properties": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name", + "arguments" + ] + } + }, + "required": [ + "role" + ] + }, + "finish_reason": { + "type": "string" + } + }, + "required": [ + "index", + "message", + "finish_reason" + ] + } + }, + "output_text": { + "type": "string" + }, + "output_json": { + "type": "object" + }, + "output_tools": { + "type": "array", + "items": { + "type": "object" + } + }, + "output_reasoning_summary": { + "type": [ + "string", + "null" + ] + }, + "output_audio": { + "type": [ + "object", + "null" + ] + }, + "input_tools": { + "type": "array", + "items": { + "type": "object" + } + } + }, + "required": [ + "model", + "choices" + ] + } + }, + "required": [ + "item", + "sample" + ] + } + }, + "name": "Sentiment Analysis", + "testing_criteria": [ + { + "id": "Sentiment grader-2e66f46a-0407-4cc5-bd42-a1c5cde44f6c", + "type": "label_model", + "grdr_id": null, + "inactive_at": null, + "input": [ + { + "type": "message", + "role": "developer", + "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" + }, + { + "type": "message", + "role": "user", + "content": "Statement: {{item.input}}" + } + ], + "labels": [ + "positive", + "neutral", + "negative" + ], + "model": "o3-mini", + "name": "Sentiment grader", + "passing_labels": [ + "positive" + ], + "sampling_params": null + } + ], + "metadata": {} + } + recorded_at: Thu, 27 Nov 2025 18:32:46 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/openai/client/evals_spec.rb b/spec/openai/client/evals_spec.rb index fde0a383..d79289c0 100644 --- a/spec/openai/client/evals_spec.rb +++ b/spec/openai/client/evals_spec.rb @@ -141,6 +141,21 @@ end describe "#runs" do + describe "#list", :vcr do + let(:cassette) { "evals runs list" } + let(:response) { OpenAI::Client.new.evals.runs.list(eval_id: eval_id) } + + before { run_id } + + it "succeeds" do + VCR.use_cassette(cassette) do + expect(response["object"]).to eq("list") + expect(response["data"]).to be_an(Array) + expect(response.dig("data", 0, "object")).to eq("eval.run") if response["data"].any? + end + end + end + describe "#retrieve" do let(:cassette) { "evals runs retrieve" } let(:response) do From cb6642d98278668f406af23ccfd91f85d4aeb422 Mon Sep 17 00:00:00 2001 From: Juan Arboleda <35846576+alzeck@users.noreply.github.com> Date: Thu, 27 Nov 2025 18:55:06 +0000 Subject: [PATCH 3/5] reduce lines for evals list --- spec/fixtures/cassettes/evals_list.yml | 5159 +----------------- spec/fixtures/cassettes/evals_list_setup.yml | 24 +- 2 files changed, 27 insertions(+), 5156 deletions(-) diff --git a/spec/fixtures/cassettes/evals_list.yml b/spec/fixtures/cassettes/evals_list.yml index 54873fc8..40a5f998 100644 --- a/spec/fixtures/cassettes/evals_list.yml +++ b/spec/fixtures/cassettes/evals_list.yml @@ -23,7 +23,7 @@ http_interactions: message: OK headers: Date: - - Thu, 27 Nov 2025 17:37:45 GMT + - Thu, 27 Nov 2025 18:53:28 GMT Content-Type: - application/json Transfer-Encoding: @@ -35,13 +35,13 @@ http_interactions: Openai-Organization: - user-jxm65ijkzc1qrfhc0ij8moic X-Request-Id: - - req_09908134b92384fc64c2e1a044fc1b8f + - req_fb43ab73c5cedacfd3da4797f457f98f Openai-Processing-Ms: - - '349' + - '219' Vary: - Accept-Encoding X-Envoy-Upstream-Service-Time: - - '365' + - '221' X-Openai-Proxy-Wasm: - v0.1 Strict-Transport-Security: @@ -49,17 +49,17 @@ http_interactions: Cf-Cache-Status: - DYNAMIC Set-Cookie: - - __cf_bm=a9lHm_ooC54LUqxpoUst1vazPY71OQaHdrCBcnTLJ8Y-1764265065-1.0.1.1-PvPVksJlQfOKII3YuqWv76rdDKlmg3Af7t.kcEILylSWKihcvR2SUFc.At3ilkSNU3DxtN1PWnAFTzSeGiuIxObyz7ifqWs2aR6jOudJDM0; - path=/; expires=Thu, 27-Nov-25 18:07:45 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=KRMsUubnpJCqt86K8dPwv6XLqF3.4SbEgig4dkt7RRI-1764269608-1.0.1.1-_G07YEtDCfSxGTL9A8U8sbKS89nOibkLGjMQX4Jldo0PVN8nGeQ2Zwi4BbbkQZeADFqI8PULxKJ4FjeGEqUXLVe4Aydb4r.pit5qX.WyHpc; + path=/; expires=Thu, 27-Nov-25 19:23:28 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=7IH9dxNsKoIkP3ir_C4yjxfvA8TzZJyRidP6c.KKy7w-1764265065684-0.0.1.1-604800000; + - _cfuvid=5YZOeZ8X7F73apsyWvgmXwsk44AQOl0d3728FzPciXA-1764269608650-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None X-Content-Type-Options: - nosniff Server: - cloudflare Cf-Ray: - - 9a5365315dbb97f8-MAN + - 9a53d41a0bc70764-MAN Alt-Svc: - h3=":443"; ma=86400 body: @@ -69,9 +69,9 @@ http_interactions: "object": "list", "data": [ { - "id": "eval_692886ca71948191a94a46ef2866fa38", + "id": "eval_69289e005d008191bc07606b2ceb522c", "object": "eval", - "created_at": 1764263626, + "created_at": 1764269568, "data_source_config": { "type": "custom", "max_items": null, @@ -141,37 +141,6 @@ http_interactions: "role" ], "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, "tool_calls": { "items": { "required": [ @@ -212,146 +181,6 @@ http_interactions: "null" ] }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-9ec23aee-5784-4532-b734-4eaa1441c1b4", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692886c81f0c81918301673fd48074e3", - "object": "eval", - "created_at": 1764263624, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, "function_call": { "required": [ "name", @@ -377,4972 +206,17 @@ http_interactions: "null" ] }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, "refusal": { "type": [ "boolean", "null" ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-b605af03-d2b3-4219-aabc-bdbbd02864c3", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692886c607dc819198c3b71cbf375088", - "object": "eval", - "created_at": 1764263622, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-5d95bb83-b098-4c8c-9ccc-1f327b7c680f", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692886c4402c8191b16daf0a13927d55", - "object": "eval", - "created_at": 1764263620, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-a5f01088-9cd6-484c-b16c-4239b8ce247f", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692886c286c08191a59229de96f2519c", - "object": "eval", - "created_at": 1764263618, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-6ab333ba-8b63-4868-ba3d-8444b9f0b788", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692886c0c2e88191b175224e13418b17", - "object": "eval", - "created_at": 1764263616, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-ea13e537-3285-4116-87ea-d26232d4e433", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692886bf65fc8191991a4848d677e502", - "object": "eval", - "created_at": 1764263615, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-ecffb0f6-e04c-470b-a089-71512d3f7e2e", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692886be83748191aa1ecb05c5db958e", - "object": "eval", - "created_at": 1764263614, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-ec2a502e-e03d-4b4e-920f-c9996764023a", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": { - "modified": "true" - } - }, - { - "id": "eval_692886bd90d8819198391114178d4134", - "object": "eval", - "created_at": 1764263613, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-a8d9c70b-5062-40ad-b882-6d7d2b4835d7", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692886bc33dc81919ec1696f9e931ff4", - "object": "eval", - "created_at": 1764263612, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-1bbd590e-ef4f-462e-a050-094b3925482c", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_69288535779c8191a0bed3405c7a72ab", - "object": "eval", - "created_at": 1764263221, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-9f9e21ad-adcd-4f8e-b359-b9aa6a34a974", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692885332d10819180c74603e6462a4c", - "object": "eval", - "created_at": 1764263219, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-3b0cec65-bd94-41a5-a67c-894300044c4a", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_69288530792c819181f87315e2cf7a98", - "object": "eval", - "created_at": 1764263216, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-8d9f1ee8-2ca7-4ee7-9775-84fd81f188ac", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_6928852eacfc8191939c094b7bf768a3", - "object": "eval", - "created_at": 1764263214, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-261fabf9-7bb5-497b-8103-a989a3d5780a", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_6928852cfc9c81918d7e5fee3762e782", - "object": "eval", - "created_at": 1764263212, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-6b2951b4-82b6-4471-b494-2f362485c8ba", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_6928852ae6588191801b3c819b2ec864", - "object": "eval", - "created_at": 1764263210, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-27b6b470-b897-47a2-84a5-bae598fe8475", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_692885294e408191883f13c67b96cc77", - "object": "eval", - "created_at": 1764263209, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-724f7a30-b74b-49d5-8fbc-abe93f386347", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_69288527f2548191a4a1550aa8ae4962", - "object": "eval", - "created_at": 1764263207, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-0abe0176-b62b-4dd6-96a1-3f30623a53f4", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": { - "modified": "true" - } - }, - { - "id": "eval_692885254d4c8191b44f0366019b69c4", - "object": "eval", - "created_at": 1764263205, - "data_source_config": { - "type": "custom", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input" - ], - "properties": { - "input": { - "type": "string" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { - "role": { - "enum": [ - "assistant" - ], - "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] - } - }, - "type": "object" - }, - "finish_reason": { - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - "output_text": { - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - } - }, - "name": "Sentiment Analysis", - "testing_criteria": [ - { - "id": "Sentiment grader-d2a3c749-e56a-425d-8adf-8ec9ae2d2b45", - "type": "label_model", - "grdr_id": null, - "inactive_at": null, - "input": [ - { - "type": "message", - "role": "developer", - "content": "Classify the sentiment of the following statement as one of 'positive', 'neutral', or 'negative'" - }, - { - "type": "message", - "role": "user", - "content": "Statement: {{item.input}}" - } - ], - "labels": [ - "positive", - "neutral", - "negative" - ], - "model": "o3-mini", - "name": "Sentiment grader", - "passing_labels": [ - "positive" - ], - "sampling_params": null - } - ], - "metadata": {} - }, - { - "id": "eval_69288313e4408191922bf1863f2ba432", - "object": "eval", - "created_at": 1764262675, - "data_source_config": { - "type": "logs", - "max_items": null, - "schema": { - "required": [ - "item", - "sample" - ], - "properties": { - "item": { - "required": [ - "input", - "output" - ], - "title": "LogsItemSchema", - "properties": { - "output": { - "items": { - "required": [ - "model", - "output" - ], - "title": "ResponseInputSample", - "properties": { - "model": { - "title": "Model", - "type": "string" - }, - "output": { - "items": { - "required": [ - "role", - "content" - ], - "title": "ChatMessage", - "properties": { - "tool_call_id": { - "default": null, - "title": "Tool Call Id", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "finish_reason": { - "default": null, - "title": "Finish Reason", - "anyOf": [ - { - "enum": [ - "stop", - "length", - "tool_calls", - "content_filter", - "function_call" - ], - "type": "string" - }, - { - "type": "null" - } - ] - }, - "content": { - "title": "Content", - "anyOf": [ - { - "type": "string" - }, - { - "items": { - "anyOf": [ - { - "required": [ - "text", - "type" - ], - "additionalProperties": true, - "title": "ResponseInputText", - "properties": { - "text": { - "title": "Text", - "type": "string" - }, - "type": { - "const": "input_text", - "title": "Type", - "type": "string" - } - }, - "type": "object" - }, - { - "required": [ - "detail", - "type" - ], - "additionalProperties": true, - "title": "ResponseInputImage", - "properties": { - "file_id": { - "default": null, - "title": "File Id", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "detail": { - "title": "Detail", - "enum": [ - "low", - "high", - "auto" - ], - "type": "string" - }, - "type": { - "const": "input_image", - "title": "Type", - "type": "string" - }, - "image_url": { - "default": null, - "title": "Image Url", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - } - }, - "type": "object" - }, - { - "required": [ - "annotations", - "text", - "type" - ], - "additionalProperties": true, - "title": "ResponseOutputText", - "properties": { - "text": { - "title": "Text", - "type": "string" - }, - "type": { - "const": "output_text", - "title": "Type", - "type": "string" - }, - "logprobs": { - "default": null, - "title": "Logprobs", - "anyOf": [ - { - "items": { - "required": [ - "token", - "bytes", - "logprob", - "top_logprobs" - ], - "additionalProperties": true, - "title": "Logprob", - "properties": { - "bytes": { - "items": { - "type": "integer" - }, - "title": "Bytes", - "type": "array" - }, - "token": { - "title": "Token", - "type": "string" - }, - "top_logprobs": { - "items": { - "required": [ - "token", - "bytes", - "logprob" - ], - "additionalProperties": true, - "title": "LogprobTopLogprob", - "properties": { - "bytes": { - "items": { - "type": "integer" - }, - "title": "Bytes", - "type": "array" - }, - "token": { - "title": "Token", - "type": "string" - }, - "logprob": { - "title": "Logprob", - "type": "number" - } - }, - "type": "object" - }, - "title": "Top Logprobs", - "type": "array" - }, - "logprob": { - "title": "Logprob", - "type": "number" - } - }, - "type": "object" - }, - "type": "array" - }, - { - "type": "null" - } - ] - }, - "annotations": { - "items": { - "anyOf": [ - { - "required": [ - "file_id", - "filename", - "index", - "type" - ], - "additionalProperties": true, - "title": "AnnotationFileCitation", - "properties": { - "file_id": { - "title": "File Id", - "type": "string" - }, - "index": { - "title": "Index", - "type": "integer" - }, - "type": { - "const": "file_citation", - "title": "Type", - "type": "string" - }, - "filename": { - "title": "Filename", - "type": "string" - } - }, - "type": "object" - }, - { - "required": [ - "end_index", - "start_index", - "title", - "type", - "url" - ], - "additionalProperties": true, - "title": "AnnotationURLCitation", - "properties": { - "start_index": { - "title": "Start Index", - "type": "integer" - }, - "end_index": { - "title": "End Index", - "type": "integer" - }, - "title": { - "title": "Title", - "type": "string" - }, - "type": { - "const": "url_citation", - "title": "Type", - "type": "string" - }, - "url": { - "title": "Url", - "type": "string" - } - }, - "type": "object" - }, - { - "required": [ - "container_id", - "end_index", - "file_id", - "filename", - "start_index", - "type" - ], - "additionalProperties": true, - "title": "AnnotationContainerFileCitation", - "properties": { - "start_index": { - "title": "Start Index", - "type": "integer" - }, - "end_index": { - "title": "End Index", - "type": "integer" - }, - "type": { - "const": "container_file_citation", - "title": "Type", - "type": "string" - }, - "filename": { - "title": "Filename", - "type": "string" - }, - "file_id": { - "title": "File Id", - "type": "string" - }, - "container_id": { - "title": "Container Id", - "type": "string" - } - }, - "type": "object" - }, - { - "required": [ - "file_id", - "index", - "type" - ], - "additionalProperties": true, - "title": "AnnotationFilePath", - "properties": { - "file_id": { - "title": "File Id", - "type": "string" - }, - "index": { - "title": "Index", - "type": "integer" - }, - "type": { - "const": "file_path", - "title": "Type", - "type": "string" - } - }, - "type": "object" - } - ] - }, - "title": "Annotations", - "type": "array" - } - }, - "type": "object" - }, - { - "required": [ - "type", - "input_audio" - ], - "title": "ResponseInputAudio", - "properties": { - "type": { - "const": "input_audio", - "title": "Type", - "type": "string" - }, - "input_audio": { - "required": [ - "data" - ], - "title": "AudioData", - "properties": { - "data": { - "title": "Data", - "type": "string" - }, - "format": { - "default": "wav", - "title": "Format", - "enum": [ - "wav", - "mp3" - ], - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - }, - { - "required": [ - "type", - "output_audio" - ], - "title": "ResponseOutputAudio", - "properties": { - "audio_transcript": { - "default": null, - "title": "Audio Transcript", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "output_audio": { - "required": [ - "data" - ], - "title": "AudioData", - "properties": { - "data": { - "title": "Data", - "type": "string" - }, - "format": { - "default": "wav", - "title": "Format", - "enum": [ - "wav", - "mp3" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": { - "const": "output_audio", - "title": "Type", - "type": "string" - } - }, - "type": "object" - } - ] - }, - "type": "array" - } - ] - }, - "role": { - "title": "Role", - "enum": [ - "system", - "user", - "assistant", - "developer", - "tool", - "function" - ], - "type": "string" - }, - "tool_calls": { - "default": null, - "title": "Tool Calls", - "anyOf": [ - { - "items": { - "required": [ - "type", - "function", - "id" - ], - "title": "FunctionCall", - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "title": "Function", - "properties": { - "return_value": { - "default": null, - "title": "Return Value", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "name": { - "title": "Name", - "type": "string" - }, - "arguments": { - "title": "Arguments", - "type": "string" - } - }, - "type": "object" - }, - "id": { - "title": "Id", - "type": "string" - }, - "type": { - "const": "function", - "title": "Type", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - { - "type": "null" - } - ] - }, - "reasoning_summary": { - "default": null, - "title": "Reasoning Summary", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "name": { - "default": null, - "title": "Name", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "function_call": { - "default": null, - "anyOf": [ - { - "required": [ - "name", - "arguments" - ], - "title": "Function", - "properties": { - "return_value": { - "default": null, - "title": "Return Value", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "name": { - "title": "Name", - "type": "string" - }, - "arguments": { - "title": "Arguments", - "type": "string" - } - }, - "type": "object" - }, - { - "type": "null" - } - ] - }, - "refusal": { - "default": null, - "title": "Refusal", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "trace_id": { - "default": null, - "title": "Trace Id", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - } - }, - "type": "object" - }, - "title": "Output", - "type": "array" - } - }, - "type": "object" - }, - "title": "Output", - "type": "array" - }, - "input": { - "items": { - "required": [ - "role", - "content" - ], - "title": "ChatMessage", - "properties": { - "tool_call_id": { - "default": null, - "title": "Tool Call Id", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "finish_reason": { - "default": null, - "title": "Finish Reason", - "anyOf": [ - { - "enum": [ - "stop", - "length", - "tool_calls", - "content_filter", - "function_call" - ], - "type": "string" - }, - { - "type": "null" - } - ] - }, - "content": { - "title": "Content", - "anyOf": [ - { - "type": "string" }, - { - "items": { - "anyOf": [ - { - "required": [ - "text", - "type" - ], - "additionalProperties": true, - "title": "ResponseInputText", - "properties": { - "text": { - "title": "Text", - "type": "string" - }, - "type": { - "const": "input_text", - "title": "Type", - "type": "string" - } - }, - "type": "object" - }, - { - "required": [ - "detail", - "type" - ], - "additionalProperties": true, - "title": "ResponseInputImage", - "properties": { - "file_id": { - "default": null, - "title": "File Id", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "detail": { - "title": "Detail", - "enum": [ - "low", - "high", - "auto" - ], - "type": "string" - }, - "type": { - "const": "input_image", - "title": "Type", - "type": "string" - }, - "image_url": { - "default": null, - "title": "Image Url", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - } - }, - "type": "object" - }, - { - "required": [ - "annotations", - "text", - "type" - ], - "additionalProperties": true, - "title": "ResponseOutputText", - "properties": { - "text": { - "title": "Text", - "type": "string" - }, - "type": { - "const": "output_text", - "title": "Type", - "type": "string" - }, - "logprobs": { - "default": null, - "title": "Logprobs", - "anyOf": [ - { - "items": { - "required": [ - "token", - "bytes", - "logprob", - "top_logprobs" - ], - "additionalProperties": true, - "title": "Logprob", - "properties": { - "bytes": { - "items": { - "type": "integer" - }, - "title": "Bytes", - "type": "array" - }, - "token": { - "title": "Token", - "type": "string" - }, - "top_logprobs": { - "items": { - "required": [ - "token", - "bytes", - "logprob" - ], - "additionalProperties": true, - "title": "LogprobTopLogprob", - "properties": { - "bytes": { - "items": { - "type": "integer" - }, - "title": "Bytes", - "type": "array" - }, - "token": { - "title": "Token", - "type": "string" - }, - "logprob": { - "title": "Logprob", - "type": "number" - } - }, - "type": "object" - }, - "title": "Top Logprobs", - "type": "array" - }, - "logprob": { - "title": "Logprob", - "type": "number" - } - }, - "type": "object" - }, - "type": "array" - }, - { - "type": "null" - } - ] - }, - "annotations": { - "items": { - "anyOf": [ - { - "required": [ - "file_id", - "filename", - "index", - "type" - ], - "additionalProperties": true, - "title": "AnnotationFileCitation", - "properties": { - "file_id": { - "title": "File Id", - "type": "string" - }, - "index": { - "title": "Index", - "type": "integer" - }, - "type": { - "const": "file_citation", - "title": "Type", - "type": "string" - }, - "filename": { - "title": "Filename", - "type": "string" - } - }, - "type": "object" - }, - { - "required": [ - "end_index", - "start_index", - "title", - "type", - "url" - ], - "additionalProperties": true, - "title": "AnnotationURLCitation", - "properties": { - "start_index": { - "title": "Start Index", - "type": "integer" - }, - "end_index": { - "title": "End Index", - "type": "integer" - }, - "title": { - "title": "Title", - "type": "string" - }, - "type": { - "const": "url_citation", - "title": "Type", - "type": "string" - }, - "url": { - "title": "Url", - "type": "string" - } - }, - "type": "object" - }, - { - "required": [ - "container_id", - "end_index", - "file_id", - "filename", - "start_index", - "type" - ], - "additionalProperties": true, - "title": "AnnotationContainerFileCitation", - "properties": { - "start_index": { - "title": "Start Index", - "type": "integer" - }, - "end_index": { - "title": "End Index", - "type": "integer" - }, - "type": { - "const": "container_file_citation", - "title": "Type", - "type": "string" - }, - "filename": { - "title": "Filename", - "type": "string" - }, - "file_id": { - "title": "File Id", - "type": "string" - }, - "container_id": { - "title": "Container Id", - "type": "string" - } - }, - "type": "object" - }, - { - "required": [ - "file_id", - "index", - "type" - ], - "additionalProperties": true, - "title": "AnnotationFilePath", - "properties": { - "file_id": { - "title": "File Id", - "type": "string" - }, - "index": { - "title": "Index", - "type": "integer" - }, - "type": { - "const": "file_path", - "title": "Type", - "type": "string" - } - }, - "type": "object" - } - ] - }, - "title": "Annotations", - "type": "array" - } - }, - "type": "object" - }, - { - "required": [ - "type", - "input_audio" - ], - "title": "ResponseInputAudio", - "properties": { - "type": { - "const": "input_audio", - "title": "Type", - "type": "string" - }, - "input_audio": { - "required": [ - "data" - ], - "title": "AudioData", - "properties": { - "data": { - "title": "Data", - "type": "string" - }, - "format": { - "default": "wav", - "title": "Format", - "enum": [ - "wav", - "mp3" - ], - "type": "string" - } - }, - "type": "object" - } - }, - "type": "object" - }, - { - "required": [ - "type", - "output_audio" - ], - "title": "ResponseOutputAudio", - "properties": { - "audio_transcript": { - "default": null, - "title": "Audio Transcript", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "output_audio": { - "required": [ - "data" - ], - "title": "AudioData", - "properties": { - "data": { - "title": "Data", - "type": "string" - }, - "format": { - "default": "wav", - "title": "Format", - "enum": [ - "wav", - "mp3" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": { - "const": "output_audio", - "title": "Type", - "type": "string" - } - }, - "type": "object" - } - ] - }, - "type": "array" - } - ] - }, - "role": { - "title": "Role", - "enum": [ - "system", - "user", - "assistant", - "developer", - "tool", - "function" - ], - "type": "string" - }, - "tool_calls": { - "default": null, - "title": "Tool Calls", - "anyOf": [ - { - "items": { - "required": [ - "type", - "function", - "id" - ], - "title": "FunctionCall", - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "title": "Function", - "properties": { - "return_value": { - "default": null, - "title": "Return Value", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "name": { - "title": "Name", - "type": "string" - }, - "arguments": { - "title": "Arguments", - "type": "string" - } - }, - "type": "object" - }, - "id": { - "title": "Id", - "type": "string" - }, - "type": { - "const": "function", - "title": "Type", - "type": "string" - } - }, - "type": "object" - }, - "type": "array" - }, - { - "type": "null" - } - ] - }, - "reasoning_summary": { - "default": null, - "title": "Reasoning Summary", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "name": { - "default": null, - "title": "Name", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "function_call": { - "default": null, - "anyOf": [ - { - "required": [ - "name", - "arguments" - ], - "title": "Function", - "properties": { - "return_value": { - "default": null, - "title": "Return Value", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "name": { - "title": "Name", - "type": "string" - }, - "arguments": { - "title": "Arguments", - "type": "string" - } - }, - "type": "object" - }, - { - "type": "null" - } - ] - }, - "refusal": { - "default": null, - "title": "Refusal", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "trace_id": { - "default": null, - "title": "Trace Id", - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ] - } - }, - "type": "object" - }, - "title": "Input", - "type": "array" - } - }, - "type": "object" - }, - "sample": { - "required": [ - "model", - "choices" - ], - "properties": { - "output_audio": { - "type": [ - "object", - "null" - ] - }, - "output_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "model": { - "type": "string" - }, - "input_tools": { - "items": { - "type": "object" - }, - "type": "array" - }, - "output_json": { - "type": "object" - }, - "output_reasoning_summary": { - "type": [ - "string", - "null" - ] - }, - "choices": { - "items": { - "required": [ - "index", - "message", - "finish_reason" - ], - "properties": { - "message": { - "required": [ - "role" - ], - "properties": { "role": { "enum": [ "assistant" ], "type": "string" - }, - "function_call": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": [ - "object", - "null" - ] - }, - "content": { - "type": [ - "string", - "array", - "null" - ] - }, - "tool_calls": { - "items": { - "required": [ - "type", - "function", - "id" - ], - "properties": { - "function": { - "required": [ - "name", - "arguments" - ], - "properties": { - "name": { - "type": "string" - }, - "arguments": { - "type": "string" - } - }, - "type": "object" - }, - "id": { - "type": "string" - }, - "type": { - "enum": [ - "function" - ], - "type": "string" - } - }, - "type": "object" - }, - "type": [ - "array", - "null" - ] - }, - "refusal": { - "type": [ - "boolean", - "null" - ] } }, "type": "object" @@ -5363,15 +237,12 @@ http_interactions: } }, "type": "object" - }, - "metadata": { - "usecase": "chatbot" } }, "name": "Sentiment Analysis", "testing_criteria": [ { - "id": "Sentiment grader-8dee6df0-c9c3-4d4f-b500-c21b54aab4c9", + "id": "Sentiment grader-88f5e332-d4f9-4843-aae1-918789dba587", "type": "label_model", "grdr_id": null, "inactive_at": null, @@ -5403,9 +274,9 @@ http_interactions: "metadata": {} } ], - "first_id": "eval_692886ca71948191a94a46ef2866fa38", - "has_more": true, - "last_id": "eval_69288313e4408191922bf1863f2ba432" + "first_id": "eval_69289e005d008191bc07606b2ceb522c", + "has_more": false, + "last_id": "eval_69289e005d008191bc07606b2ceb522c" } - recorded_at: Thu, 27 Nov 2025 17:37:45 GMT + recorded_at: Thu, 27 Nov 2025 18:53:28 GMT recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/cassettes/evals_list_setup.yml b/spec/fixtures/cassettes/evals_list_setup.yml index 8d4d6769..36331d54 100644 --- a/spec/fixtures/cassettes/evals_list_setup.yml +++ b/spec/fixtures/cassettes/evals_list_setup.yml @@ -26,7 +26,7 @@ http_interactions: message: Created headers: Date: - - Thu, 27 Nov 2025 17:37:45 GMT + - Thu, 27 Nov 2025 18:52:48 GMT Content-Type: - application/json Transfer-Encoding: @@ -38,13 +38,13 @@ http_interactions: Openai-Organization: - user-jxm65ijkzc1qrfhc0ij8moic X-Request-Id: - - req_ad59a4421109d03bf16f0dd5d568cb43 + - req_60d32089805131c0071ce9eccb04177f Openai-Processing-Ms: - - '187' + - '550' Vary: - Accept-Encoding X-Envoy-Upstream-Service-Time: - - '189' + - '552' X-Openai-Proxy-Wasm: - v0.1 Strict-Transport-Security: @@ -52,26 +52,26 @@ http_interactions: Cf-Cache-Status: - DYNAMIC Set-Cookie: - - __cf_bm=2URRA0QzprNNuNe_uY_bAL3M.w.SeU2zUU2uu7Rgmz0-1764265065-1.0.1.1-vCfd9v8Qx1oKdvy8E2piuOxlGUK3rPotuNjTxl6IQYJ5WVaFqosWj4aLGxWLszSIoI0n04TnlJXIP.7QK4mv0e4P5vPOsuKUcXtA8Li09Yc; - path=/; expires=Thu, 27-Nov-25 18:07:45 GMT; domain=.api.openai.com; HttpOnly; + - __cf_bm=eFEINPHuc34CMCYL.SDiqLlyW4rJ.vafcOPwqC2BIDE-1764269568-1.0.1.1-CSW4VNVaa1sxqRqpfLPzzndPm4jSUTy_Rv8_rGTkh1X7cM1u91acPyUt_uboA44es5iyv.HJrprMsbi1okkD596sIKD189Iw3ijBcKj2i0A; + path=/; expires=Thu, 27-Nov-25 19:22:48 GMT; domain=.api.openai.com; HttpOnly; Secure; SameSite=None - - _cfuvid=YlC8_GZoyq1p_SeriLDaxuZ8sWYRFXIDpqQJzx6Rz.4-1764265065064-0.0.1.1-604800000; + - _cfuvid=1N0rpQDL2dtXgQFGNlVf.SVMbsPNPJg58wihf2nOrvM-1764269568498-0.0.1.1-604800000; path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None X-Content-Type-Options: - nosniff Server: - cloudflare Cf-Ray: - - 9a53652e5cc9b11c-MAN + - 9a53d31ebd220764-MAN Alt-Svc: - h3=":443"; ma=86400 body: encoding: ASCII-8BIT string: |- { - "id": "eval_69288c68ed50819185ed5845facf0be6", + "id": "eval_69289e005d008191bc07606b2ceb522c", "object": "eval", - "created_at": 1764265064, + "created_at": 1764269568, "data_source_config": { "type": "custom", "max_items": null, @@ -242,7 +242,7 @@ http_interactions: "name": "Sentiment Analysis", "testing_criteria": [ { - "id": "Sentiment grader-cd6a1097-7691-4502-a5f0-49c7e0043428", + "id": "Sentiment grader-88f5e332-d4f9-4843-aae1-918789dba587", "type": "label_model", "grdr_id": null, "inactive_at": null, @@ -273,5 +273,5 @@ http_interactions: ], "metadata": {} } - recorded_at: Thu, 27 Nov 2025 17:37:45 GMT + recorded_at: Thu, 27 Nov 2025 18:52:48 GMT recorded_with: VCR 6.3.1 From b543a4ddfe9715cae826f1a94eafbecf40ab4f49 Mon Sep 17 00:00:00 2001 From: Juan Arboleda <35846576+alzeck@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:03:32 +0000 Subject: [PATCH 4/5] add supported endpoints --- README.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 570e8e36..3c5c4e37 100644 --- a/README.md +++ b/README.md @@ -1684,7 +1684,21 @@ client.messages.list(thread_id: thread_id) ### Evals -Evals allow you to systematically evaluate the quality and performance of your AI models. You can create evaluations with specific testing criteria, run them against your models, and analyze the results. +The [Evals Api](https://platform.openai.com/docs/api-reference/evals) allow you to systematically evaluate the quality and performance of your AI models. + +**Supported Endpoints:** +- `POST /v1/evals` - Create an evaluation +- `GET /v1/evals/{id}` - Retrieve an evaluation +- `GET /v1/evals` - List evaluations +- `POST /v1/evals/{id}` - Update an evaluation +- `DELETE /v1/evals/{id}` - Delete an evaluation +- `POST /v1/evals/{id}/runs` - Create an evaluation run +- `GET /v1/evals/{id}/runs/{run_id}` - Retrieve an evaluation run +- `GET /v1/evals/{id}/runs` - List evaluation runs +- `POST /v1/evals/{id}/runs/{run_id}/cancel` - Cancel an evaluation run +- `DELETE /v1/evals/{id}/runs/{run_id}` - Delete an evaluation run +- `GET /v1/evals/{id}/runs/{run_id}/output_items` - List output items +- `GET /v1/evals/{id}/runs/{run_id}/output_items/{item_id}` - Retrieve an output item #### Create an Eval From dbabfd5496818e672a80ad37f0626043efca5fba Mon Sep 17 00:00:00 2001 From: Juan Arboleda <35846576+alzeck@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:18:14 +0000 Subject: [PATCH 5/5] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c5c4e37..3a26fb41 100644 --- a/README.md +++ b/README.md @@ -1684,7 +1684,7 @@ client.messages.list(thread_id: thread_id) ### Evals -The [Evals Api](https://platform.openai.com/docs/api-reference/evals) allow you to systematically evaluate the quality and performance of your AI models. +The [Evals API](https://platform.openai.com/docs/api-reference/evals) allows you to systematically evaluate the quality and performance of your AI models. **Supported Endpoints:** - `POST /v1/evals` - Create an evaluation