Commit b2fd1c8

python[minor], js[patch]: release py0.2.0, js0.2.9 (#1247)
v0.2

Co-authored-by: Bagatur <baskaryan@gmail.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
1 parent 59a8c09 commit b2fd1c8

File tree

13 files changed: +132 -133 lines

js/package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "name": "langsmith",
-  "version": "0.2.8",
+  "version": "0.2.9",
   "description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.",
   "packageManager": "yarn@1.22.19",
   "files": [

js/src/evaluation/_runner.ts

Lines changed: 39 additions & 31 deletions
@@ -58,17 +58,17 @@ export type SummaryEvaluatorT =
   | DeprecatedSyncSummaryEvaluator
   | DeprecatedAsyncSummaryEvaluator
   | ((args: {
-      runs?: Array<Run>;
-      examples?: Array<Example>;
-      inputs?: Array<Record<string, any>>;
-      outputs?: Array<Record<string, any>>;
+      runs: Array<Run>;
+      examples: Array<Example>;
+      inputs: Array<Record<string, any>>;
+      outputs: Array<Record<string, any>>;
       referenceOutputs?: Array<Record<string, any>>;
     }) => EvaluationResult | EvaluationResults)
   | ((args: {
-      runs?: Array<Run>;
-      examples?: Array<Example>;
-      inputs?: Array<Record<string, any>>;
-      outputs?: Array<Record<string, any>>;
+      runs: Array<Run>;
+      examples: Array<Example>;
+      inputs: Array<Record<string, any>>;
+      outputs: Array<Record<string, any>>;
       referenceOutputs?: Array<Record<string, any>>;
     }) => Promise<EvaluationResult | EvaluationResults>);

@@ -93,17 +93,17 @@ export type EvaluatorT =
   | DeprecatedFunctionEvaluator
   | DeprecatedAsyncFunctionEvaluator
   | ((args: {
-      run?: Run;
-      example?: Example;
-      inputs?: Record<string, any>;
-      outputs?: Record<string, any>;
+      run: Run;
+      example: Example;
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
       referenceOutputs?: Record<string, any>;
     }) => EvaluationResult | EvaluationResults)
   | ((args: {
-      run?: Run;
-      example?: Example;
-      inputs?: Record<string, any>;
-      outputs?: Record<string, any>;
+      run: Run;
+      example: Example;
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
       referenceOutputs?: Record<string, any>;
     }) => Promise<EvaluationResult | EvaluationResults>);

@@ -130,11 +130,6 @@ interface _ExperimentManagerArgs {
 }

 type BaseEvaluateOptions = {
-  /**
-   * The dataset to evaluate on. Can be a dataset name, a list of
-   * examples, or a generator of examples.
-   */
-  data: DataT;
   /**
    * Metadata to attach to the experiment.
    * @default undefined

@@ -178,6 +173,11 @@ export interface EvaluateOptions extends BaseEvaluateOptions {
    * @default undefined
    */
   summaryEvaluators?: Array<SummaryEvaluatorT>;
+  /**
+   * The dataset to evaluate on. Can be a dataset name, a list of
+   * examples, or a generator of examples.
+   */
+  data: DataT;
 }

 export interface ComparativeEvaluateOptions extends BaseEvaluateOptions {

@@ -934,8 +934,10 @@ async function _evaluate(
   );

   let manager = await new _ExperimentManager({
-    data: Array.isArray(fields.data) ? undefined : fields.data,
-    examples: Array.isArray(fields.data) ? fields.data : undefined,
+    data: Array.isArray(standardFields.data) ? undefined : standardFields.data,
+    examples: Array.isArray(standardFields.data)
+      ? standardFields.data
+      : undefined,
     client,
     metadata: fields.metadata,
     experiment: experiment_ ?? fields.experimentPrefix,

@@ -1063,10 +1065,12 @@ function _resolveData(
 async function wrapSummaryEvaluators(
   evaluators: SummaryEvaluatorT[],
   optionsArray?: Partial<RunTreeConfig>[]
-): Promise<SummaryEvaluatorT[]> {
+): Promise<
+  Array<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator>
+> {
   async function _wrap(
     evaluator: SummaryEvaluatorT
-  ): Promise<SummaryEvaluatorT> {
+  ): Promise<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator> {
     const evalName = evaluator.name || "BatchEvaluator";

     const wrapperInner = (

@@ -1087,10 +1091,10 @@ async function wrapSummaryEvaluators(
         return Promise.resolve(
           (
             evaluator as (args: {
-              runs?: Run[];
-              examples?: Example[];
-              inputs?: Record<string, any>[];
-              outputs?: Record<string, any>[];
+              runs: Run[];
+              examples: Example[];
+              inputs: Record<string, any>[];
+              outputs: Record<string, any>[];
               referenceOutputs?: Record<string, any>[];
             }) => EvaluationResult | EvaluationResults
           )({

@@ -1103,7 +1107,9 @@ async function wrapSummaryEvaluators(
         );
       }
       // Otherwise use the traditional (runs, examples) signature
-      return Promise.resolve(evaluator(runs, examples));
+      return Promise.resolve(
+        (evaluator as DeprecatedSyncSummaryEvaluator)(runs, examples)
+      );
     },
     { ...optionsArray, name: evalName }
   );

@@ -1119,7 +1125,9 @@ async function wrapSummaryEvaluators(
     return wrapperInner;
   }

-  const results: SummaryEvaluatorT[] = [];
+  const results: Array<
+    DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator
+  > = [];
  for (let i = 0; i < evaluators.length; i++) {
    results.push(await _wrap(evaluators[i]));
  }
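
Read together, these hunks make the object-style evaluator arguments (runs, examples, inputs, outputs) required instead of optional, and move the data option from BaseEvaluateOptions onto EvaluateOptions, so only standard (non-comparative) evaluations accept a dataset. Below is a minimal sketch of a summary evaluator against the new types; the dataset name "my-dataset", the question/answer fields, and the scoring rule are illustrative assumptions, not part of this commit:

import { evaluate } from "langsmith/evaluation";
import type { Run, Example } from "langsmith/schemas";

// Under the new SummaryEvaluatorT, runs/examples/inputs/outputs are always
// provided, so no undefined guards are needed before using them.
const exactMatchRate = (args: {
  runs: Array<Run>;
  examples: Array<Example>;
  inputs: Array<Record<string, any>>;
  outputs: Array<Record<string, any>>;
  referenceOutputs?: Array<Record<string, any>>;
}) => {
  const matches = args.outputs.filter(
    (out, i) => out.answer === args.referenceOutputs?.[i]?.answer // illustrative fields
  ).length;
  return { key: "exact_match_rate", score: matches / args.outputs.length };
};

// `data` now lives on EvaluateOptions, so standard evaluate() calls still
// require it, while ComparativeEvaluateOptions no longer accepts it.
await evaluate((input: Record<string, any>) => ({ answer: input.question }), {
  data: "my-dataset", // hypothetical dataset name
  summaryEvaluators: [exactMatchRate],
});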

js/src/evaluation/evaluate_comparative.ts

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ export type _ComparativeEvaluator = (args: {
   runs: Run[];
   example: Example;
   inputs: Record<string, any>;
-  outputs?: Record<string, any>[];
+  outputs: Record<string, any>[];
   referenceOutputs?: Record<string, any>;
 }) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>;
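
The corresponding change for comparative evaluation: outputs is now guaranteed, with one entry per compared run (outputs[i] pairing with runs[i]). A hedged sketch of a function matching the _ComparativeEvaluator shape; the shortest-answer scoring rule is an illustrative assumption:

import type { Run, Example } from "langsmith/schemas";

// Comparative evaluator sketch: score each run, keyed by run id.
const preferShorterAnswer = (args: {
  runs: Run[];
  example: Example;
  inputs: Record<string, any>;
  outputs: Record<string, any>[]; // required as of this commit
  referenceOutputs?: Record<string, any>;
}) => {
  // Length of each run's serialized output; shortest wins.
  const lengths = args.outputs.map((o) => JSON.stringify(o).length);
  const shortest = Math.min(...lengths);
  return {
    key: "prefer_shorter_answer",
    scores: Object.fromEntries(
      args.runs.map((run, i) => [run.id, lengths[i] === shortest ? 1 : 0])
    ),
  };
};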

js/src/evaluation/evaluator.ts

Lines changed: 13 additions & 8 deletions
@@ -96,18 +96,23 @@ export type RunEvaluatorLike =
       example?: Example
     ) => Promise<EvaluationResult | EvaluationResults>)
   | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults)
+  | ((
+      run: Run,
+      example: Example
+    ) => Promise<EvaluationResult | EvaluationResults>)
+  | ((run: Run, example: Example) => EvaluationResult | EvaluationResults)
   | ((args: {
-      run?: Run;
-      example?: Example;
-      inputs?: Record<string, any>;
-      outputs?: Record<string, any>;
+      run: Run;
+      example: Example;
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
       referenceOutputs?: Record<string, any>;
     }) => EvaluationResult | EvaluationResults)
   | ((args: {
-      run?: Run;
-      example?: Example;
-      inputs?: Record<string, any>;
-      outputs?: Record<string, any>;
+      run: Run;
+      example: Example;
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
       referenceOutputs?: Record<string, any>;
     }) => Promise<EvaluationResult | EvaluationResults>);
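
RunEvaluatorLike gains overloads whose example is required, and its args-object forms now require run, example, inputs, and outputs, mirroring the EvaluatorT change above. A minimal per-run evaluator against the new signature; the answer field is an assumption for illustration:

import type { Run, Example } from "langsmith/schemas";

// Args-object evaluator: everything except referenceOutputs is guaranteed,
// so the body can read outputs directly.
const correctness = (args: {
  run: Run;
  example: Example;
  inputs: Record<string, any>;
  outputs: Record<string, any>;
  referenceOutputs?: Record<string, any>;
}) => ({
  key: "correctness",
  score: args.outputs.answer === args.referenceOutputs?.answer ? 1 : 0, // illustrative fields
});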

js/src/index.ts

Lines changed: 1 addition & 1 deletion
@@ -18,4 +18,4 @@ export { RunTree, type RunTreeConfig } from "./run_trees.js";
 export { overrideFetchImplementation } from "./singletons/fetch.js";

 // Update using yarn bump-version
-export const __version__ = "0.2.8";
+export const __version__ = "0.2.9";

js/src/tests/evaluate.int.test.ts

Lines changed: 11 additions & 54 deletions
@@ -3,6 +3,7 @@ import {
   EvaluationResults,
 } from "../evaluation/evaluator.js";
 import { evaluate } from "../evaluation/_runner.js";
+import { waitUntilRunFound } from "./utils.js";
 import { Example, Run, TracerSession } from "../schemas.js";
 import { Client } from "../index.js";
 import { afterAll, beforeAll } from "@jest/globals";

@@ -1115,6 +1116,8 @@ test("evaluate handles partial summary evaluator parameters correctly", async () => {
 });

 test("evaluate handles comparative target with ComparativeEvaluateOptions", async () => {
+  const client = new Client();
+
   // First, create two experiments to compare
   const targetFunc1 = (input: Record<string, any>) => {
     return {

@@ -1139,13 +1142,18 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", async () => {
     description: "Second experiment for comparison",
   });

+  await Promise.all(
+    [exp1, exp2].flatMap(({ results }) =>
+      results.flatMap(({ run }) => waitUntilRunFound(client, run.id))
+    )
+  );
   // Create comparative evaluator
   const comparativeEvaluator = ({
     runs,
     example,
   }: {
-    runs?: Run[];
-    example?: Example;
+    runs: Run[];
+    example: Example;
   }) => {
     if (!runs || !example) throw new Error("Missing required parameters");

@@ -1167,7 +1175,6 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", async () => {
   const compareRes = await evaluate(
     [exp1.experimentName, exp2.experimentName],
     {
-      data: TESTING_DATASET_NAME,
       evaluators: [comparativeEvaluator],
       description: "Comparative evaluation test",
       randomizeOrder: true,

@@ -1177,6 +1184,7 @@ test("evaluate handles comparative target with ComparativeEvaluateOptions", async () => {

   // Verify we got ComparisonEvaluationResults
   expect(compareRes.experimentName).toBeDefined();
+  expect(compareRes.experimentName).toBeDefined();
   expect(compareRes.results).toBeDefined();
   expect(Array.isArray(compareRes.results)).toBe(true);

@@ -1212,59 +1220,8 @@ test("evaluate enforces correct evaluator types for comparative evaluation at runtime", async () => {
   await expect(
     // @ts-expect-error - Should error because standardEvaluator is not a ComparativeEvaluator
     evaluate([exp1.experimentName, exp2.experimentName], {
-      data: TESTING_DATASET_NAME,
       evaluators: [standardEvaluator],
       description: "Should fail at runtime",
     })
   ).rejects.toThrow(); // You might want to be more specific about the error message
 });
-
-test("evaluate comparative options includes comparative-specific fields", async () => {
-  const exp1 = await evaluate(
-    (input: Record<string, any>) => ({ foo: input.input + 1 }),
-    {
-      data: TESTING_DATASET_NAME,
-    }
-  );
-
-  const exp2 = await evaluate(
-    (input: Record<string, any>) => ({ foo: input.input + 2 }),
-    {
-      data: TESTING_DATASET_NAME,
-    }
-  );
-
-  const comparativeEvaluator = ({
-    runs,
-    example,
-  }: {
-    runs?: Run[];
-    example?: Example;
-  }) => {
-    if (!runs || !example) throw new Error("Missing required parameters");
-    return {
-      key: "comparative_score",
-      scores: Object.fromEntries(
-        runs.map((run) => [
-          run.id,
-          run.outputs?.foo === example.outputs?.output ? 1 : 0,
-        ])
-      ),
-    };
-  };
-
-  // Test that comparative-specific options work
-  const compareRes = await evaluate(
-    [exp1.experimentName, exp2.experimentName],
-    {
-      data: TESTING_DATASET_NAME,
-      evaluators: [comparativeEvaluator],
-      randomizeOrder: true, // Comparative-specific option
-      loadNested: true, // Comparative-specific option
-      description: "Testing comparative-specific options",
-    }
-  );
-
-  expect(compareRes.experimentName).toBeDefined();
-  expect(compareRes.results).toBeDefined();
-});

python/langsmith/client.py

Lines changed: 14 additions & 7 deletions
@@ -5842,7 +5842,7 @@ def evaluate(
     metadata: Optional[dict] = None,
     experiment_prefix: Optional[str] = None,
     description: Optional[str] = None,
-    max_concurrency: Optional[int] = None,
+    max_concurrency: Optional[int] = 0,
     num_repetitions: int = 1,
     blocking: bool = True,
     experiment: Optional[EXPERIMENT_T] = None,

@@ -5861,7 +5861,7 @@ def evaluate(
     metadata: Optional[dict] = None,
     experiment_prefix: Optional[str] = None,
     description: Optional[str] = None,
-    max_concurrency: Optional[int] = None,
+    max_concurrency: Optional[int] = 0,
     num_repetitions: int = 1,
     blocking: bool = True,
     experiment: Optional[EXPERIMENT_T] = None,

@@ -5883,7 +5883,7 @@ def evaluate(
     metadata: Optional[dict] = None,
     experiment_prefix: Optional[str] = None,
     description: Optional[str] = None,
-    max_concurrency: Optional[int] = None,
+    max_concurrency: Optional[int] = 0,
     num_repetitions: int = 1,
     blocking: bool = True,
     experiment: Optional[EXPERIMENT_T] = None,

@@ -5911,7 +5911,8 @@ def evaluate(
             Defaults to None.
         description (str | None): A free-form text description for the experiment.
         max_concurrency (int | None): The maximum number of concurrent
-            evaluations to run. Defaults to None (max number of workers).
+            evaluations to run. If None then no limit is set. If 0 then no concurrency.
+            Defaults to 0.
         blocking (bool): Whether to block until the evaluation is complete.
             Defaults to True.
         num_repetitions (int): The number of times to run the evaluation.

@@ -6053,6 +6054,8 @@ def evaluate(
         ...     summary_evaluators=[precision],
         ... ) # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
+
+        .. versionadded:: 0.2.0
     """  # noqa: E501
     from langsmith.evaluation._runner import evaluate as evaluate_

@@ -6094,7 +6097,7 @@ async def aevaluate(
     metadata: Optional[dict] = None,
     experiment_prefix: Optional[str] = None,
     description: Optional[str] = None,
-    max_concurrency: Optional[int] = None,
+    max_concurrency: Optional[int] = 0,
     num_repetitions: int = 1,
     blocking: bool = True,
     experiment: Optional[Union[schemas.TracerSession, str, uuid.UUID]] = None,

@@ -6119,8 +6122,9 @@ async def aevaluate(
         experiment_prefix (Optional[str]): A prefix to provide for your experiment name.
             Defaults to None.
         description (Optional[str]): A description of the experiment.
-        max_concurrency (Optional[int]): The maximum number of concurrent
-            evaluations to run. Defaults to None.
+        max_concurrency (int | None): The maximum number of concurrent
+            evaluations to run. If None then no limit is set. If 0 then no concurrency.
+            Defaults to 0.
         num_repetitions (int): The number of times to run the evaluation.
             Each item in the dataset will be run and evaluated this many times.
             Defaults to 1.

@@ -6259,6 +6263,9 @@ async def aevaluate(
         ...     )
         ... ) # doctest: +ELLIPSIS
         View the evaluation results for experiment:...
+
+        .. versionadded:: 0.2.0
+
     """  # noqa: E501
     from langsmith.evaluation._arunner import aevaluate as aevaluate_
