fix(py): Rename output_keys to reference_output_keys #1499

Open · wants to merge 4 commits into base: main
50 changes: 31 additions & 19 deletions python/langsmith/testing/_internal.py
@@ -1,4 +1,4 @@
from __future__ import annotations

GitHub Actions / benchmark — benchmark results (annotation on python/langsmith/testing/_internal.py):

create_5_000_run_trees: Mean +- std dev: 680 ms +- 64 ms
create_10_000_run_trees: Mean +- std dev: 1.35 sec +- 0.13 sec
create_20_000_run_trees: Mean +- std dev: 2.68 sec +- 0.17 sec
dumps_class_nested_py_branch_and_leaf_200x400: Mean +- std dev: 714 us +- 17 us
dumps_class_nested_py_leaf_50x100: Mean +- std dev: 25.1 ms +- 0.4 ms
dumps_class_nested_py_leaf_100x200: Mean +- std dev: 104 ms +- 3 ms
dumps_dataclass_nested_50x100: Mean +- std dev: 25.2 ms +- 0.2 ms
dumps_pydantic_nested_50x100: Mean +- std dev: 72.1 ms +- 16.4 ms (warning: this result may be unstable; the standard deviation is 23% of the mean)
dumps_pydanticv1_nested_50x100: Mean +- std dev: 197 ms +- 3 ms

GitHub Actions / benchmark — comparison against main:

| Benchmark                                     | main     | changes                |
|-----------------------------------------------|----------|------------------------|
| dumps_pydanticv1_nested_50x100                | 221 ms   | 197 ms: 1.12x faster   |
| create_10_000_run_trees                       | 1.42 sec | 1.35 sec: 1.05x faster |
| create_20_000_run_trees                       | 2.78 sec | 2.68 sec: 1.03x faster |
| dumps_class_nested_py_leaf_50x100             | 25.7 ms  | 25.1 ms: 1.02x faster  |
| dumps_dataclass_nested_50x100                 | 25.7 ms  | 25.2 ms: 1.02x faster  |
| dumps_class_nested_py_leaf_100x200            | 105 ms   | 104 ms: 1.01x faster   |
| dumps_class_nested_py_branch_and_leaf_200x400 | 715 us   | 714 us: 1.00x faster   |
| create_5_000_run_trees                        | 674 ms   | 680 ms: 1.01x slower   |
| dumps_pydantic_nested_50x100                  | 68.6 ms  | 72.1 ms: 1.05x slower  |
| Geometric mean                                | (ref)    | 1.02x faster           |

import atexit
import contextlib
@@ -69,6 +69,7 @@
*,
id: Optional[uuid.UUID] = None,
output_keys: Optional[Sequence[str]] = None,
reference_output_keys: Optional[Sequence[str]] = None,
client: Optional[ls_client.Client] = None,
test_suite_name: Optional[str] = None,
) -> Callable[[Callable], Callable]: ...
@@ -86,9 +87,13 @@
- id (Optional[uuid.UUID]): A unique identifier for the test case. If not
provided, an ID will be generated based on the test function's module
and name.
- output_keys (Optional[Sequence[str]]): A list of keys to be considered as
the output keys for the test case. These keys will be extracted from the
test function's inputs and stored as the expected outputs.
- output_keys (Optional[Sequence[str]], deprecated): Use
"reference_output_keys" instead. A list of keys to be considered as the
output keys for the test case.
- reference_output_keys (Optional[Sequence[str]]): A list of keys to be
considered as the reference output keys for the test case. These keys
will be extracted from the test function's inputs and stored as the
expected outputs.
- client (Optional[ls_client.Client]): An instance of the LangSmith client
to be used for communication with the LangSmith service. If not provided,
a default client will be used.
@@ -238,7 +243,7 @@
import pytest


@pytest.mark.langsmith(output_keys=["expected"])
@pytest.mark.langsmith(reference_output_keys=["expected"])
@pytest.mark.parametrize(
"a, b, expected",
[
@@ -266,7 +271,7 @@
assert 3 * 4 == 12

By default, all test inputs are saved as "inputs" to a dataset.
You can specify the `output_keys` argument to persist those keys
You can specify the `reference_output_keys` argument to persist those keys
within the dataset's "outputs" fields.

.. code-block:: python
@@ -279,7 +284,7 @@
return "input"


@pytest.mark.langsmith(output_keys=["expected_output"])
@pytest.mark.langsmith(reference_output_keys=["expected_output"])
def test_with_expected_output(some_input: str, expected_output: str):
assert expected_output in some_input

@@ -297,9 +302,18 @@
test_openai_says_hello()
test_addition_with_multiple_inputs(1, 2, 3)
"""
if "output_keys" in kwargs:
warnings.warn(
"The `output_keys` keyword argument is deprecated. "
"Please use `reference_output_keys` instead.",
DeprecationWarning,
)
reference_output_keys = kwargs.pop("output_keys")
else:
reference_output_keys = kwargs.pop("reference_output_keys", None)
langtest_extra = _UTExtra(
id=kwargs.pop("id", None),
output_keys=kwargs.pop("output_keys", None),
reference_output_keys=reference_output_keys,
client=kwargs.pop("client", None),
test_suite_name=kwargs.pop("test_suite_name", None),
cache=ls_utils.get_cache_dir(kwargs.pop("cache", None)),
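For reviewers, a minimal usage sketch of the shim above (test names are illustrative, not part of this diff), mirroring the docstring examples: the new kwarg is the preferred spelling, while the deprecated one is still accepted and forwarded to reference_output_keys with a DeprecationWarning.

    import pytest

    # Preferred spelling after this PR:
    @pytest.mark.langsmith(reference_output_keys=["expected"])
    @pytest.mark.parametrize("a, b, expected", [(1, 2, 3)])
    def test_addition_new(a: int, b: int, expected: int) -> None:
        assert a + b == expected


    # Deprecated spelling: still accepted by the shim above, which warns and
    # forwards the value to reference_output_keys.
    @pytest.mark.langsmith(output_keys=["expected"])
    @pytest.mark.parametrize("a, b, expected", [(1, 2, 3)])
    def test_addition_old(a: int, b: int, expected: int) -> None:
        assert a + b == expected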
@@ -691,6 +705,8 @@
self.pytest_plugin = pytest_plugin
self.pytest_nodeid = pytest_nodeid
self._logged_reference_outputs: Optional[dict] = None
self.inputs = inputs
self.reference_outputs = reference_outputs

if pytest_plugin and pytest_nodeid:
pytest_plugin.add_process_to_test_suite(
@@ -787,7 +803,7 @@
class _UTExtra(TypedDict, total=False):
client: Optional[ls_client.Client]
id: Optional[uuid.UUID]
output_keys: Optional[Sequence[str]]
reference_output_keys: Optional[Sequence[str]]
test_suite_name: Optional[str]
cache: Optional[str]

@@ -808,19 +824,19 @@
**kwargs: Any,
) -> _TestCase:
client = langtest_extra["client"] or rt.get_cached_client()
output_keys = langtest_extra["output_keys"]
reference_output_keys = langtest_extra["reference_output_keys"]
signature = inspect.signature(func)
inputs = rh._get_inputs_safe(signature, *args, **kwargs) or None
outputs = None
if output_keys:
if reference_output_keys:
outputs = {}
if not inputs:
msg = (
"'output_keys' should only be specified when marked test function has "
"input arguments."
"`reference_output_keys` should only be specified when the "
"marked test function has input arguments."
)
raise ValueError(msg)
for k in output_keys:
for k in reference_output_keys:
outputs[k] = inputs.pop(k, None)
test_suite = _LangSmithTestSuite.from_test(
client, func, langtest_extra.get("test_suite_name")
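To make the extraction above concrete, a small sketch with illustrative values (not taken from the diff): each reference key is popped out of the captured inputs and stored as an expected output, and supplying reference_output_keys on a test with no input arguments raises the ValueError shown.

    # Illustrative values only; mirrors the pop loop above.
    inputs = {"question": "What is 2 + 2?", "expected": "4"}
    reference_output_keys = ["expected"]

    outputs = {}
    for k in reference_output_keys:
        outputs[k] = inputs.pop(k, None)

    assert inputs == {"question": "What is 2 + 2?"}
    assert outputs == {"expected": "4"}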
@@ -866,16 +882,14 @@
langtest_extra=langtest_extra,
)
_TEST_CASE.set(test_case)
func_sig = inspect.signature(func)
func_inputs = rh._get_inputs_safe(func_sig, *test_args, **test_kwargs)

def _test():
test_case.start_time()
with rh.trace(
name=getattr(func, "__name__", "Test"),
run_id=test_case.run_id,
reference_example_id=test_case.example_id,
inputs=func_inputs,
inputs=test_case.inputs,
project_name=test_case.test_suite.name,
exceptions_to_handle=(SkipException,),
_end_on_exit=False,
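Context for the inputs=test_case.inputs change here and in the async variant below: the test case already carries the inputs with the reference keys popped out, so reusing them avoids a second signature inspection and keeps reference keys out of the traced inputs. A sketch with assumed values:

    # Assumed values for illustration; with reference_output_keys=["expected"]
    # and a call test_fn(a=1, b=2, expected=3):
    raw_call_inputs = {"a": 1, "b": 2, "expected": 3}   # what re-inspecting the call would yield
    case_inputs = {"a": 1, "b": 2}                      # test_case.inputs: reference keys removed
    reference_outputs = {"expected": 3}                 # stored as the example's expected outputs

    assert {**case_inputs, **reference_outputs} == raw_call_inputs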
@@ -936,16 +950,14 @@
langtest_extra=langtest_extra,
)
_TEST_CASE.set(test_case)
func_sig = inspect.signature(func)
func_inputs = rh._get_inputs_safe(func_sig, *test_args, **test_kwargs)

async def _test():
test_case.start_time()
with rh.trace(
name=getattr(func, "__name__", "Test"),
run_id=test_case.run_id,
reference_example_id=test_case.example_id,
inputs=func_inputs,
inputs=test_case.inputs,
project_name=test_case.test_suite.name,
exceptions_to_handle=(SkipException,),
_end_on_exit=False,
4 changes: 2 additions & 2 deletions python/tests/evaluation/test_decorator.py
@@ -59,7 +59,7 @@ async def test_openai_says_hello():
reason="LANGSMITH_TRACING environment variable not set",
)
@pytest.mark.xfail(reason="Test failure output case")
@pytest.mark.langsmith(output_keys=["expected"])
@pytest.mark.langsmith(reference_output_keys=["expected"])
@pytest.mark.parametrize(
"a, b, expected",
[
@@ -98,7 +98,7 @@ def reference_outputs() -> int:
not os.getenv("LANGSMITH_TRACING"),
reason="LANGSMITH_TRACING environment variable not set",
)
@pytest.mark.langsmith(output_keys=["reference_outputs"])
@pytest.mark.langsmith(reference_output_keys=["reference_outputs"])
def test_fixture(inputs: int, reference_outputs: int):
result = 2 * inputs
t.log_outputs({"d": result})
Expand Down