Skip to content

Commit c07500e

Browse files
baskaryan and isahers1 authored
python[minor]: pytest integration (#1362)
Log pytest tests to LangSmith. Useful for:

- Evaluations where the eval logic is different for each datapoint, making it difficult to use generic evaluators on a whole dataset
- Unit tests where you want both the local pytest experience and the ability to trace and share results

### Install

```bash
pip install "langsmith[pytest]==0.2.11rc14"
```

### Simple usage

```python
# tests/test_simple.py
import pytest
from langsmith import testing as t


@pytest.mark.langsmith
def test_addition_single():
    x = 3
    y = 4
    # directly log example inputs if you don't want to use fixtures
    t.log_inputs({"x": x, "y": y})

    expected = 7
    # directly log example outputs if you don't want to use fixtures
    t.log_reference_outputs({"sum": expected})

    actual = x + y
    # directly log run outputs
    t.log_outputs({"sum": actual})

    # test pass/fail automatically logged to langsmith
    assert actual == expected
```

Run

```bash
pytest --outputs='ls' tests/test_foo.py
```

Results

<img width="952" alt="Screenshot 2025-01-08 at 2 53 04 AM" src="https://github.com/user-attachments/assets/d695747b-cfdf-4248-b5fd-c5c420aa92ec" />

### Advanced usage

```python
# tests/test_advanced.py
import openai
import pytest
from langsmith import wrappers
from langsmith import testing as t

oai_client = wrappers.wrap_openai(openai.Client())


@pytest.mark.langsmith
def test_openai_says_hello():
    # Traced code will be included in the test case
    text = "Say hello!"
    response = oai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": text},
        ],
    )
    t.log_inputs({"text": text})
    t.log_outputs({"response": response.choices[0].message.content})
    t.log_reference_outputs({"response": "hello!"})

    # Use this context manager to trace any steps used for generating evaluation
    # feedback separately from the main application logic
    with t.trace_feedback():
        grade = oai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Return 1 if 'hello' is in the user message and 0 otherwise.",
                },
                {"role": "user", "content": response.choices[0].message.content},
            ],
        )
        t.log_feedback(
            key="llm_judge", score=float(grade.choices[0].message.content)
        )

    assert "hello" in response.choices[0].message.content.lower()


@pytest.mark.langsmith(output_keys=["expected"])
@pytest.mark.parametrize(
    "a, b, expected",
    [(1, 2, 3), (3, 4, 7)],
)
def test_addition_parametrized(a: int, b: int, expected: int):
    t.log_outputs({"sum": a + b})
    assert a + b == expected
```

Run using pytest-xdist to parallelize (`pip install pytest-xdist` first)

```bash
LANGSMITH_TEST_SUITE="Test suite name" LANGSMITH_EXPERIMENT="Experiment name" pytest --outputs='ls' tests
```

Results: https://dev.smith.langchain.com/public/cea0e7fd-2d27-47d1-8ada-141069acdf0d/d

<img width="1030" alt="Screenshot 2025-01-08 at 3 07 51 AM" src="https://github.com/user-attachments/assets/db770d00-67b3-4b53-8e33-289c4f0edfeb" />

---------

Co-authored-by: isaac hershenson <ihershenson@hmc.edu>
1 parent b812149 commit c07500e

File tree

13 files changed

+2643
-1633
lines changed

13 files changed

+2643
-1633
lines changed

python/docs/create_api_rst.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -108,18 +108,6 @@ def _load_module_members(module_path: str, namespace: str) -> ModuleMembers:
108108
else "Pydantic" if issubclass(type_, BaseModel) else "Regular"
109109
)
110110
)
111-
# if hasattr(type_, "__slots__"):
112-
# for func_name, func_type in inspect.getmembers(type_):
113-
# if inspect.isfunction(func_type):
114-
# functions.append(
115-
# FunctionInfo(
116-
# name=func_name,
117-
# qualified_name=f"{namespace}.{name}.{func_name}",
118-
# is_public=not func_name.startswith("_"),
119-
# is_deprecated=".. deprecated::"
120-
# in (func_type.__doc__ or ""),
121-
# )
122-
# )
123111
classes_.append(
124112
ClassInfo(
125113
name=name,
@@ -156,7 +144,7 @@ def _load_package_modules(
156144
if file_path.name not in {
157145
"_runner.py",
158146
"_arunner.py",
159-
"_testing.py",
147+
"_internal.py",
160148
"_expect.py",
161149
"_openai.py",
162150
}:
@@ -200,6 +188,7 @@ def _load_package_modules(
200188
"utils",
201189
"anonymizer",
202190
"wrappers",
191+
"testing",
203192
]
204193

205194

@@ -387,20 +376,17 @@ def _build_index(package_version: str) -> None:
387376
| [AsyncClient](async_client/langsmith.async_client.AsyncClient) | Asynchronous client for interacting with the LangSmith API. |
388377
| [traceable](run_helpers/langsmith.run_helpers.traceable) | Wrapper/decorator for tracing any function. |
389378
| [wrap_openai](wrappers/langsmith.wrappers._openai.wrap_openai) | Wrapper for OpenAI client, adds LangSmith tracing to all OpenAI calls. |
390-
| [evaluate](evaluation/langsmith.evaluation._runner.evaluate) | Evaluate an application on a dataset. |
391-
| [aevaluate](evaluation/langsmith.evaluation._arunner.aevaluate) | Asynchronously evaluate an application on a dataset. |
392-
| [unit](_testing/langsmith._testing.unit) | Create a LangSmith unit test. |
379+
| [@pytest.mark.langsmith](/testing/langsmith.testing._internal.test) | LangSmith pytest integration. |
393380
394381
```{{toctree}}
395382
:maxdepth: 2
396383
:hidden:
397384
398385
client<client>
399386
async_client<async_client>
400-
evaluation<evaluation>
401387
run_helpers<run_helpers>
402388
wrappers<wrappers>
403-
_testing<_testing>
389+
testing<testing>
404390
```
405391
406392
"""

python/langsmith/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
if TYPE_CHECKING:
77
from langsmith._expect import expect
8-
from langsmith._testing import test, unit
98
from langsmith.async_client import AsyncClient
109
from langsmith.client import Client
1110
from langsmith.evaluation import aevaluate, evaluate
@@ -18,6 +17,7 @@
1817
tracing_context,
1918
)
2019
from langsmith.run_trees import RunTree
20+
from langsmith.testing._internal import test, unit
2121
from langsmith.utils import (
2222
ContextThreadPoolExecutor,
2323
)
@@ -63,7 +63,7 @@ def __getattr__(name: str) -> Any:
6363
return traceable
6464

6565
elif name == "test":
66-
from langsmith._testing import test
66+
from langsmith.testing._internal import test
6767

6868
return test
6969

@@ -104,7 +104,7 @@ def __getattr__(name: str) -> Any:
104104
return get_current_run_tree
105105

106106
elif name == "unit":
107-
from langsmith._testing import unit
107+
from langsmith.testing._internal import unit
108108

109109
return unit
110110
elif name == "ContextThreadPoolExecutor":

python/langsmith/_expect.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ def value(self, value: Any) -> _Matcher:
379379

380380
def score(
381381
self,
382-
score: Union[float, int],
382+
score: Union[float, int, bool],
383383
*,
384384
key: str = "score",
385385
source_run_id: Optional[ls_client.ID_TYPE] = None,

python/langsmith/_internal/_beta_decorator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@ class LangSmithBetaWarning(UserWarning):
88

99

1010
@functools.lru_cache(maxsize=100)
11-
def _warn_once(message: str) -> None:
12-
warnings.warn(message, LangSmithBetaWarning, stacklevel=2)
11+
def _warn_once(message: str, stacklevel: int = 2) -> None:
12+
warnings.warn(message, LangSmithBetaWarning, stacklevel=stacklevel)
1313

1414

1515
def warn_beta(func: Callable) -> Callable:
1616
@functools.wraps(func)
1717
def wrapper(*args, **kwargs):
18-
_warn_once(f"Function {func.__name__} is in beta.")
18+
_warn_once(f"Function {func.__name__} is in beta.", stacklevel=3)
1919
return func(*args, **kwargs)
2020

2121
return wrapper

0 commit comments

Comments
 (0)