OPIK-224 [SDK] Create new LLM eval metric - G-eval (#402)
* OPIK-224 [SDK] Create new LLM eval metric - G-eval

* rename old project name in docstrings

* extend base opik model class

* add litellm model

* extend base opik model class

* update litellm model

* migrate to litellm in models factory (now langchain is optional)

* add support of model response format messages for langchain

* calc g-eval with probabilities

* post-merge fix

* add lazy initialization of LLM's chain of thought

* use gpt-4o model by default
japdubengsub authored Oct 28, 2024
1 parent 1f45f05 commit 18cb086
Showing 17 changed files with 191 additions and 12 deletions.
2 changes: 2 additions & 0 deletions sdks/python/src/opik/evaluation/metrics/__init__.py
@@ -6,6 +6,7 @@
from .llm_judges.answer_relevance.metric import AnswerRelevance
from .llm_judges.context_precision.metric import ContextPrecision
from .llm_judges.context_recall.metric import ContextRecall
from .llm_judges.g_eval.metric import GEval
from .llm_judges.hallucination.metric import Hallucination
from .llm_judges.moderation.metric import Moderation
from .base_metric import BaseMetric
@@ -20,6 +21,7 @@
"ContextRecall",
"Equals",
# "Factuality",
"GEval",
"Hallucination",
"IsJson",
"LevenshteinRatio",
@@ -15,7 +15,7 @@ class Contains(base_metric.BaseMetric):
name: The name of the metric. Defaults to "contains_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import Contains
+ >>> from opik.evaluation.metrics import Contains
>>> contains_metric = Contains(case_sensitive=True)
>>> result = contains_metric.score("Hello, World!", "World")
>>> print(result.value)
@@ -15,7 +15,7 @@ class Equals(base_metric.BaseMetric):
name: The name of the metric. Defaults to "equals_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import Equals
+ >>> from opik.evaluation.metrics import Equals
>>> equals_metric = Equals(case_sensitive=True)
>>> result = equals_metric.score("Hello, World!", "Hello, World!")
>>> print(result.value)
@@ -15,7 +15,7 @@ class IsJson(base_metric.BaseMetric):
name: The name of the metric. Defaults to "is_json_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import IsJson
+ >>> from opik.evaluation.metrics import IsJson
>>> is_json_metric = IsJson()
>>> result = is_json_metric.score('{"key": "value"}')
>>> print(result.value)
@@ -22,7 +22,7 @@ class LevenshteinRatio(base_metric.BaseMetric):
name: The name of the metric. Defaults to "levenshtein_ratio_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import LevenshteinRatio
+ >>> from opik.evaluation.metrics import LevenshteinRatio
>>> levenshtein_metric = LevenshteinRatio(case_sensitive=True)
>>> result = levenshtein_metric.score("Hello, World!", "Hello, World")
>>> print(result.value)
@@ -16,7 +16,7 @@ class RegexMatch(base_metric.BaseMetric):
name: The name of the metric. Defaults to "regex_match_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import RegexMatch
+ >>> from opik.evaluation.metrics import RegexMatch
>>> regex_metric = RegexMatch(r"\d{3}-\d{2}-\d{4}")
>>> result = regex_metric.score("My SSN is 123-45-6789")
>>> print(result.value)
@@ -25,7 +25,7 @@ class AnswerRelevance(base_metric.BaseMetric):
name: The name of the metric. Defaults to "AnswerRelevanceMetric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import AnswerRelevance
+ >>> from opik.evaluation.metrics import AnswerRelevance
>>> answer_relevance_metric = AnswerRelevance()
>>> result = answer_relevance_metric.score("What's the capital of France?", "The capital of France is Paris.", ["France is a country in Europe."])
>>> print(result.value)
@@ -26,7 +26,7 @@ class ContextPrecision(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
Example:
- >>> from comet_llm_eval.evaluation.metrics import ContextPrecision
+ >>> from opik.evaluation.metrics import ContextPrecision
>>> context_precision_metric = ContextPrecision()
>>> result = context_precision_metric.score("What's the capital of France?", "The capital of France is Paris.", "Paris", ["France is a country in Europe."])
>>> print(result.value)
@@ -26,7 +26,7 @@ class ContextRecall(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
Example:
- >>> from comet_llm_eval.evaluation.metrics import ContextRecall
+ >>> from opik.evaluation.metrics import ContextRecall
>>> context_recall_metric = ContextRecall()
>>> result = context_recall_metric.score("What's the capital of France?", "The capital of France is Paris.", "Paris", ["France is a country in Europe."])
>>> print(result.value)
@@ -27,7 +27,7 @@ class Factuality(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
Example:
- >>> from comet_llm_eval.evaluation.metrics import Factuality
+ >>> from opik.evaluation.metrics import Factuality
>>> factuality_metric = Factuality()
>>> result = factuality_metric.score("What's the capital of France?", "The capital of France is Paris.", ["France is a country in Europe."])
>>> print(result.value) # A float between 0.0 and 1.0
Empty file.
139 changes: 139 additions & 0 deletions sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py
@@ -0,0 +1,139 @@
import math
from functools import cached_property
from typing import Any, Optional, Union

from litellm.types.utils import ModelResponse

from opik.evaluation.metrics import base_metric, score_result
from opik.evaluation.models import base_model, models_factory
from opik.logging_messages import GEVAL_SCORE_CALC_FAILED
from .template import G_EVAL_COT_TEMPLATE, G_EVAL_QUERY_TEMPLATE
from ... import exceptions


class GEval(base_metric.BaseMetric):
def __init__(
self,
task_introduction: str,
evaluation_criteria: str,
model: Optional[Union[str, base_model.OpikBaseModel]] = None,
name: str = "g_eval_metric",
):
super().__init__(
name=name,
)
self._init_model(model)

self.task_introduction = task_introduction
self.evaluation_criteria = evaluation_criteria

@cached_property
def llm_chain_of_thought(self) -> str:
prompt = G_EVAL_COT_TEMPLATE.format(
task_introduction=self.task_introduction,
evaluation_criteria=self.evaluation_criteria,
)
return self._model.generate_string(input=prompt)

def _init_model(
self, model: Optional[Union[str, base_model.OpikBaseModel]]
) -> None:
if isinstance(model, base_model.OpikBaseModel):
self._model = model
else:
self._model = models_factory.get(
model_name=model,
must_support_arguments=["logprobs", "top_logprobs"],
# we do not use additional params here as we need to get LLM's "Chain Of Thought" first
# logprobs=True,
# top_logprobs=20,
# response_format=GEvalScoreFormat,
)

def score(
self,
input: str,
**ignored_kwargs: Any,
) -> score_result.ScoreResult:
llm_query = G_EVAL_QUERY_TEMPLATE.format(
task_introduction=self.task_introduction,
evaluation_criteria=self.evaluation_criteria,
chain_of_thought=self.llm_chain_of_thought,
input=input,
)

request = [
{
"content": llm_query,
"role": "user",
},
]

model_output = self._model.generate_provider_response(
messages=request,
logprobs=True,
top_logprobs=20,
)

return self._parse_model_output(model_output)

async def ascore(
self, input: str, **ignored_kwargs: Any
) -> score_result.ScoreResult:
llm_query = G_EVAL_QUERY_TEMPLATE.format(
task_introduction=self.task_introduction,
evaluation_criteria=self.evaluation_criteria,
chain_of_thought=self.llm_chain_of_thought,
input=input,
)

request = [
{
"content": llm_query,
"role": "user",
},
]

model_output = await self._model.agenerate_provider_response(
messages=request,
logprobs=True,
top_logprobs=20,
)

return self._parse_model_output(model_output)

def _parse_model_output(self, content: ModelResponse) -> score_result.ScoreResult:
try:
# original_score = content.choices[0].model_extra['logprobs']['content'][0]['token']
top_logprobs = content.choices[0].model_extra["logprobs"]["content"][0][
"top_logprobs"
]

linear_probs_sum = 0.0
weighted_score_sum = 0.0

for token_info in top_logprobs:
# if not a number
if not token_info["token"].isdecimal():
continue

score = int(token_info["token"])

# if score value not in scale
if not 0 <= score <= 10:
continue

log_prob = token_info["logprob"]
linear_prob = math.exp(log_prob)

linear_probs_sum += linear_prob
weighted_score_sum += linear_prob * score

final_score: float = weighted_score_sum / linear_probs_sum / 10

if not (0.0 <= final_score <= 1.0):
raise ValueError

return score_result.ScoreResult(name=self.name, value=final_score)
except Exception:
raise exceptions.MetricComputationError(GEVAL_SCORE_CALC_FAILED)
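
A note on the scoring above: `_parse_model_output` does not take the model's single score token at face value. It computes a probability-weighted average over the top-20 candidate tokens returned by the provider and normalizes it to the 0-1 range, mirroring the probability-weighted summation described in the G-Eval paper. A minimal standalone sketch of that calculation, using made-up `top_logprobs` entries rather than real model output:

```python
import math

# Hypothetical top_logprobs entries for the score token, mimicking the shape
# litellm returns when logprobs=True and top_logprobs=20 are requested.
top_logprobs = [
    {"token": "8", "logprob": -0.2},
    {"token": "7", "logprob": -1.9},
    {"token": "9", "logprob": -2.6},
    {"token": "ten", "logprob": -5.0},  # non-numeric tokens are skipped
]

linear_probs_sum = 0.0
weighted_score_sum = 0.0
for token_info in top_logprobs:
    if not token_info["token"].isdecimal():
        continue  # ignore tokens that are not plain integers
    score = int(token_info["token"])
    if not 0 <= score <= 10:
        continue  # ignore integers outside the 0-10 scale
    linear_prob = math.exp(token_info["logprob"])  # log-prob -> probability
    linear_probs_sum += linear_prob
    weighted_score_sum += linear_prob * score

# Expected score over the candidate tokens, normalized to [0, 1].
final_score = weighted_score_sum / linear_probs_sum / 10
print(round(final_score, 3))  # ~0.793 for the numbers above
```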
@@ -0,0 +1,36 @@
G_EVAL_COT_TEMPLATE = """
*** TASK:
Based on the following task description and evaluation criteria,
generate a detailed Chain of Thought (CoT) that outlines the necessary Evaluation Steps
to assess the solution. The CoT should clarify the reasoning process for each step of evaluation.
*** INPUT:
TASK INTRODUCTION:
{task_introduction}
EVALUATION CRITERIA:
{evaluation_criteria}
FINAL SCORE:
IF THE USER'S SCALE IS DIFFERENT FROM THE 0 TO 10 RANGE, RECALCULATE THE VALUE USING THIS SCALE.
SCORE VALUE MUST BE AN INTEGER.
"""


G_EVAL_QUERY_TEMPLATE = """
*** TASK INTRODUCTION:
{task_introduction}
*** EVALUATION CRITERIA:
{evaluation_criteria}
{chain_of_thought}
*** INPUT:
{input}
*** OUTPUT:
NO TEXT, ONLY SCORE
"""
@@ -25,7 +25,7 @@ class Hallucination(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to use for hallucination detection. If None, default examples will be used.
Example:
- >>> from comet_llm_eval.evaluation.metrics import Hallucination
+ >>> from opik.evaluation.metrics import Hallucination
>>> hallucination_metric = Hallucination()
>>> result = hallucination_metric.score(
... input="What is the capital of France?",
@@ -24,7 +24,7 @@ class Moderation(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
Example:
- >>> from comet_llm_eval.evaluation.metrics import Moderation
+ >>> from opik.evaluation.metrics import Moderation
>>> moderation_metric = Moderation()
>>> result = moderation_metric.score("Hello", "Hello, how can I help you?")
>>> print(result.value) # A float between 0.0 and 1.0
2 changes: 1 addition & 1 deletion sdks/python/src/opik/evaluation/models/models_factory.py
@@ -1,7 +1,7 @@
from typing import Optional, Any
from . import base_model, litellm_chat_model

- DEFAULT_GPT_MODEL_NAME = "gpt-3.5-turbo"
+ DEFAULT_GPT_MODEL_NAME = "gpt-4o"


def get(model_name: Optional[str], **model_kwargs: Any) -> base_model.OpikBaseModel:
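
The default model for LLM-judge metrics therefore moves from gpt-3.5-turbo to gpt-4o. Callers can still pass a model name explicitly; a sketch under the assumption that the chosen provider returns logprobs/top_logprobs, which GEval requires for its weighted scoring (the model string is only an example):

```python
from opik.evaluation.metrics import GEval

metric = GEval(
    task_introduction="Judge the coherence of the summary.",
    evaluation_criteria="The summary should be logically ordered and self-consistent.",
    model="gpt-3.5-turbo",  # example override of the new gpt-4o default
)
```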
2 changes: 2 additions & 0 deletions sdks/python/src/opik/logging_messages.py
@@ -36,6 +36,8 @@

CONTEXT_RECALL_SCORE_CALC_FAILED = "Failed to calculate context recall score"

GEVAL_SCORE_CALC_FAILED = "Failed to calculate g-eval score"

CONTEXT_PRECISION_SCORE_CALC_FAILED = "Failed to calculate context precision score"

NESTED_SPAN_PROJECT_NAME_MISMATCH_WARNING_MESSAGE = (
