From 18cb086caab6ab751098752c8ce3d04366c98c97 Mon Sep 17 00:00:00 2001 From: Alexander Barannikov <32936723+japdubengsub@users.noreply.github.com> Date: Mon, 28 Oct 2024 15:03:09 +0000 Subject: [PATCH] OPIK-224 [SDK] Create new LLM eval metric - G-eval (#402) * OPIK-224 [SDK] Create new LLM eval metric - G-eval * rename old project name in docstrings * extend base opik model class * add litellm model * extend base opik model class * update litellm model * migrate to litellm in models factory (now langchain is optional) * add support of model response format messages for langchain * calc g-eval with probabilities * post-merge fix * add lazy initialization fo LLM's chain of thought * use gpt-4o model by default --- .../src/opik/evaluation/metrics/__init__.py | 2 + .../evaluation/metrics/heuristics/contains.py | 2 +- .../evaluation/metrics/heuristics/equals.py | 2 +- .../evaluation/metrics/heuristics/is_json.py | 2 +- .../metrics/heuristics/levenshtein_ratio.py | 2 +- .../metrics/heuristics/regex_match.py | 2 +- .../llm_judges/answer_relevance/metric.py | 2 +- .../llm_judges/context_precision/metric.py | 2 +- .../llm_judges/context_recall/metric.py | 2 +- .../metrics/llm_judges/factuality/metric.py | 2 +- .../metrics/llm_judges/g_eval/__init__.py | 0 .../metrics/llm_judges/g_eval/metric.py | 139 ++++++++++++++++++ .../metrics/llm_judges/g_eval/template.py | 36 +++++ .../llm_judges/hallucination/metric.py | 2 +- .../metrics/llm_judges/moderation/metric.py | 2 +- .../opik/evaluation/models/models_factory.py | 2 +- sdks/python/src/opik/logging_messages.py | 2 + 17 files changed, 191 insertions(+), 12 deletions(-) create mode 100644 sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/__init__.py create mode 100644 sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py create mode 100644 sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/template.py diff --git a/sdks/python/src/opik/evaluation/metrics/__init__.py b/sdks/python/src/opik/evaluation/metrics/__init__.py index d98640f174..83aeff56ea 100644 --- a/sdks/python/src/opik/evaluation/metrics/__init__.py +++ b/sdks/python/src/opik/evaluation/metrics/__init__.py @@ -6,6 +6,7 @@ from .llm_judges.answer_relevance.metric import AnswerRelevance from .llm_judges.context_precision.metric import ContextPrecision from .llm_judges.context_recall.metric import ContextRecall +from .llm_judges.g_eval.metric import GEval from .llm_judges.hallucination.metric import Hallucination from .llm_judges.moderation.metric import Moderation from .base_metric import BaseMetric @@ -20,6 +21,7 @@ "ContextRecall", "Equals", # "Factuality", + "GEval", "Hallucination", "IsJson", "LevenshteinRatio", diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/contains.py b/sdks/python/src/opik/evaluation/metrics/heuristics/contains.py index 19ca93a57c..e427a0c711 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/contains.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/contains.py @@ -15,7 +15,7 @@ class Contains(base_metric.BaseMetric): name: The name of the metric. Defaults to "contains_metric". 
Example: - >>> from comet_llm_eval.evaluation.metrics import Contains + >>> from opik.evaluation.metrics import Contains >>> contains_metric = Contains(case_sensitive=True) >>> result = contains_metric.score("Hello, World!", "World") >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/equals.py b/sdks/python/src/opik/evaluation/metrics/heuristics/equals.py index 866b3c4ee0..bc94a2e51a 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/equals.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/equals.py @@ -15,7 +15,7 @@ class Equals(base_metric.BaseMetric): name: The name of the metric. Defaults to "equals_metric". Example: - >>> from comet_llm_eval.evaluation.metrics import Equals + >>> from opik.evaluation.metrics import Equals >>> equals_metric = Equals(case_sensitive=True) >>> result = equals_metric.score("Hello, World!", "Hello, World!") >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/is_json.py b/sdks/python/src/opik/evaluation/metrics/heuristics/is_json.py index f26ad64b98..d095fefbf4 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/is_json.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/is_json.py @@ -15,7 +15,7 @@ class IsJson(base_metric.BaseMetric): name: The name of the metric. Defaults to "is_json_metric". Example: - >>> from comet_llm_eval.evaluation.metrics import IsJson + >>> from opik.evaluation.metrics import IsJson >>> is_json_metric = IsJson() >>> result = is_json_metric.score('{"key": "value"}') >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/levenshtein_ratio.py b/sdks/python/src/opik/evaluation/metrics/heuristics/levenshtein_ratio.py index 9edee9fceb..bdd8a3f3aa 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/levenshtein_ratio.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/levenshtein_ratio.py @@ -22,7 +22,7 @@ class LevenshteinRatio(base_metric.BaseMetric): name: The name of the metric. Defaults to "levenshtein_ratio_metric". Example: - >>> from comet_llm_eval.evaluation.metrics import LevenshteinRatio + >>> from opik.evaluation.metrics import LevenshteinRatio >>> levenshtein_metric = LevenshteinRatio(case_sensitive=True) >>> result = levenshtein_metric.score("Hello, World!", "Hello, World") >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/regex_match.py b/sdks/python/src/opik/evaluation/metrics/heuristics/regex_match.py index ab1e3277ee..94128e1339 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/regex_match.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/regex_match.py @@ -16,7 +16,7 @@ class RegexMatch(base_metric.BaseMetric): name: The name of the metric. Defaults to "regex_match_metric". Example: - >>> from comet_llm_eval.evaluation.metrics import RegexMatch + >>> from opik.evaluation.metrics import RegexMatch >>> regex_metric = RegexMatch(r"\d{3}-\d{2}-\d{4}") >>> result = regex_metric.score("My SSN is 123-45-6789") >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py index d86e23dbd5..6a4588a538 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py @@ -25,7 +25,7 @@ class AnswerRelevance(base_metric.BaseMetric): name: The name of the metric. 
Defaults to "AnswerRelevanceMetric". Example: - >>> from comet_llm_eval.evaluation.metrics import AnswerRelevance + >>> from opik.evaluation.metrics import AnswerRelevance >>> answer_relevance_metric = AnswerRelevance() >>> result = answer_relevance_metric.score("What's the capital of France?", "The capital of France is Paris.", ["France is a country in Europe."]) >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py index edc42190b2..69893527bf 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py @@ -26,7 +26,7 @@ class ContextPrecision(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples. Example: - >>> from comet_llm_eval.evaluation.metrics import ContextPrecision + >>> from opik.evaluation.metrics import ContextPrecision >>> context_precision_metric = ContextPrecision() >>> result = context_precision_metric.score("What's the capital of France?", "The capital of France is Paris.", "Paris", ["France is a country in Europe."]) >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py index 49fd29831f..b17feba2e5 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py @@ -26,7 +26,7 @@ class ContextRecall(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples. Example: - >>> from comet_llm_eval.evaluation.metrics import ContextRecall + >>> from opik.evaluation.metrics import ContextRecall >>> context_recall_metric = ContextRecall() >>> result = context_recall_metric.score("What's the capital of France?", "The capital of France is Paris.", "Paris", ["France is a country in Europe."]) >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py index 3112207015..32c22ee43d 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py @@ -27,7 +27,7 @@ class Factuality(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used. 
Example: - >>> from comet_llm_eval.evaluation.metrics import Factuality + >>> from opik.evaluation.metrics import Factuality >>> factuality_metric = Factuality() >>> result = factuality_metric.score("What's the capital of France?", "The capital of France is Paris.", ["France is a country in Europe."]) >>> print(result.value) # A float between 0.0 and 1.0 diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/__init__.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py new file mode 100644 index 0000000000..c78bcbb4c8 --- /dev/null +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py @@ -0,0 +1,139 @@ +import math +from functools import cached_property +from typing import Any, Optional, Union + +from litellm.types.utils import ModelResponse + +from opik.evaluation.metrics import base_metric, score_result +from opik.evaluation.models import base_model, models_factory +from opik.logging_messages import GEVAL_SCORE_CALC_FAILED +from .template import G_EVAL_COT_TEMPLATE, G_EVAL_QUERY_TEMPLATE +from ... import exceptions + + +class GEval(base_metric.BaseMetric): + def __init__( + self, + task_introduction: str, + evaluation_criteria: str, + model: Optional[Union[str, base_model.OpikBaseModel]] = None, + name: str = "g_eval_metric", + ): + super().__init__( + name=name, + ) + self._init_model(model) + + self.task_introduction = task_introduction + self.evaluation_criteria = evaluation_criteria + + @cached_property + def llm_chain_of_thought(self) -> str: + prompt = G_EVAL_COT_TEMPLATE.format( + task_introduction=self.task_introduction, + evaluation_criteria=self.evaluation_criteria, + ) + return self._model.generate_string(input=prompt) + + def _init_model( + self, model: Optional[Union[str, base_model.OpikBaseModel]] + ) -> None: + if isinstance(model, base_model.OpikBaseModel): + self._model = model + else: + self._model = models_factory.get( + model_name=model, + must_support_arguments=["logprobs", "top_logprobs"], + # we do not use additional params here as we need to get LLM's "Chain Of Thought" first + # logprobs=True, + # top_logprobs=20, + # response_format=GEvalScoreFormat, + ) + + def score( + self, + input: str, + **ignored_kwargs: Any, + ) -> score_result.ScoreResult: + llm_query = G_EVAL_QUERY_TEMPLATE.format( + task_introduction=self.task_introduction, + evaluation_criteria=self.evaluation_criteria, + chain_of_thought=self.llm_chain_of_thought, + input=input, + ) + + request = [ + { + "content": llm_query, + "role": "user", + }, + ] + + model_output = self._model.generate_provider_response( + messages=request, + logprobs=True, + top_logprobs=20, + ) + + return self._parse_model_output(model_output) + + async def ascore( + self, input: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + llm_query = G_EVAL_QUERY_TEMPLATE.format( + task_introduction=self.task_introduction, + evaluation_criteria=self.evaluation_criteria, + chain_of_thought=self.llm_chain_of_thought, + input=input, + ) + + request = [ + { + "content": llm_query, + "role": "user", + }, + ] + + model_output = await self._model.agenerate_provider_response( + messages=request, + logprobs=True, + top_logprobs=20, + ) + + return self._parse_model_output(model_output) + + def _parse_model_output(self, content: ModelResponse) -> score_result.ScoreResult: + try: + 
# original_score = content.choices[0].model_extra['logprobs']['content'][0]['token'] + top_logprobs = content.choices[0].model_extra["logprobs"]["content"][0][ + "top_logprobs" + ] + + linear_probs_sum = 0.0 + weighted_score_sum = 0.0 + + for token_info in top_logprobs: + # if not a number + if not token_info["token"].isdecimal(): + continue + + score = int(token_info["token"]) + + # if score value not in scale + if not 0 <= score <= 10: + continue + + log_prob = token_info["logprob"] + linear_prob = math.exp(log_prob) + + linear_probs_sum += linear_prob + weighted_score_sum += linear_prob * score + + final_score: float = weighted_score_sum / linear_probs_sum / 10 + + if not (0.0 <= final_score <= 1.0): + raise ValueError + + return score_result.ScoreResult(name=self.name, value=final_score) + except Exception: + raise exceptions.MetricComputationError(GEVAL_SCORE_CALC_FAILED) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/template.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/template.py new file mode 100644 index 0000000000..a67282c178 --- /dev/null +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/template.py @@ -0,0 +1,36 @@ +G_EVAL_COT_TEMPLATE = """ +*** TASK: +Based on the following task description and evaluation criteria, +generate a detailed Chain of Thought (CoT) that outlines the necessary Evaluation Steps +to assess the solution. The CoT should clarify the reasoning process for each step of evaluation. + +*** INPUT: + +TASK INTRODUCTION: +{task_introduction} + +EVALUATION CRITERIA: +{evaluation_criteria} + +FINAL SCORE: +IF THE USER'S SCALE IS DIFFERENT FROM THE 0 TO 10 RANGE, RECALCULATE THE VALUE USING THIS SCALE. +SCORE VALUE MUST BE AN INTEGER. + +""" + + +G_EVAL_QUERY_TEMPLATE = """ +*** TASK INTRODUCTION: +{task_introduction} + +*** EVALUATION CRITERIA: +{evaluation_criteria} + +{chain_of_thought} + +*** INPUT: +{input} + +*** OUTPUT: +NO TEXT, ONLY SCORE +""" diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py index 71272bc4c1..1d61d919d1 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py @@ -25,7 +25,7 @@ class Hallucination(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to use for hallucination detection. If None, default examples will be used. Example: - >>> from comet_llm_eval.evaluation.metrics import Hallucination + >>> from opik.evaluation.metrics import Hallucination >>> hallucination_metric = Hallucination() >>> result = hallucination_metric.score( ... input="What is the capital of France?", diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py index e4878009bb..7ea6e66d82 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py @@ -24,7 +24,7 @@ class Moderation(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used. 
Example: - >>> from comet_llm_eval.evaluation.metrics import Moderation + >>> from opik.evaluation.metrics import Moderation >>> moderation_metric = Moderation() >>> result = moderation_metric.score("Hello", "Hello, how can I help you?") >>> print(result.value) # A float between 0.0 and 1.0 diff --git a/sdks/python/src/opik/evaluation/models/models_factory.py b/sdks/python/src/opik/evaluation/models/models_factory.py index f03b6711a3..06c787bc20 100644 --- a/sdks/python/src/opik/evaluation/models/models_factory.py +++ b/sdks/python/src/opik/evaluation/models/models_factory.py @@ -1,7 +1,7 @@ from typing import Optional, Any from . import base_model, litellm_chat_model -DEFAULT_GPT_MODEL_NAME = "gpt-3.5-turbo" +DEFAULT_GPT_MODEL_NAME = "gpt-4o" def get(model_name: Optional[str], **model_kwargs: Any) -> base_model.OpikBaseModel: diff --git a/sdks/python/src/opik/logging_messages.py b/sdks/python/src/opik/logging_messages.py index 974457d542..c7f6736200 100644 --- a/sdks/python/src/opik/logging_messages.py +++ b/sdks/python/src/opik/logging_messages.py @@ -36,6 +36,8 @@ CONTEXT_RECALL_SCORE_CALC_FAILED = "Failed to calculate context recall score" +GEVAL_SCORE_CALC_FAILED = "Failed to calculate g-eval score" + CONTEXT_PRECISION_SCORE_CALC_FAILED = "Failed to calculate context precision score" NESTED_SPAN_PROJECT_NAME_MISMATCH_WARNING_MESSAGE = (
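
Usage note (not part of the patch itself): a minimal sketch of how the new GEval metric added here can be called, mirroring the doctest-style examples used by the other metric docstrings. Only the import path, the constructor arguments, and the score() signature come from this patch; the task introduction, evaluation criteria, and input strings below are illustrative placeholders. If no model is passed, models_factory.get() resolves to the new default, gpt-4o, and the chain-of-thought prompt is generated lazily on first use via cached_property.

    >>> from opik.evaluation.metrics import GEval
    >>> g_eval_metric = GEval(
    ...     task_introduction="You are an expert judge. Decide whether OUTPUT is faithful to INPUT.",  # placeholder text
    ...     evaluation_criteria="OUTPUT must not contradict or add facts beyond INPUT.",  # placeholder text
    ... )
    >>> result = g_eval_metric.score(
    ...     input="INPUT: France is a country in Europe. OUTPUT: The capital of France is Paris."  # placeholder text
    ... )
    >>> print(result.value)  # A float between 0.0 and 1.0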
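
Scoring note: the "calc g-eval with probabilities" item in the commit message is implemented in _parse_model_output, which weights every integer score token on the 0 to 10 scale found in the top 20 log-probabilities by exp(logprob), normalizes by the captured probability mass, and divides by 10 to land in the 0.0 to 1.0 range. The following self-contained sketch reproduces that arithmetic with made-up token/logprob values, purely for illustration.

    import math

    # Hypothetical top_logprobs entries for the judge's first output token.
    top_logprobs = [
        {"token": "8", "logprob": -0.36},    # ~70% probability
        {"token": "9", "logprob": -1.61},    # ~20% probability
        {"token": "7", "logprob": -2.30},    # ~10% probability
        {"token": "The", "logprob": -4.61},  # non-numeric tokens are skipped
    ]

    linear_probs_sum = 0.0
    weighted_score_sum = 0.0
    for token_info in top_logprobs:
        if not token_info["token"].isdecimal():
            continue  # keep only integer score tokens
        score = int(token_info["token"])
        if not 0 <= score <= 10:
            continue  # keep only scores on the 0-10 scale
        linear_prob = math.exp(token_info["logprob"])
        linear_probs_sum += linear_prob
        weighted_score_sum += linear_prob * score

    final_score = weighted_score_sum / linear_probs_sum / 10
    print(round(final_score, 2))  # 0.81: a probability-weighted expectation, not a single sampled score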