OPIK-224 [SDK] Create new LLM eval metric - G-eval (#402)
* OPIK-224 [SDK] Create new LLM eval metric - G-eval

* rename old project name in docstrings

* extend base opik model class

* add litellm model

* extend base opik model class

* update litellm model

* migrate to litellm in models factory (now langchain is optional)

* add support of model response format messages for langchain

* calc g-eval with probabilities

* post-merge fix

* add lazy initialization of LLM's chain of thought

* use gpt-4o model by default
japdubengsub authored Oct 28, 2024
1 parent 1f45f05 commit 18cb086
Showing 17 changed files with 191 additions and 12 deletions.
2 changes: 2 additions & 0 deletions sdks/python/src/opik/evaluation/metrics/__init__.py
@@ -6,6 +6,7 @@
from .llm_judges.answer_relevance.metric import AnswerRelevance
from .llm_judges.context_precision.metric import ContextPrecision
from .llm_judges.context_recall.metric import ContextRecall
from .llm_judges.g_eval.metric import GEval
from .llm_judges.hallucination.metric import Hallucination
from .llm_judges.moderation.metric import Moderation
from .base_metric import BaseMetric
@@ -20,6 +21,7 @@
"ContextRecall",
"Equals",
# "Factuality",
"GEval",
"Hallucination",
"IsJson",
"LevenshteinRatio",
@@ -15,7 +15,7 @@ class Contains(base_metric.BaseMetric):
name: The name of the metric. Defaults to "contains_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import Contains
+ >>> from opik.evaluation.metrics import Contains
>>> contains_metric = Contains(case_sensitive=True)
>>> result = contains_metric.score("Hello, World!", "World")
>>> print(result.value)
@@ -15,7 +15,7 @@ class Equals(base_metric.BaseMetric):
name: The name of the metric. Defaults to "equals_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import Equals
+ >>> from opik.evaluation.metrics import Equals
>>> equals_metric = Equals(case_sensitive=True)
>>> result = equals_metric.score("Hello, World!", "Hello, World!")
>>> print(result.value)
@@ -15,7 +15,7 @@ class IsJson(base_metric.BaseMetric):
name: The name of the metric. Defaults to "is_json_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import IsJson
+ >>> from opik.evaluation.metrics import IsJson
>>> is_json_metric = IsJson()
>>> result = is_json_metric.score('{"key": "value"}')
>>> print(result.value)
@@ -22,7 +22,7 @@ class LevenshteinRatio(base_metric.BaseMetric):
name: The name of the metric. Defaults to "levenshtein_ratio_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import LevenshteinRatio
+ >>> from opik.evaluation.metrics import LevenshteinRatio
>>> levenshtein_metric = LevenshteinRatio(case_sensitive=True)
>>> result = levenshtein_metric.score("Hello, World!", "Hello, World")
>>> print(result.value)
@@ -16,7 +16,7 @@ class RegexMatch(base_metric.BaseMetric):
name: The name of the metric. Defaults to "regex_match_metric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import RegexMatch
+ >>> from opik.evaluation.metrics import RegexMatch
>>> regex_metric = RegexMatch(r"\d{3}-\d{2}-\d{4}")
>>> result = regex_metric.score("My SSN is 123-45-6789")
>>> print(result.value)
@@ -25,7 +25,7 @@ class AnswerRelevance(base_metric.BaseMetric):
name: The name of the metric. Defaults to "AnswerRelevanceMetric".
Example:
- >>> from comet_llm_eval.evaluation.metrics import AnswerRelevance
+ >>> from opik.evaluation.metrics import AnswerRelevance
>>> answer_relevance_metric = AnswerRelevance()
>>> result = answer_relevance_metric.score("What's the capital of France?", "The capital of France is Paris.", ["France is a country in Europe."])
>>> print(result.value)
@@ -26,7 +26,7 @@ class ContextPrecision(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
Example:
- >>> from comet_llm_eval.evaluation.metrics import ContextPrecision
+ >>> from opik.evaluation.metrics import ContextPrecision
>>> context_precision_metric = ContextPrecision()
>>> result = context_precision_metric.score("What's the capital of France?", "The capital of France is Paris.", "Paris", ["France is a country in Europe."])
>>> print(result.value)
@@ -26,7 +26,7 @@ class ContextRecall(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples.
Example:
- >>> from comet_llm_eval.evaluation.metrics import ContextRecall
+ >>> from opik.evaluation.metrics import ContextRecall
>>> context_recall_metric = ContextRecall()
>>> result = context_recall_metric.score("What's the capital of France?", "The capital of France is Paris.", "Paris", ["France is a country in Europe."])
>>> print(result.value)
@@ -27,7 +27,7 @@ class Factuality(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
Example:
- >>> from comet_llm_eval.evaluation.metrics import Factuality
+ >>> from opik.evaluation.metrics import Factuality
>>> factuality_metric = Factuality()
>>> result = factuality_metric.score("What's the capital of France?", "The capital of France is Paris.", ["France is a country in Europe."])
>>> print(result.value) # A float between 0.0 and 1.0
Empty file.
139 changes: 139 additions & 0 deletions sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py
@@ -0,0 +1,139 @@
import math
from functools import cached_property
from typing import Any, Optional, Union

from litellm.types.utils import ModelResponse

from opik.evaluation.metrics import base_metric, score_result
from opik.evaluation.models import base_model, models_factory
from opik.logging_messages import GEVAL_SCORE_CALC_FAILED
from .template import G_EVAL_COT_TEMPLATE, G_EVAL_QUERY_TEMPLATE
from ... import exceptions


class GEval(base_metric.BaseMetric):
def __init__(
self,
task_introduction: str,
evaluation_criteria: str,
model: Optional[Union[str, base_model.OpikBaseModel]] = None,
name: str = "g_eval_metric",
):
super().__init__(
name=name,
)
self._init_model(model)

self.task_introduction = task_introduction
self.evaluation_criteria = evaluation_criteria

@cached_property
def llm_chain_of_thought(self) -> str:
prompt = G_EVAL_COT_TEMPLATE.format(
task_introduction=self.task_introduction,
evaluation_criteria=self.evaluation_criteria,
)
return self._model.generate_string(input=prompt)

def _init_model(
self, model: Optional[Union[str, base_model.OpikBaseModel]]
) -> None:
if isinstance(model, base_model.OpikBaseModel):
self._model = model
else:
self._model = models_factory.get(
model_name=model,
must_support_arguments=["logprobs", "top_logprobs"],
# we do not use additional params here as we need to get LLM's "Chain Of Thought" first
# logprobs=True,
# top_logprobs=20,
# response_format=GEvalScoreFormat,
)

def score(
self,
input: str,
**ignored_kwargs: Any,
) -> score_result.ScoreResult:
llm_query = G_EVAL_QUERY_TEMPLATE.format(
task_introduction=self.task_introduction,
evaluation_criteria=self.evaluation_criteria,
chain_of_thought=self.llm_chain_of_thought,
input=input,
)

request = [
{
"content": llm_query,
"role": "user",
},
]

model_output = self._model.generate_provider_response(
messages=request,
logprobs=True,
top_logprobs=20,
)

return self._parse_model_output(model_output)

async def ascore(
self, input: str, **ignored_kwargs: Any
) -> score_result.ScoreResult:
llm_query = G_EVAL_QUERY_TEMPLATE.format(
task_introduction=self.task_introduction,
evaluation_criteria=self.evaluation_criteria,
chain_of_thought=self.llm_chain_of_thought,
input=input,
)

request = [
{
"content": llm_query,
"role": "user",
},
]

model_output = await self._model.agenerate_provider_response(
messages=request,
logprobs=True,
top_logprobs=20,
)

return self._parse_model_output(model_output)

def _parse_model_output(self, content: ModelResponse) -> score_result.ScoreResult:
try:
# original_score = content.choices[0].model_extra['logprobs']['content'][0]['token']
top_logprobs = content.choices[0].model_extra["logprobs"]["content"][0][
"top_logprobs"
]

linear_probs_sum = 0.0
weighted_score_sum = 0.0

for token_info in top_logprobs:
# if not a number
if not token_info["token"].isdecimal():
continue

score = int(token_info["token"])

# if score value not in scale
if not 0 <= score <= 10:
continue

log_prob = token_info["logprob"]
linear_prob = math.exp(log_prob)

linear_probs_sum += linear_prob
weighted_score_sum += linear_prob * score

final_score: float = weighted_score_sum / linear_probs_sum / 10

if not (0.0 <= final_score <= 1.0):
raise ValueError

return score_result.ScoreResult(name=self.name, value=final_score)
except Exception:
raise exceptions.MetricComputationError(GEVAL_SCORE_CALC_FAILED)
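
A note on the scoring above: `_parse_model_output` does not take the model's single score token at face value. It computes a probability-weighted average over the top-20 candidate tokens returned by the provider and normalizes it to the 0-1 range, mirroring the probability-weighted summation described in the G-Eval paper. A minimal standalone sketch of that calculation, using made-up `top_logprobs` entries rather than real model output:

```python
import math

# Hypothetical top_logprobs entries for the score token, mimicking the shape
# litellm returns when logprobs=True and top_logprobs=20 are requested.
top_logprobs = [
    {"token": "8", "logprob": -0.2},
    {"token": "7", "logprob": -1.9},
    {"token": "9", "logprob": -2.6},
    {"token": "ten", "logprob": -5.0},  # non-numeric tokens are skipped
]

linear_probs_sum = 0.0
weighted_score_sum = 0.0
for token_info in top_logprobs:
    if not token_info["token"].isdecimal():
        continue  # ignore tokens that are not plain integers
    score = int(token_info["token"])
    if not 0 <= score <= 10:
        continue  # ignore integers outside the 0-10 scale
    linear_prob = math.exp(token_info["logprob"])  # log-prob -> probability
    linear_probs_sum += linear_prob
    weighted_score_sum += linear_prob * score

# Expected score over the candidate tokens, normalized to [0, 1].
final_score = weighted_score_sum / linear_probs_sum / 10
print(round(final_score, 3))  # ~0.793 for the numbers above
```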
@@ -0,0 +1,36 @@
G_EVAL_COT_TEMPLATE = """
*** TASK:
Based on the following task description and evaluation criteria,
generate a detailed Chain of Thought (CoT) that outlines the necessary Evaluation Steps
to assess the solution. The CoT should clarify the reasoning process for each step of evaluation.
*** INPUT:
TASK INTRODUCTION:
{task_introduction}
EVALUATION CRITERIA:
{evaluation_criteria}
FINAL SCORE:
IF THE USER'S SCALE IS DIFFERENT FROM THE 0 TO 10 RANGE, RECALCULATE THE VALUE USING THIS SCALE.
SCORE VALUE MUST BE AN INTEGER.
"""


G_EVAL_QUERY_TEMPLATE = """
*** TASK INTRODUCTION:
{task_introduction}
*** EVALUATION CRITERIA:
{evaluation_criteria}
{chain_of_thought}
*** INPUT:
{input}
*** OUTPUT:
NO TEXT, ONLY SCORE
"""
@@ -25,7 +25,7 @@ class Hallucination(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to use for hallucination detection. If None, default examples will be used.
Example:
- >>> from comet_llm_eval.evaluation.metrics import Hallucination
+ >>> from opik.evaluation.metrics import Hallucination
>>> hallucination_metric = Hallucination()
>>> result = hallucination_metric.score(
... input="What is the capital of France?",
@@ -24,7 +24,7 @@ class Moderation(base_metric.BaseMetric):
few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used.
Example:
- >>> from comet_llm_eval.evaluation.metrics import Moderation
+ >>> from opik.evaluation.metrics import Moderation
>>> moderation_metric = Moderation()
>>> result = moderation_metric.score("Hello", "Hello, how can I help you?")
>>> print(result.value) # A float between 0.0 and 1.0
2 changes: 1 addition & 1 deletion sdks/python/src/opik/evaluation/models/models_factory.py
@@ -1,7 +1,7 @@
from typing import Optional, Any
from . import base_model, litellm_chat_model

- DEFAULT_GPT_MODEL_NAME = "gpt-3.5-turbo"
+ DEFAULT_GPT_MODEL_NAME = "gpt-4o"


def get(model_name: Optional[str], **model_kwargs: Any) -> base_model.OpikBaseModel:
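
The default model for LLM-judge metrics therefore moves from gpt-3.5-turbo to gpt-4o. Callers can still pass a model name explicitly; a sketch under the assumption that the chosen provider returns logprobs/top_logprobs, which GEval requires for its weighted scoring (the model string is only an example):

```python
from opik.evaluation.metrics import GEval

metric = GEval(
    task_introduction="Judge the coherence of the summary.",
    evaluation_criteria="The summary should be logically ordered and self-consistent.",
    model="gpt-3.5-turbo",  # example override of the new gpt-4o default
)
```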
2 changes: 2 additions & 0 deletions sdks/python/src/opik/logging_messages.py
@@ -36,6 +36,8 @@

CONTEXT_RECALL_SCORE_CALC_FAILED = "Failed to calculate context recall score"

GEVAL_SCORE_CALC_FAILED = "Failed to calculate g-eval score"

CONTEXT_PRECISION_SCORE_CALC_FAILED = "Failed to calculate context precision score"

NESTED_SPAN_PROJECT_NAME_MISMATCH_WARNING_MESSAGE = (
