From 18cb086caab6ab751098752c8ce3d04366c98c97 Mon Sep 17 00:00:00 2001 From: Alexander Barannikov <32936723+japdubengsub@users.noreply.github.com> Date: Mon, 28 Oct 2024 15:03:09 +0000 Subject: [PATCH] OPIK-224 [SDK] Create new LLM eval metric - G-eval (#402) * OPIK-224 [SDK] Create new LLM eval metric - G-eval * rename old project name in docstrings * extend base opik model class * add litellm model * extend base opik model class * update litellm model * migrate to litellm in models factory (now langchain is optional) * add support of model response format messages for langchain * calc g-eval with probabilities * post-merge fix * add lazy initialization fo LLM's chain of thought * use gpt-4o model by default --- .../src/opik/evaluation/metrics/__init__.py | 2 + .../evaluation/metrics/heuristics/contains.py | 2 +- .../evaluation/metrics/heuristics/equals.py | 2 +- .../evaluation/metrics/heuristics/is_json.py | 2 +- .../metrics/heuristics/levenshtein_ratio.py | 2 +- .../metrics/heuristics/regex_match.py | 2 +- .../llm_judges/answer_relevance/metric.py | 2 +- .../llm_judges/context_precision/metric.py | 2 +- .../llm_judges/context_recall/metric.py | 2 +- .../metrics/llm_judges/factuality/metric.py | 2 +- .../metrics/llm_judges/g_eval/__init__.py | 0 .../metrics/llm_judges/g_eval/metric.py | 139 ++++++++++++++++++ .../metrics/llm_judges/g_eval/template.py | 36 +++++ .../llm_judges/hallucination/metric.py | 2 +- .../metrics/llm_judges/moderation/metric.py | 2 +- .../opik/evaluation/models/models_factory.py | 2 +- sdks/python/src/opik/logging_messages.py | 2 + 17 files changed, 191 insertions(+), 12 deletions(-) create mode 100644 sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/__init__.py create mode 100644 sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py create mode 100644 sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/template.py diff --git a/sdks/python/src/opik/evaluation/metrics/__init__.py b/sdks/python/src/opik/evaluation/metrics/__init__.py index d98640f174..83aeff56ea 100644 --- a/sdks/python/src/opik/evaluation/metrics/__init__.py +++ b/sdks/python/src/opik/evaluation/metrics/__init__.py @@ -6,6 +6,7 @@ from .llm_judges.answer_relevance.metric import AnswerRelevance from .llm_judges.context_precision.metric import ContextPrecision from .llm_judges.context_recall.metric import ContextRecall +from .llm_judges.g_eval.metric import GEval from .llm_judges.hallucination.metric import Hallucination from .llm_judges.moderation.metric import Moderation from .base_metric import BaseMetric @@ -20,6 +21,7 @@ "ContextRecall", "Equals", # "Factuality", + "GEval", "Hallucination", "IsJson", "LevenshteinRatio", diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/contains.py b/sdks/python/src/opik/evaluation/metrics/heuristics/contains.py index 19ca93a57c..e427a0c711 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/contains.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/contains.py @@ -15,7 +15,7 @@ class Contains(base_metric.BaseMetric): name: The name of the metric. Defaults to "contains_metric". 
Example: - >>> from comet_llm_eval.evaluation.metrics import Contains + >>> from opik.evaluation.metrics import Contains >>> contains_metric = Contains(case_sensitive=True) >>> result = contains_metric.score("Hello, World!", "World") >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/equals.py b/sdks/python/src/opik/evaluation/metrics/heuristics/equals.py index 866b3c4ee0..bc94a2e51a 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/equals.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/equals.py @@ -15,7 +15,7 @@ class Equals(base_metric.BaseMetric): name: The name of the metric. Defaults to "equals_metric". Example: - >>> from comet_llm_eval.evaluation.metrics import Equals + >>> from opik.evaluation.metrics import Equals >>> equals_metric = Equals(case_sensitive=True) >>> result = equals_metric.score("Hello, World!", "Hello, World!") >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/is_json.py b/sdks/python/src/opik/evaluation/metrics/heuristics/is_json.py index f26ad64b98..d095fefbf4 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/is_json.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/is_json.py @@ -15,7 +15,7 @@ class IsJson(base_metric.BaseMetric): name: The name of the metric. Defaults to "is_json_metric". Example: - >>> from comet_llm_eval.evaluation.metrics import IsJson + >>> from opik.evaluation.metrics import IsJson >>> is_json_metric = IsJson() >>> result = is_json_metric.score('{"key": "value"}') >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/levenshtein_ratio.py b/sdks/python/src/opik/evaluation/metrics/heuristics/levenshtein_ratio.py index 9edee9fceb..bdd8a3f3aa 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/levenshtein_ratio.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/levenshtein_ratio.py @@ -22,7 +22,7 @@ class LevenshteinRatio(base_metric.BaseMetric): name: The name of the metric. Defaults to "levenshtein_ratio_metric". Example: - >>> from comet_llm_eval.evaluation.metrics import LevenshteinRatio + >>> from opik.evaluation.metrics import LevenshteinRatio >>> levenshtein_metric = LevenshteinRatio(case_sensitive=True) >>> result = levenshtein_metric.score("Hello, World!", "Hello, World") >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/heuristics/regex_match.py b/sdks/python/src/opik/evaluation/metrics/heuristics/regex_match.py index ab1e3277ee..94128e1339 100644 --- a/sdks/python/src/opik/evaluation/metrics/heuristics/regex_match.py +++ b/sdks/python/src/opik/evaluation/metrics/heuristics/regex_match.py @@ -16,7 +16,7 @@ class RegexMatch(base_metric.BaseMetric): name: The name of the metric. Defaults to "regex_match_metric". Example: - >>> from comet_llm_eval.evaluation.metrics import RegexMatch + >>> from opik.evaluation.metrics import RegexMatch >>> regex_metric = RegexMatch(r"\d{3}-\d{2}-\d{4}") >>> result = regex_metric.score("My SSN is 123-45-6789") >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py index d86e23dbd5..6a4588a538 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py @@ -25,7 +25,7 @@ class AnswerRelevance(base_metric.BaseMetric): name: The name of the metric. 
Defaults to "AnswerRelevanceMetric". Example: - >>> from comet_llm_eval.evaluation.metrics import AnswerRelevance + >>> from opik.evaluation.metrics import AnswerRelevance >>> answer_relevance_metric = AnswerRelevance() >>> result = answer_relevance_metric.score("What's the capital of France?", "The capital of France is Paris.", ["France is a country in Europe."]) >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py index edc42190b2..69893527bf 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_precision/metric.py @@ -26,7 +26,7 @@ class ContextPrecision(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples. Example: - >>> from comet_llm_eval.evaluation.metrics import ContextPrecision + >>> from opik.evaluation.metrics import ContextPrecision >>> context_precision_metric = ContextPrecision() >>> result = context_precision_metric.score("What's the capital of France?", "The capital of France is Paris.", "Paris", ["France is a country in Europe."]) >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py index 49fd29831f..b17feba2e5 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/context_recall/metric.py @@ -26,7 +26,7 @@ class ContextRecall(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to provide to the model. If None, uses the default few-shot examples. Example: - >>> from comet_llm_eval.evaluation.metrics import ContextRecall + >>> from opik.evaluation.metrics import ContextRecall >>> context_recall_metric = ContextRecall() >>> result = context_recall_metric.score("What's the capital of France?", "The capital of France is Paris.", "Paris", ["France is a country in Europe."]) >>> print(result.value) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py index 3112207015..32c22ee43d 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/factuality/metric.py @@ -27,7 +27,7 @@ class Factuality(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used. 
Example: - >>> from comet_llm_eval.evaluation.metrics import Factuality + >>> from opik.evaluation.metrics import Factuality >>> factuality_metric = Factuality() >>> result = factuality_metric.score("What's the capital of France?", "The capital of France is Paris.", ["France is a country in Europe."]) >>> print(result.value) # A float between 0.0 and 1.0 diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/__init__.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py new file mode 100644 index 0000000000..c78bcbb4c8 --- /dev/null +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/metric.py @@ -0,0 +1,139 @@ +import math +from functools import cached_property +from typing import Any, Optional, Union + +from litellm.types.utils import ModelResponse + +from opik.evaluation.metrics import base_metric, score_result +from opik.evaluation.models import base_model, models_factory +from opik.logging_messages import GEVAL_SCORE_CALC_FAILED +from .template import G_EVAL_COT_TEMPLATE, G_EVAL_QUERY_TEMPLATE +from ... import exceptions + + +class GEval(base_metric.BaseMetric): + def __init__( + self, + task_introduction: str, + evaluation_criteria: str, + model: Optional[Union[str, base_model.OpikBaseModel]] = None, + name: str = "g_eval_metric", + ): + super().__init__( + name=name, + ) + self._init_model(model) + + self.task_introduction = task_introduction + self.evaluation_criteria = evaluation_criteria + + @cached_property + def llm_chain_of_thought(self) -> str: + prompt = G_EVAL_COT_TEMPLATE.format( + task_introduction=self.task_introduction, + evaluation_criteria=self.evaluation_criteria, + ) + return self._model.generate_string(input=prompt) + + def _init_model( + self, model: Optional[Union[str, base_model.OpikBaseModel]] + ) -> None: + if isinstance(model, base_model.OpikBaseModel): + self._model = model + else: + self._model = models_factory.get( + model_name=model, + must_support_arguments=["logprobs", "top_logprobs"], + # we do not use additional params here as we need to get LLM's "Chain Of Thought" first + # logprobs=True, + # top_logprobs=20, + # response_format=GEvalScoreFormat, + ) + + def score( + self, + input: str, + **ignored_kwargs: Any, + ) -> score_result.ScoreResult: + llm_query = G_EVAL_QUERY_TEMPLATE.format( + task_introduction=self.task_introduction, + evaluation_criteria=self.evaluation_criteria, + chain_of_thought=self.llm_chain_of_thought, + input=input, + ) + + request = [ + { + "content": llm_query, + "role": "user", + }, + ] + + model_output = self._model.generate_provider_response( + messages=request, + logprobs=True, + top_logprobs=20, + ) + + return self._parse_model_output(model_output) + + async def ascore( + self, input: str, **ignored_kwargs: Any + ) -> score_result.ScoreResult: + llm_query = G_EVAL_QUERY_TEMPLATE.format( + task_introduction=self.task_introduction, + evaluation_criteria=self.evaluation_criteria, + chain_of_thought=self.llm_chain_of_thought, + input=input, + ) + + request = [ + { + "content": llm_query, + "role": "user", + }, + ] + + model_output = await self._model.agenerate_provider_response( + messages=request, + logprobs=True, + top_logprobs=20, + ) + + return self._parse_model_output(model_output) + + def _parse_model_output(self, content: ModelResponse) -> score_result.ScoreResult: + try: + 
# original_score = content.choices[0].model_extra['logprobs']['content'][0]['token'] + top_logprobs = content.choices[0].model_extra["logprobs"]["content"][0][ + "top_logprobs" + ] + + linear_probs_sum = 0.0 + weighted_score_sum = 0.0 + + for token_info in top_logprobs: + # if not a number + if not token_info["token"].isdecimal(): + continue + + score = int(token_info["token"]) + + # if score value not in scale + if not 0 <= score <= 10: + continue + + log_prob = token_info["logprob"] + linear_prob = math.exp(log_prob) + + linear_probs_sum += linear_prob + weighted_score_sum += linear_prob * score + + final_score: float = weighted_score_sum / linear_probs_sum / 10 + + if not (0.0 <= final_score <= 1.0): + raise ValueError + + return score_result.ScoreResult(name=self.name, value=final_score) + except Exception: + raise exceptions.MetricComputationError(GEVAL_SCORE_CALC_FAILED) diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/template.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/template.py new file mode 100644 index 0000000000..a67282c178 --- /dev/null +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/g_eval/template.py @@ -0,0 +1,36 @@ +G_EVAL_COT_TEMPLATE = """ +*** TASK: +Based on the following task description and evaluation criteria, +generate a detailed Chain of Thought (CoT) that outlines the necessary Evaluation Steps +to assess the solution. The CoT should clarify the reasoning process for each step of evaluation. + +*** INPUT: + +TASK INTRODUCTION: +{task_introduction} + +EVALUATION CRITERIA: +{evaluation_criteria} + +FINAL SCORE: +IF THE USER'S SCALE IS DIFFERENT FROM THE 0 TO 10 RANGE, RECALCULATE THE VALUE USING THIS SCALE. +SCORE VALUE MUST BE AN INTEGER. + +""" + + +G_EVAL_QUERY_TEMPLATE = """ +*** TASK INTRODUCTION: +{task_introduction} + +*** EVALUATION CRITERIA: +{evaluation_criteria} + +{chain_of_thought} + +*** INPUT: +{input} + +*** OUTPUT: +NO TEXT, ONLY SCORE +""" diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py index 71272bc4c1..1d61d919d1 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/hallucination/metric.py @@ -25,7 +25,7 @@ class Hallucination(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to use for hallucination detection. If None, default examples will be used. Example: - >>> from comet_llm_eval.evaluation.metrics import Hallucination + >>> from opik.evaluation.metrics import Hallucination >>> hallucination_metric = Hallucination() >>> result = hallucination_metric.score( ... input="What is the capital of France?", diff --git a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py index e4878009bb..7ea6e66d82 100644 --- a/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py +++ b/sdks/python/src/opik/evaluation/metrics/llm_judges/moderation/metric.py @@ -24,7 +24,7 @@ class Moderation(base_metric.BaseMetric): few_shot_examples: A list of few-shot examples to be used in the query. If None, default examples will be used. 
Example: - >>> from comet_llm_eval.evaluation.metrics import Moderation + >>> from opik.evaluation.metrics import Moderation >>> moderation_metric = Moderation() >>> result = moderation_metric.score("Hello", "Hello, how can I help you?") >>> print(result.value) # A float between 0.0 and 1.0 diff --git a/sdks/python/src/opik/evaluation/models/models_factory.py b/sdks/python/src/opik/evaluation/models/models_factory.py index f03b6711a3..06c787bc20 100644 --- a/sdks/python/src/opik/evaluation/models/models_factory.py +++ b/sdks/python/src/opik/evaluation/models/models_factory.py @@ -1,7 +1,7 @@ from typing import Optional, Any from . import base_model, litellm_chat_model -DEFAULT_GPT_MODEL_NAME = "gpt-3.5-turbo" +DEFAULT_GPT_MODEL_NAME = "gpt-4o" def get(model_name: Optional[str], **model_kwargs: Any) -> base_model.OpikBaseModel: diff --git a/sdks/python/src/opik/logging_messages.py b/sdks/python/src/opik/logging_messages.py index 974457d542..c7f6736200 100644 --- a/sdks/python/src/opik/logging_messages.py +++ b/sdks/python/src/opik/logging_messages.py @@ -36,6 +36,8 @@ CONTEXT_RECALL_SCORE_CALC_FAILED = "Failed to calculate context recall score" +GEVAL_SCORE_CALC_FAILED = "Failed to calculate g-eval score" + CONTEXT_PRECISION_SCORE_CALC_FAILED = "Failed to calculate context precision score" NESTED_SPAN_PROJECT_NAME_MISMATCH_WARNING_MESSAGE = (
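
Usage note (not part of the patch itself): a minimal sketch of how the new GEval metric added here can be called, mirroring the doctest-style examples used by the other metric docstrings. Only the import path, the constructor arguments, and the score() signature come from this patch; the task introduction, evaluation criteria, and input strings below are illustrative placeholders. If no model is passed, models_factory.get() resolves to the new default, gpt-4o, and the chain-of-thought prompt is generated lazily on first use via cached_property.

    >>> from opik.evaluation.metrics import GEval
    >>> g_eval_metric = GEval(
    ...     task_introduction="You are an expert judge. Decide whether OUTPUT is faithful to INPUT.",  # placeholder text
    ...     evaluation_criteria="OUTPUT must not contradict or add facts beyond INPUT.",  # placeholder text
    ... )
    >>> result = g_eval_metric.score(
    ...     input="INPUT: France is a country in Europe. OUTPUT: The capital of France is Paris."  # placeholder text
    ... )
    >>> print(result.value)  # A float between 0.0 and 1.0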
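
Scoring note: the "calc g-eval with probabilities" item in the commit message is implemented in _parse_model_output, which weights every integer score token on the 0 to 10 scale found in the top 20 log-probabilities by exp(logprob), normalizes by the captured probability mass, and divides by 10 to land in the 0.0 to 1.0 range. The following self-contained sketch reproduces that arithmetic with made-up token/logprob values, purely for illustration.

    import math

    # Hypothetical top_logprobs entries for the judge's first output token.
    top_logprobs = [
        {"token": "8", "logprob": -0.36},    # ~70% probability
        {"token": "9", "logprob": -1.61},    # ~20% probability
        {"token": "7", "logprob": -2.30},    # ~10% probability
        {"token": "The", "logprob": -4.61},  # non-numeric tokens are skipped
    ]

    linear_probs_sum = 0.0
    weighted_score_sum = 0.0
    for token_info in top_logprobs:
        if not token_info["token"].isdecimal():
            continue  # keep only integer score tokens
        score = int(token_info["token"])
        if not 0 <= score <= 10:
            continue  # keep only scores on the 0-10 scale
        linear_prob = math.exp(token_info["logprob"])
        linear_probs_sum += linear_prob
        weighted_score_sum += linear_prob * score

    final_score = weighted_score_sum / linear_probs_sum / 10
    print(round(final_score, 2))  # 0.81: a probability-weighted expectation, not a single sampled score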