diff --git a/prepare/metrics/llm_as_judge/direct/llama_3_3_70b_instruct_adherence_completeness.py b/prepare/metrics/llm_as_judge/direct/llama_3_3_70b_instruct_adherence_completeness.py
deleted file mode 100644
index d8e1a44cb4..0000000000
--- a/prepare/metrics/llm_as_judge/direct/llama_3_3_70b_instruct_adherence_completeness.py
+++ /dev/null
@@ -1,129 +0,0 @@
-from unitxt import add_to_catalog
-from unitxt.inference import CrossProviderInferenceEngine
-from unitxt.llm_as_judge import LLMJudgeDirect
-from unitxt.llm_as_judge_constants import (
-    CriteriaWithOptions,
-)
-
-option_map = {
-    "Excellent": 1.0,
-    "Good": 0.75,
-    "mediocre": 0.5,
-    "Bad": 0.25,
-    "Very Bad": 0,
-}
-
-# First, describe a judgement criteria
-adherence_criteria = CriteriaWithOptions.from_obj(
-    {
-        "name": "adherence_with_format",
-        "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
-        "options": [
-            {
-                "name": "Excellent",
-                "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations.",
-            },
-            {
-                "name": "Good",
-                "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability.",
-            },
-            {
-                "name": "mediocre",
-                "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present.",
-            },
-            {
-                "name": "Bad",
-                "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence.",
-            },
-            {
-                "name": "Very Bad",
-                "description": "The response fails to align with the requested structure, style, or format.",
-            },
-        ],
-        "option_map": option_map,
-    }
-)
-add_to_catalog(
-    adherence_criteria,
-    f"metrics.llm_as_judge.direct.criteria.{adherence_criteria.name}",
-    overwrite=True,
-)
-
-completeness_criteria = CriteriaWithOptions.from_obj(
-    {
-        "name": "answer_completeness",
-        "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The "
-        "response might use different phrasing or wording from the reference answer.",
-        "options": [
-            {
-                "name": "Excellent",
-                "description": "The response addresses all aspects of the reference answer.",
-            },
-            {
-                "name": "Good",
-                "description": "The response addresses most aspects of the reference answer, with minor omissions.",
-            },
-            {
-                "name": "mediocre",
-                "description": "The response covers the essential aspects of the reference answer but has notable omissions.",
-            },
-            {
-                "name": "Bad",
-                "description": "The response covers only a few aspects of the reference answer, with significant omissions.",
-            },
-            {
-                "name": "Very Bad",
-                "description": "The response fails to address the reference answer meaningfully, with most aspects omitted.",
-            },
-        ],
-        "option_map": option_map,
-    }
-)
-add_to_catalog(
-    completeness_criteria,
-    f"metrics.llm_as_judge.direct.criteria.{completeness_criteria.name}",
-    overwrite=True,
-)
-
-
-# now = define the judge metric using the criteria
-adherence_metric = LLMJudgeDirect(
-    inference_engine=CrossProviderInferenceEngine(  # or your favorite inference model
-        model="llama-3-3-70b-instruct", max_tokens=1024, temperature=0, provider="watsonx"
-    ),
-    criteria=adherence_criteria,
-    # the fields from the generation task to be presented to the judge. Those fields must be present
-    # in the generation task so they can be embedded here
-    context_fields={
-        "question": "question",
-        "instructions": "metadata/template/instruction",
-    },
-    criteria_field="criteria",
-    generate_summaries=False,
-    check_positional_bias=False,
-)
-add_to_catalog(
-    adherence_metric,
-    "metrics.rag.response_generation.adherence_with_format.llama_3_3_70b_instruct_judge",
-    overwrite=True,
-)
-
-# now = define the judge metric using the criteria
-completeness_metric = LLMJudgeDirect(
-    inference_engine=CrossProviderInferenceEngine(  # or your favorite inference model
-        model="llama-3-3-70b-instruct", max_tokens=1024, temperature=0
-    ),
-    criteria=completeness_criteria,
-    # the fields from the generation task to be presented to the judge. Those fields must be present
-    # in the generation task so they can be embedded here
-    context_fields={"question": "question", "reference_answers": "reference_answers"},
-    criteria_field="criteria",
-    generate_summaries=False,
-    check_positional_bias=False,
-)
-
-add_to_catalog(
-    completeness_metric,
-    "metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge",
-    overwrite=True,
-)
diff --git a/prepare/metrics/llm_as_judge/llm_as_judge.py b/prepare/metrics/llm_as_judge/llm_as_judge.py
index 9a2e3c02dd..9cffa4621b 100644
--- a/prepare/metrics/llm_as_judge/llm_as_judge.py
+++ b/prepare/metrics/llm_as_judge/llm_as_judge.py
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Optional, Union
 
 from unitxt import add_to_catalog, get_logger
 from unitxt.inference import CrossProviderInferenceEngine
@@ -8,6 +8,7 @@
     EVALUATOR_TO_MODEL_ID,
     EVALUATORS_METADATA,
     PAIRWISE_CRITERIA,
+    EvaluatorMetadata,
     EvaluatorNameEnum,
     EvaluatorTypeEnum,
     ModelProviderEnum,
@@ -16,17 +17,24 @@
 
 logger = get_logger()
 
-
 def get_evaluator(
     name: EvaluatorNameEnum,
     evaluator_type: EvaluatorTypeEnum,
-    provider: ModelProviderEnum,
+    provider: Optional[ModelProviderEnum] = None,
+    evaluator_params: Optional[dict] = None,
 ) -> Union[LLMJudgeDirect, LLMJudgePairwise]:
     evaluator_metadata = get_evaluator_metadata(name)
-    inference_params = {"max_tokens": 1024, "seed": 42, "temperature": 0, "provider": provider.value}
+    inference_params = {
+        "max_tokens": 1024,
+        "seed": 42,
+        "temperature": 0,
+    }
+    if provider is not None:
+        inference_params["provider"] = provider.value
+
     model_name = EVALUATOR_TO_MODEL_ID[name]
-    if provider == ModelProviderEnum.AZURE_OPENAI:
+    if provider is not None and provider == ModelProviderEnum.AZURE_OPENAI:
         inference_params["credentials"] = {}
         inference_params["credentials"]["api_base"] = (
             f"https://eteopenai.azure-api.net/openai/deployments/{model_name}/chat/completions?api-version=2024-08-01-preview"
@@ -42,6 +50,9 @@ def get_evaluator(
         "generate_summaries": False,
     }
 
+    if evaluator_params is not None:
+        params.update(evaluator_params)
+
     evaluator_klass = (
         LLMJudgeDirect
         if evaluator_type == EvaluatorTypeEnum.DIRECT
@@ -51,6 +62,28 @@ def get_evaluator(
     return evaluator_klass(**params)
 
 
+def get_evaluator_catalog_name(
+    evaluator_metadata: EvaluatorMetadata,
+    provider: ModelProviderEnum,
+    prefix: str = "",
+):
+    metric_name = (
+        evaluator_metadata.name.value.lower()
+        .replace("-", "_")
+        .replace(".", "_")
+        .replace(" ", "_")
+    )
+    provider_name = ""
+    # for backward compatibility, ideally we would use cross inference engines provider ids
+    if provider == ModelProviderEnum.AZURE_OPENAI:
+        provider_name = "azure_openai"
+    elif provider == ModelProviderEnum.OPENAI:
+        provider_name = "openai"
+    else:
+        provider_name = provider.value.lower()
+    return f"metrics.{prefix}.{provider_name}.{metric_name}"
+
+
 logger.debug("Registering criteria...")
 # Register all the predefined criterisa
 for criteria in DIRECT_CRITERIA:
@@ -67,36 +100,53 @@ def get_evaluator(
         overwrite=True,
     )
 
-logger.debug("Registering evaluators...")
+
+logger.debug("Registering generic judges (no criterion is set)...")
 for evaluator_metadata in EVALUATORS_METADATA:
     for provider in evaluator_metadata.providers:
         for evaluator_type in [
             EvaluatorTypeEnum.DIRECT,
             EvaluatorTypeEnum.PAIRWISE,
         ]:
-            evaluator = get_evaluator(
-                name=evaluator_metadata.name,
-                evaluator_type=evaluator_type,
-                provider=provider,
-            )
-
-            metric_name = (
-                evaluator_metadata.name.value.lower()
-                .replace("-", "_")
-                .replace(".", "_")
-                .replace(" ", "_")
-            )
-            provider_name = ""
-            # for backward compatibility, ideally we would use cross inference engines provider ids
-            if provider == ModelProviderEnum.AZURE_OPENAI:
-                provider_name = "azure_openai"
-            elif provider == ModelProviderEnum.OPENAI:
-                provider_name = "openai"
-            else:
-                provider_name = provider.value.lower()
-            add_to_catalog(
-                evaluator,
-                f"metrics.llm_as_judge.{evaluator_type.value}.{provider_name}.{metric_name}",
+                get_evaluator(
+                    name=evaluator_metadata.name,
+                    evaluator_type=evaluator_type,
+                    provider=provider,
+                ),
+                get_evaluator_catalog_name(evaluator_metadata, provider, f"llm_as_judge.{evaluator_type.value}"),
                 overwrite=True,
             )
+
+logger.debug("Registering judges with a specific criterion...")
+add_to_catalog(
+    get_evaluator(
+        name=EvaluatorNameEnum.LLAMA3_3_70B,
+        evaluator_type=EvaluatorTypeEnum.DIRECT,
+        # provider=ModelProviderEnum.WATSONX,
+        evaluator_params={
+            "criteria": "metrics.llm_as_judge.direct.criteria.adherence_with_format",
+            "context_fields": {
+                "question": "question",
+                "instructions": "metadata/template/instruction",
+            },
+        },
+    ),
+    "metrics.rag.response_generation.adherence_with_format.llama_3_3_70b_instruct_judge",
+    overwrite=True,
+)
+
+
+add_to_catalog(
+    get_evaluator(
+        name=EvaluatorNameEnum.LLAMA3_3_70B,
+        evaluator_type=EvaluatorTypeEnum.DIRECT,
+        # provider=ModelProviderEnum.WATSONX,
+        evaluator_params={
+            "criteria": "metrics.llm_as_judge.direct.criteria.answer_completeness",
+            "context_fields": {"question": "question", "reference_answers": "reference_answers"},
+        },
+    ),
+    "metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge",
+    overwrite=True,
+)
 
diff --git a/src/unitxt/catalog/metrics/rag/response_generation/adherence_with_format/llama_3_3_70b_instruct_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/adherence_with_format/llama_3_3_70b_instruct_judge.json
index 3c92fc074d..a29b6f4731 100644
--- a/src/unitxt/catalog/metrics/rag/response_generation/adherence_with_format/llama_3_3_70b_instruct_judge.json
+++ b/src/unitxt/catalog/metrics/rag/response_generation/adherence_with_format/llama_3_3_70b_instruct_judge.json
@@ -2,55 +2,16 @@
     "__type__": "llm_judge_direct",
     "inference_engine": {
         "__type__": "cross_provider_inference_engine",
-        "model": "llama-3-3-70b-instruct",
         "max_tokens": 1024,
+        "seed": 42,
         "temperature": 0,
-        "provider": "watsonx"
-    },
-    "criteria": {
-        "__type__": "criteria_with_options",
-        "name": "adherence_with_format",
-        "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
-        "options": [
-            {
-                "__type__": "criteria_option",
-                "name": "Excellent",
-                "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "Good",
-                "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "mediocre",
-                "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "Bad",
-                "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "Very Bad",
-                "description": "The response fails to align with the requested structure, style, or format."
-            }
-        ],
-        "option_map": {
-            "Excellent": 1.0,
-            "Good": 0.75,
-            "mediocre": 0.5,
-            "Bad": 0.25,
-            "Very Bad": 0
-        }
+        "model": "llama-3-3-70b-instruct"
     },
+    "evaluator_name": "LLAMA3_3_70B",
+    "generate_summaries": false,
+    "criteria": "metrics.llm_as_judge.direct.criteria.adherence_with_format",
     "context_fields": {
         "question": "question",
         "instructions": "metadata/template/instruction"
-    },
-    "criteria_field": "criteria",
-    "generate_summaries": false,
-    "check_positional_bias": false
+    }
 }
diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_completeness/llama_3_3_70b_instruct_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_completeness/llama_3_3_70b_instruct_judge.json
index 03498fb68b..e99ed6696a 100644
--- a/src/unitxt/catalog/metrics/rag/response_generation/answer_completeness/llama_3_3_70b_instruct_judge.json
+++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_completeness/llama_3_3_70b_instruct_judge.json
@@ -2,54 +2,16 @@
     "__type__": "llm_judge_direct",
     "inference_engine": {
         "__type__": "cross_provider_inference_engine",
-        "model": "llama-3-3-70b-instruct",
         "max_tokens": 1024,
-        "temperature": 0
-    },
-    "criteria": {
-        "__type__": "criteria_with_options",
-        "name": "answer_completeness",
-        "description": "The response is complete: all the aspects of the reference answer are addressed in the response. The response might use different phrasing or wording from the reference answer.",
-        "options": [
-            {
-                "__type__": "criteria_option",
-                "name": "Excellent",
-                "description": "The response addresses all aspects of the reference answer."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "Good",
-                "description": "The response addresses most aspects of the reference answer, with minor omissions."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "mediocre",
-                "description": "The response covers the essential aspects of the reference answer but has notable omissions."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "Bad",
-                "description": "The response covers only a few aspects of the reference answer, with significant omissions."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "Very Bad",
-                "description": "The response fails to address the reference answer meaningfully, with most aspects omitted."
-            }
-        ],
-        "option_map": {
-            "Excellent": 1.0,
-            "Good": 0.75,
-            "mediocre": 0.5,
-            "Bad": 0.25,
-            "Very Bad": 0
-        }
+        "seed": 42,
+        "temperature": 0,
+        "model": "llama-3-3-70b-instruct"
     },
+    "evaluator_name": "LLAMA3_3_70B",
+    "generate_summaries": false,
+    "criteria": "metrics.llm_as_judge.direct.criteria.answer_completeness",
     "context_fields": {
         "question": "question",
         "reference_answers": "reference_answers"
-    },
-    "criteria_field": "criteria",
-    "generate_summaries": false,
-    "check_positional_bias": false
+    }
 }
diff --git a/src/unitxt/llm_as_judge_constants.py b/src/unitxt/llm_as_judge_constants.py
index 0dfd70638e..cb72cbf7c8 100644
--- a/src/unitxt/llm_as_judge_constants.py
+++ b/src/unitxt/llm_as_judge_constants.py
@@ -953,6 +953,74 @@ class DirectCriteriaCatalogEnum(Enum):
         },
     )
 
+    ADHERENCE_WITH_FORMAT = CriteriaWithOptions(
+        "adherence_with_format",
+        "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
+        [
+            CriteriaOption(
+                "Excellent",
+                "The response perfectly aligns with the requested structure, style, or format, with no deviations.",
+            ),
+            CriteriaOption(
+                "Good",
+                "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability.",
+            ),
+            CriteriaOption(
+                "mediocre",
+                "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present.",
+            ),
+            CriteriaOption(
+                "Bad",
+                "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence.",
+            ),
+            CriteriaOption(
+                "Very Bad",
+                "The response fails to align with the requested structure, style, or format.",
+            ),
+        ],
+        {
+            "Excellent": 1.0,
+            "Good": 0.75,
+            "mediocre": 0.5,
+            "Bad": 0.25,
+            "Very Bad": 0,
+        },
+    )
+
+    ANSWER_COMPLETENESS = CriteriaWithOptions(
+        "answer_completeness",
+        "The response is complete: all the aspects of the reference answer are addressed in the response. The response might use different phrasing or wording from the reference answer.",
+        [
+            CriteriaOption(
+                "Excellent",
+                "The response addresses all aspects of the reference answer.",
+            ),
+            CriteriaOption(
+                "Good",
+                "The response addresses most aspects of the reference answer, with minor omissions.",
+            ),
+            CriteriaOption(
+                "mediocre",
+                "The response covers the essential aspects of the reference answer but has notable omissions.",
+            ),
+            CriteriaOption(
+                "Bad",
+                "The response covers only a few aspects of the reference answer, with significant omissions.",
+            ),
+            CriteriaOption(
+                "Very Bad",
+                "The response fails to address the reference answer meaningfully, with most aspects omitted.",
+            ),
+        ],
+        {
+            "Excellent": 1.0,
+            "Good": 0.75,
+            "mediocre": 0.5,
+            "Bad": 0.25,
+            "Very Bad": 0,
+        },
+    )
+
 
 DIRECT_CRITERIA = [c.value for c in DirectCriteriaCatalogEnum]
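For reference, the criteria this patch adds to DirectCriteriaCatalogEnum can still be used to assemble a criterion-specific judge by hand, the same way the deleted prepare script did. A minimal sketch follows; the provider and generation settings are illustrative assumptions rather than anything the patch requires:

from unitxt.inference import CrossProviderInferenceEngine
from unitxt.llm_as_judge import LLMJudgeDirect
from unitxt.llm_as_judge_constants import DirectCriteriaCatalogEnum

# Rebuild the answer-completeness judge outside the catalog, taking the
# criterion from the enum member introduced in this patch.
completeness_judge = LLMJudgeDirect(
    inference_engine=CrossProviderInferenceEngine(
        model="llama-3-3-70b-instruct",
        max_tokens=1024,
        temperature=0,
        provider="watsonx",  # assumed provider for illustration; any supported provider works
    ),
    criteria=DirectCriteriaCatalogEnum.ANSWER_COMPLETENESS.value,
    # fields of the generation task that are shown to the judge
    context_fields={"question": "question", "reference_answers": "reference_answers"},
    generate_summaries=False,
)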
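Likewise, a quick way to sanity-check the slimmed-down JSON artifacts above is to load them back by name. This sketch assumes the installed unitxt version exposes get_from_catalog alongside add_to_catalog in its top-level API, and that the string-valued criteria reference is resolved when the artifact is loaded:

from unitxt import get_from_catalog

# Load the serialized judge registered under the RAG response-generation metrics.
judge = get_from_catalog(
    "metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge"
)
print(type(judge).__name__)  # expected: LLMJudgeDirect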