From de868ab67cf1d67d293f8621f6f0ac6811462bef Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 12 Nov 2024 17:10:53 +0200 Subject: [PATCH 01/26] Add multi api inference engine Signed-off-by: elronbandel --- .../evaluate_benchmark_with_custom_api.py | 30 ++++++ prepare/engines/multi_api/llama3.py | 16 +++ .../engines/model/llama_3_8b_instruct.json | 12 +++ src/unitxt/inference.py | 100 +++++++++++++++--- src/unitxt/settings_utils.py | 2 + src/unitxt/standard.py | 12 ++- 6 files changed, 155 insertions(+), 17 deletions(-) create mode 100644 examples/evaluate_benchmark_with_custom_api.py create mode 100644 prepare/engines/multi_api/llama3.py create mode 100644 src/unitxt/catalog/engines/model/llama_3_8b_instruct.json diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_api.py new file mode 100644 index 0000000000..5db2e418d8 --- /dev/null +++ b/examples/evaluate_benchmark_with_custom_api.py @@ -0,0 +1,30 @@ +import unitxt +from unitxt import evaluate, get_from_catalog, load_dataset +from unitxt.text_utils import print_dict + +with unitxt.settings.context( + default_inference_api="watsonx", # option a to define your home api + default_format="formats.chat_api", + disable_hf_datasets_cache=False, +): + data = load_dataset("benchmarks.glue[max_samples_per_subset=5]", split="test") + + model = get_from_catalog( + "engines.model.llama_3_8b_instruct[api=watsonx]" + ) # option b to define your home api + + predictions = model.infer(data) + + evaluated_dataset = evaluate(predictions=predictions, data=data) + + print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "subset", + ], + ) + print_dict( + evaluated_dataset[0]["score"]["subsets"], + ) diff --git a/prepare/engines/multi_api/llama3.py b/prepare/engines/multi_api/llama3.py new file mode 100644 index 0000000000..8ebaa4adf2 --- /dev/null +++ b/prepare/engines/multi_api/llama3.py @@ -0,0 +1,16 @@ +from unitxt.catalog import add_to_catalog +from unitxt.inference import MultiAPIInferenceEngine + +engine = MultiAPIInferenceEngine( + model="llama-3-8b-instruct", + api_model_map={ + "watsonx": { + "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", + }, + "together-ai": { + "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" + }, + }, +) + +add_to_catalog(engine, "engines.model.llama_3_8b_instruct", overwrite=True) diff --git a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json new file mode 100644 index 0000000000..a6c2be46c0 --- /dev/null +++ b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json @@ -0,0 +1,12 @@ +{ + "__type__": "multi_api_inference_engine", + "model": "llama-3-8b-instruct", + "api_model_map": { + "watsonx": { + "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct" + }, + "together-ai": { + "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" + } + } +} diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 7604fb3410..9f80b220df 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1121,9 +1121,9 @@ def _infer( model, params = self._load_model_and_params() result = [] - for instance in dataset: + for source in dataset["source"]: instance_result = model.generate( - prompt=instance["source"], + prompt=source, params=self.to_dict([WMLInferenceEngineParamsMixin], keep_empty=False), ) prediction = instance_result["results"][0]["generated_text"] @@ -1364,9 +1364,7 @@ class 
LMMSEvalBaseInferenceEngine( batch_size: int = 1 image_token = "" - _requirements_list = { - "lmms_eval": "Install llms-eval package using 'pip install lmms-eval==0.2.4'", - } + _requirements_list = ["lmms-eval==0.2.4"] def prepare_engine(self): if not self.lazy_load: @@ -1413,6 +1411,7 @@ def _infer( dataset: Union[List[Dict[str, Any]], DatasetDict], return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: + self.verify_not_chat_api(dataset) if not self._is_loaded(): self._prepare_engine() @@ -1562,12 +1561,26 @@ async def acquire(self, tokens=1): await asyncio.sleep(time_until_next_token) -class LiteLLMInferenceEngine(InferenceEngine, PackageRequirementsMixin): +class StandardAPIParamsMixin(Artifact): model: str - max_tokens: int = 256 - seed: int = 1 - temperature: float = 0.0 - top_p: float = 1.0 + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + max_tokens: Optional[int] = None + seed: Optional[int] = None + stop: Union[Optional[str], List[str]] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + top_logprobs: Optional[int] = 20 + logit_bias: Optional[Dict[str, int]] = None + logprobs: Optional[bool] = True + n: Optional[int] = None + parallel_tool_calls: Optional[bool] = None + service_tier: Optional[Literal["auto", "default"]] = None + + +class LiteLLMInferenceEngine( + InferenceEngine, StandardAPIParamsMixin, PackageRequirementsMixin +): max_requests_per_second: float = 6 max_retries: int = 5 # Set to 0 to prevent internal retries @@ -1599,15 +1612,12 @@ async def _infer_instance( # Introduce a slight delay to prevent burstiness await asyncio.sleep(0.01) messages = self.to_messages(instance) + kwargs = self.to_dict([StandardAPIParamsMixin]) response = await self._completion( - model=self.model, messages=messages, - seed=self.seed, - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, max_retries=self.max_retries, caching=True, + **kwargs, ) usage = response.get("usage", {}) return TextGenerationInferenceOutput( @@ -1643,3 +1653,63 @@ def _infer( return responses return [response.prediction for response in responses] + + +_supported_apis = Literal["watsonx", "together-ai", "open-ai"] + + +class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): + """Inference engine capable of dynamically switching between multiple APIs. + + This class extends the InferenceEngine and OpenAiInferenceEngineParamsMixin + to enable seamless integration with various API providers. The supported APIs are + specified in `_supported_apis`, allowing users to interact with multiple models + from different sources. The `api_model_map` dictionary maps each API to + specific model identifiers, enabling automatic configuration based on + user requests. + + Attributes: + api: Optional; Specifies the current API in use. Must be one of the + literals in `_supported_apis`. + api_model_map: Dictionary mapping each supported API to a corresponding + model identifier string. This mapping allows consistent access to models + across different API backends. 
+ """ + + api: Optional[_supported_apis] = None + + api_model_map: Dict[_supported_apis, Dict[str, str]] = { + "watsonx": { + "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", + }, + "together-ai": { + "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" + }, + } + + _api_to_base_class = { + "watsonx": LiteLLMInferenceEngine, + "open-ai": LiteLLMInferenceEngine, + "together-ai": LiteLLMInferenceEngine, + } + + def get_api_name(self): + return self.api if self.api is not None else settings.default_inference_api + + def prepare_engine(self): + api = self.get_api_name() + cls = self.__class__._api_to_base_class[api] + args = self.to_dict([OpenAiInferenceEngineParamsMixin]) + args["model"] = self.api_model_map[api][self.model] + self.engine = cls(**args) + + def _infer( + self, + dataset: List[Dict[str, Any]] | DatasetDict, + return_meta_data: bool = False, + ) -> Union[List[str], List[TextGenerationInferenceOutput]]: + return self.engine._infer(dataset, return_meta_data) + + def get_engine_id(self): + api = self.get_api_name() + return get_model_and_label_id(self.api_model_map[api][self.model], api) diff --git a/src/unitxt/settings_utils.py b/src/unitxt/settings_utils.py index 6bde1718c6..47ec7faf7a 100644 --- a/src/unitxt/settings_utils.py +++ b/src/unitxt/settings_utils.py @@ -151,6 +151,8 @@ def __getattr__(self, key): settings.disable_hf_datasets_cache = (bool, True) settings.loader_cache_size = (int, 1) settings.task_data_as_text = (bool, True) + settings.default_inference_api = "watsonx" + settings.default_format = None if Constants.is_uninitilized(): constants = Constants() diff --git a/src/unitxt/standard.py b/src/unitxt/standard.py index b9989d82b0..ba2607e316 100644 --- a/src/unitxt/standard.py +++ b/src/unitxt/standard.py @@ -1,5 +1,6 @@ from typing import List, Optional, Union +from .artifact import fetch_artifact from .augmentors import ( Augmentor, FinalStateInputsAugmentor, @@ -16,7 +17,7 @@ from .recipe import Recipe from .schema import FinalizeDataset from .serializers import SingleTypeSerializer -from .settings_utils import get_constants +from .settings_utils import get_constants, get_settings from .splitters import ConstantSizeSample, RandomSizeSample, Sampler, SeparateSplit from .stream import MultiStream from .system_prompts import EmptySystemPrompt, SystemPrompt @@ -25,6 +26,7 @@ from .utils import LRUCache constants = get_constants() +settings = get_settings() logger = get_logger() @@ -39,7 +41,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator): task: Task = None template: Union[Template, List[Template], TemplatesList] = None system_prompt: SystemPrompt = Field(default_factory=EmptySystemPrompt) - format: Format = Field(default_factory=SystemFormat) + format: Format = None serializer: Union[SingleTypeSerializer, List[SingleTypeSerializer]] = None # Additional parameters @@ -263,6 +265,12 @@ def produce(self, task_instances): return list(multi_stream[constants.inference_stream]) def reset_pipeline(self): + if self.format is None: + if settings.default_format is not None: + self.format, _ = fetch_artifact(settings.default_format) + else: + self.format = SystemFormat() + if self.card and self.card.preprocess_steps is None: self.card.preprocess_steps = [] From c40d87dbb2eb98acb3d8c4262994abb75bfa8c96 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 12 Nov 2024 17:20:10 +0200 Subject: [PATCH 02/26] Fix Signed-off-by: elronbandel --- src/unitxt/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/unitxt/inference.py b/src/unitxt/inference.py index 9f80b220df..d9494482b3 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1705,7 +1705,7 @@ def prepare_engine(self): def _infer( self, - dataset: List[Dict[str, Any]] | DatasetDict, + dataset: Union[List[Dict[str, Any]], DatasetDict], return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: return self.engine._infer(dataset, return_meta_data) From d53eb69007ddfd8dcef9a7bef77f91a4acf1f784 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 12 Nov 2024 17:24:10 +0200 Subject: [PATCH 03/26] Set to greedy decoding Signed-off-by: elronbandel --- examples/evaluate_benchmark_with_custom_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_api.py index 5db2e418d8..fd232cea03 100644 --- a/examples/evaluate_benchmark_with_custom_api.py +++ b/examples/evaluate_benchmark_with_custom_api.py @@ -10,7 +10,7 @@ data = load_dataset("benchmarks.glue[max_samples_per_subset=5]", split="test") model = get_from_catalog( - "engines.model.llama_3_8b_instruct[api=watsonx]" + "engines.model.llama_3_8b_instruct[api=watsonx,top_k=1]" ) # option b to define your home api predictions = model.infer(data) From b36b7ab8f56fd871a6a6fae1e8685d740b810bfe Mon Sep 17 00:00:00 2001 From: elronbandel Date: Sun, 17 Nov 2024 21:40:50 +0200 Subject: [PATCH 04/26] Some fixes Signed-off-by: elronbandel --- .../evaluate_benchmark_with_custom_api.py | 45 ++++++------ src/unitxt/benchmark.py | 25 +++++-- src/unitxt/inference.py | 72 +++++++++++-------- src/unitxt/standard.py | 3 + tests/library/test_benchmark.py | 34 +++++++++ 5 files changed, 121 insertions(+), 58 deletions(-) diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_api.py index fd232cea03..12e2bbc57b 100644 --- a/examples/evaluate_benchmark_with_custom_api.py +++ b/examples/evaluate_benchmark_with_custom_api.py @@ -1,30 +1,27 @@ -import unitxt -from unitxt import evaluate, get_from_catalog, load_dataset +from unitxt import evaluate, load_dataset +from unitxt.inference import MultiAPIInferenceEngine from unitxt.text_utils import print_dict -with unitxt.settings.context( - default_inference_api="watsonx", # option a to define your home api - default_format="formats.chat_api", - disable_hf_datasets_cache=False, -): - data = load_dataset("benchmarks.glue[max_samples_per_subset=5]", split="test") +data = load_dataset( + "benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api]", + split="test", + disable_cache=False, +) - model = get_from_catalog( - "engines.model.llama_3_8b_instruct[api=watsonx,top_k=1]" - ) # option b to define your home api +model = MultiAPIInferenceEngine(model="llama-3-8b-instruct", top_k=1, api="watsonx") - predictions = model.infer(data) +predictions = model.infer(data) - evaluated_dataset = evaluate(predictions=predictions, data=data) +evaluated_dataset = evaluate(predictions=predictions, data=data) - print_dict( - evaluated_dataset[0], - keys_to_print=[ - "source", - "prediction", - "subset", - ], - ) - print_dict( - evaluated_dataset[0]["score"]["subsets"], - ) +print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "subset", + ], +) +print_dict( + evaluated_dataset[0]["score"]["subsets"], +) diff --git a/src/unitxt/benchmark.py b/src/unitxt/benchmark.py index 33f65d0115..7678dc175b 100644 --- a/src/unitxt/benchmark.py +++ 
b/src/unitxt/benchmark.py @@ -1,3 +1,4 @@ +from abc import abstractmethod from typing import Dict, Union from .dataclass import NonPositionalField @@ -15,6 +16,10 @@ class BaseBenchmark(SourceOperator): system_prompt: SystemPrompt = NonPositionalField(default=None) loader_limit: int = NonPositionalField(default=None) + @abstractmethod + def reset(self): + pass + class Benchmark(BaseBenchmark): subsets: Dict[str, Union[StandardRecipe, BaseBenchmark]] @@ -23,16 +28,20 @@ class Benchmark(BaseBenchmark): max_samples_per_subset: int = None def verify(self): + super().verify() if ( self.max_total_samples is not None and self.max_samples_per_subset is not None ): raise ValueError("Set either max_total_samples or max_samples_per_subset") - def prepare(self): - for subset in self.subsets.values(): - subset.loader_limit = self.loader_limit - if self.format is not None or self.num_demos is not None: + def reset(self): + if ( + self.format is not None + or self.num_demos is not None + or self.system_prompt is not None + or self.loader_limit is not None + ): for subset in self.subsets.values(): if self.num_demos is not None: subset.num_demos = self.num_demos @@ -42,7 +51,13 @@ def prepare(self): subset.system_prompt = self.system_prompt if self.loader_limit is not None: subset.loader_limit = self.loader_limit - subset.prepare() + + subset.reset() + + def prepare(self): + super().prepare() + + self.reset() def process( self, diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index b87116ca77..02085327e4 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -28,6 +28,24 @@ logger = get_logger() +class StandardAPIParamsMixin(Artifact): + model: str + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + max_tokens: Optional[int] = None + seed: Optional[int] = None + stop: Union[Optional[str], List[str]] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + top_logprobs: Optional[int] = 20 + logit_bias: Optional[Dict[str, int]] = None + logprobs: Optional[bool] = True + n: Optional[int] = None + parallel_tool_calls: Optional[bool] = None + service_tier: Optional[Literal["auto", "default"]] = None + + def get_model_and_label_id(model_name, label): model_id = model_name.split("/")[-1].replace("-", "_").replace(".", ",").lower() return f"{model_id}_{label}" @@ -372,16 +390,17 @@ def _infer( return self.engine._infer(dataset) -class OllamaInferenceEngine(InferenceEngine, PackageRequirementsMixin): +class OllamaInferenceEngine( + InferenceEngine, StandardAPIParamsMixin, PackageRequirementsMixin +): label: str = "ollama" - model_name: str _requirements_list = { "ollama": "Install ollama package using 'pip install --upgrade ollama" } data_classification_policy = ["public", "proprietary"] def get_engine_id(self): - return get_model_and_label_id(self.model_name, self.label) + return get_model_and_label_id(self.model, self.label) def prepare_engine(self): pass @@ -393,13 +412,16 @@ def _infer( ) -> Union[List[str], List[TextGenerationInferenceOutput]]: import ollama + args = self.to_dict([StandardAPIParamsMixin]) + results = [] for instance in dataset: messages = self.to_messages(instance) response = ollama.chat( - model=self.model_name, + model=self.model, messages=messages, + **args, ) results.append(response) @@ -1562,23 +1584,6 @@ async def acquire(self, tokens=1): await asyncio.sleep(time_until_next_token) -class StandardAPIParamsMixin(Artifact): - model: str - frequency_penalty: Optional[float] = 
None - presence_penalty: Optional[float] = None - max_tokens: Optional[int] = None - seed: Optional[int] = None - stop: Union[Optional[str], List[str]] = None - temperature: Optional[float] = None - top_p: Optional[float] = None - top_logprobs: Optional[int] = 20 - logit_bias: Optional[Dict[str, int]] = None - logprobs: Optional[bool] = True - n: Optional[int] = None - parallel_tool_calls: Optional[bool] = None - service_tier: Optional[Literal["auto", "default"]] = None - - class LiteLLMInferenceEngine( InferenceEngine, StandardAPIParamsMixin, PackageRequirementsMixin ): @@ -1616,7 +1621,6 @@ async def _infer_instance( kwargs = self.to_dict([StandardAPIParamsMixin]) try: response = await self._completion( - model=self.model, messages=messages, max_retries=self.max_retries, caching=True, @@ -1663,8 +1667,7 @@ def _infer( return [response.prediction for response in responses] - -_supported_apis = Literal["watsonx", "together-ai", "open-ai"] +_supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama"] class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): @@ -1690,9 +1693,19 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): api_model_map: Dict[_supported_apis, Dict[str, str]] = { "watsonx": { "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", + "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct", }, "together-ai": { - "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" + "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct", + "llama-3-70b-instruct": "together_ai/togethercomputer/llama-3-70b-instruct", + }, + "aws": { + "llama-3-8b-instruct": "bedrock/meta.llama3-8b-instruct-v1:0", + "llama-3-70b-instruct": "bedrock/meta.llama3-70b-instruct-v1:0", + }, + "ollama": { + "llama-3-8b-instruct": "llama3:8b", + "llama-3-70b-instruct": "llama3:70b", }, } @@ -1700,6 +1713,8 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "watsonx": LiteLLMInferenceEngine, "open-ai": LiteLLMInferenceEngine, "together-ai": LiteLLMInferenceEngine, + "aws": LiteLLMInferenceEngine, + "ollama": OllamaInferenceEngine, } def get_api_name(self): @@ -1708,10 +1723,10 @@ def get_api_name(self): def prepare_engine(self): api = self.get_api_name() cls = self.__class__._api_to_base_class[api] - args = self.to_dict([OpenAiInferenceEngineParamsMixin]) + args = self.to_dict([StandardAPIParamsMixin]) args["model"] = self.api_model_map[api][self.model] self.engine = cls(**args) - + def _infer( self, dataset: Union[List[Dict[str, Any]], DatasetDict], @@ -1723,6 +1738,7 @@ def get_engine_id(self): api = self.get_api_name() return get_model_and_label_id(self.api_model_map[api][self.model], api) + class HFOptionSelectingInferenceEngine(InferenceEngine): """HuggingFace based class for inference engines that calculate log probabilities. 
@@ -1797,13 +1813,11 @@ def get_log_probs(self, texts): return log_probs - def _infer( self, dataset: Union[List[Dict[str, Any]], DatasetDict], return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: - inputs = [] for instance in dataset: diff --git a/src/unitxt/standard.py b/src/unitxt/standard.py index 5a3c6017f2..9f0d8add62 100644 --- a/src/unitxt/standard.py +++ b/src/unitxt/standard.py @@ -264,6 +264,9 @@ def produce(self, task_instances): multi_stream = self.inference(multi_stream) return list(multi_stream[constants.inference_stream]) + def reset(self): + self.reset_pipeline() + def reset_pipeline(self): if self.format is None: if settings.default_format is not None: diff --git a/tests/library/test_benchmark.py b/tests/library/test_benchmark.py index a4efb82adf..21579d97eb 100644 --- a/tests/library/test_benchmark.py +++ b/tests/library/test_benchmark.py @@ -97,3 +97,37 @@ def test_benchmark(self): }, ], ) + + def test_benchmark_format_trickling(self): + benchmark = Benchmark( + format="formats.chat_api", + max_samples_per_subset=2, + loader_limit=30, + subsets={ + "cola": Benchmark( + format="formats.user_agent", + max_samples_per_subset=1, + loader_limit=300, + subsets={ + "cola": StandardRecipe( + card="cards.cola", + template="templates.classification.multi_class.instruction", + ), + "wnli": StandardRecipe( + card="cards.wnli", + format="formats.empty", + template="templates.classification.multi_class.relation.default", + ), + }, + ), + "wnli": StandardRecipe( + card="cards.wnli", + template="templates.classification.multi_class.relation.default", + ), + }, + ) + + test_dataset = list(benchmark()["test"]) + + for instance in test_dataset: + self.assertTrue(instance["source"].startswith('[{"role": ')) From 059378812898d1dd0329b07d0008c3b333d243c3 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 09:35:03 +0200 Subject: [PATCH 05/26] Fix consistency and preparation Signed-off-by: elronbandel --- prepare/engines/ollama/llama2.py | 2 +- src/unitxt/catalog/engines/ollama/llama2.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare/engines/ollama/llama2.py b/prepare/engines/ollama/llama2.py index e53999e2cf..b89e90099a 100644 --- a/prepare/engines/ollama/llama2.py +++ b/prepare/engines/ollama/llama2.py @@ -1,5 +1,5 @@ from unitxt.catalog import add_to_catalog from unitxt.inference import OllamaInferenceEngine -inference_model = OllamaInferenceEngine(model_name="llama2") +inference_model = OllamaInferenceEngine(model="llama2") add_to_catalog(inference_model, "engines.ollama.llama2", overwrite=True) diff --git a/src/unitxt/catalog/engines/ollama/llama2.json b/src/unitxt/catalog/engines/ollama/llama2.json index 3c5c39cc9a..9aec1ded53 100644 --- a/src/unitxt/catalog/engines/ollama/llama2.json +++ b/src/unitxt/catalog/engines/ollama/llama2.json @@ -1,4 +1,4 @@ { "__type__": "ollama_inference_engine", - "model_name": "llama2" + "model": "llama2" } From 28bafa2c301d97955bcc8f449751bff9f349ea0a Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 11:27:16 +0200 Subject: [PATCH 06/26] Update Signed-off-by: elronbandel --- examples/evaluate_benchmark_with_custom_api.py | 6 ++++-- ...evaluate_image_text_to_text_with_different_templates.py | 2 +- prepare/system_prompts/general/be_concise.py | 7 +++++++ pyproject.toml | 2 +- src/unitxt/catalog/system_prompts/general/be_concise.json | 4 ++++ src/unitxt/inference.py | 1 - 6 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 
prepare/system_prompts/general/be_concise.py create mode 100644 src/unitxt/catalog/system_prompts/general/be_concise.json diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_api.py index 12e2bbc57b..379c6f6050 100644 --- a/examples/evaluate_benchmark_with_custom_api.py +++ b/examples/evaluate_benchmark_with_custom_api.py @@ -3,12 +3,14 @@ from unitxt.text_utils import print_dict data = load_dataset( - "benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api]", + "benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api, system_prompt=system_prompts.general.be_concise]", split="test", disable_cache=False, ) -model = MultiAPIInferenceEngine(model="llama-3-8b-instruct", top_k=1, api="watsonx") +model = MultiAPIInferenceEngine( + model="llama-3-8b-instruct", temperature=0.0, top_p=1.0, api="watsonx" +) predictions = model.infer(data) diff --git a/examples/evaluate_image_text_to_text_with_different_templates.py b/examples/evaluate_image_text_to_text_with_different_templates.py index bf425b782c..6e2d132d4c 100644 --- a/examples/evaluate_image_text_to_text_with_different_templates.py +++ b/examples/evaluate_image_text_to_text_with_different_templates.py @@ -48,5 +48,5 @@ for subset in dataset.subsets: logger.info( - f"{subset.title()}: ", results[0]["score"]["subsets"][subset]["score"] + f'{subset.title()}: {results[0]["score"]["subsets"][subset]["score"]}' ) diff --git a/prepare/system_prompts/general/be_concise.py b/prepare/system_prompts/general/be_concise.py new file mode 100644 index 0000000000..6ac9068221 --- /dev/null +++ b/prepare/system_prompts/general/be_concise.py @@ -0,0 +1,7 @@ +from unitxt.catalog import add_to_catalog +from unitxt.system_prompts import TextualSystemPrompt + +system_prompt = TextualSystemPrompt( + "be concise. at every point give the shortest acceptable answer." +) +add_to_catalog(system_prompt, "system_prompts.general.be_concise", overwrite=True) diff --git a/pyproject.toml b/pyproject.toml index 4159c8c85e..1c575e03e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,7 +114,7 @@ watsonx = [ "ibm-watsonx-ai==1.1.14" ] inference-tests = [ - "litellm @ git+https://github.com/BerriAI/litellm.git@main", + "litellm==v1.52.9", "tenacity", "diskcache", "numpy==1.26.4" diff --git a/src/unitxt/catalog/system_prompts/general/be_concise.json b/src/unitxt/catalog/system_prompts/general/be_concise.json new file mode 100644 index 0000000000..01e54b8629 --- /dev/null +++ b/src/unitxt/catalog/system_prompts/general/be_concise.json @@ -0,0 +1,4 @@ +{ + "__type__": "textual_system_prompt", + "text": "be concise. at every point give the shortest acceptable answer." 
+} diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 02085327e4..079e32cd43 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -37,7 +37,6 @@ class StandardAPIParamsMixin(Artifact): stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None top_p: Optional[float] = None - top_k: Optional[int] = None top_logprobs: Optional[int] = 20 logit_bias: Optional[Dict[str, int]] = None logprobs: Optional[bool] = True From 3c861fbee0be3e0ce005b405c14458854deb23fd Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 13:58:06 +0200 Subject: [PATCH 07/26] Fix test Signed-off-by: elronbandel --- tests/inference/test_inference_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py index 12909913b4..a1f81495a8 100644 --- a/tests/inference/test_inference_engine.py +++ b/tests/inference/test_inference_engine.py @@ -79,7 +79,7 @@ def test_dataset_verification_inference_engine(self): f"data with classification '{inference_model.data_classification_policy}'. To " f"enable this either change the 'data_classification_policy' attribute of the " f"artifact, or modify the environment variable 'UNITXT_DATA_CLASSIFICATION_POLICY' " - f"accordingly.", + f"accordingly.\nFor more information: see https://www.unitxt.ai/en/latest//docs/data_classification_policy.html \n", ) def test_llava_inference_engine(self): From f9cd539c848284b27bb172c2ce055811aaaaa883 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 14:17:31 +0200 Subject: [PATCH 08/26] Make all args None Signed-off-by: elronbandel --- src/unitxt/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 079e32cd43..5acb50d01b 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -37,9 +37,9 @@ class StandardAPIParamsMixin(Artifact): stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None top_p: Optional[float] = None - top_logprobs: Optional[int] = 20 + top_logprobs: Optional[int] = None logit_bias: Optional[Dict[str, int]] = None - logprobs: Optional[bool] = True + logprobs: Optional[bool] = None n: Optional[int] = None parallel_tool_calls: Optional[bool] = None service_tier: Optional[Literal["auto", "default"]] = None From 4165c783aef6214a8ee5756d676bb2f4ed263293 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 14:51:14 +0200 Subject: [PATCH 09/26] Try Signed-off-by: elronbandel --- src/unitxt/operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unitxt/operators.py b/src/unitxt/operators.py index 8edf8c32b6..b761deecc4 100644 --- a/src/unitxt/operators.py +++ b/src/unitxt/operators.py @@ -400,7 +400,7 @@ def verify_field_definition(self): ), f"the from and to fields must be defined or implied from the other inputs got: {self._field_to_field}" assert ( len(self._field_to_field) > 0 - ), f"'input argument 'field_to_field' should convey at least one field to process. Got {self.field_to_field}" + ), f"'input argument '{self.__class__.__name__}.field_to_field' should convey at least one field to process. 
Got {self.field_to_field}" # self._field_to_field is built explicitly by pairs, or copied from argument 'field_to_field' if self.field_to_field is None: return From f202c3a7947d5a3033bd6c92e50bcfc3b9a8fdd7 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 15:10:44 +0200 Subject: [PATCH 10/26] Fix grammar Signed-off-by: elronbandel --- prepare/system_prompts/general/be_concise.py | 2 +- src/unitxt/catalog/system_prompts/general/be_concise.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare/system_prompts/general/be_concise.py b/prepare/system_prompts/general/be_concise.py index 6ac9068221..4b5d69aeda 100644 --- a/prepare/system_prompts/general/be_concise.py +++ b/prepare/system_prompts/general/be_concise.py @@ -2,6 +2,6 @@ from unitxt.system_prompts import TextualSystemPrompt system_prompt = TextualSystemPrompt( - "be concise. at every point give the shortest acceptable answer." + "Be concise. At every point give the shortest acceptable answer." ) add_to_catalog(system_prompt, "system_prompts.general.be_concise", overwrite=True) diff --git a/src/unitxt/catalog/system_prompts/general/be_concise.json b/src/unitxt/catalog/system_prompts/general/be_concise.json index 01e54b8629..48f28b5bb2 100644 --- a/src/unitxt/catalog/system_prompts/general/be_concise.json +++ b/src/unitxt/catalog/system_prompts/general/be_concise.json @@ -1,4 +1,4 @@ { "__type__": "textual_system_prompt", - "text": "be concise. at every point give the shortest acceptable answer." + "text": "Be concise. At every point give the shortest acceptable answer." } From bd8e176fda74a4c233c85a7cab0e63acbbb6bc6c Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 15:25:01 +0200 Subject: [PATCH 11/26] Fix Signed-off-by: elronbandel --- examples/evaluate_benchmark.py | 3 +-- .../evaluate_different_demo_selections.py | 9 ++++---- examples/evaluate_image_text_to_text.py | 10 ++++---- prepare/tasks/qa/multiple_choice/tasks.py | 4 ++++ .../tasks/qa/multiple_choice/open.json | 3 +++ .../qa/multiple_choice/with_context.json | 4 ++++ .../with_context/with_topic.json | 4 ++++ .../tasks/qa/multiple_choice/with_topic.json | 4 ++++ src/unitxt/standard.py | 23 +++++++++++-------- 9 files changed, 43 insertions(+), 21 deletions(-) diff --git a/examples/evaluate_benchmark.py b/examples/evaluate_benchmark.py index dca439f160..e92b7e309a 100644 --- a/examples/evaluate_benchmark.py +++ b/examples/evaluate_benchmark.py @@ -48,9 +48,8 @@ # Infere using flan t5 base using HF API -model_name = "google/flan-t5-base" inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 + model_name="google/flan-t5-base", max_new_tokens=32 ) predictions = inference_model.infer(test_dataset) diff --git a/examples/evaluate_different_demo_selections.py b/examples/evaluate_different_demo_selections.py index 61d7d68376..d494b89f43 100644 --- a/examples/evaluate_different_demo_selections.py +++ b/examples/evaluate_different_demo_selections.py @@ -32,14 +32,13 @@ num_demos=num_demos, demos_pool_size=50, loader_limit=200, - max_test_instances=100, + max_test_instances=10, sampler=demo_sampler, + split="test", ) - test_dataset = dataset["test"] - - predictions = inference_model.infer(test_dataset) - evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + predictions = inference_model.infer(dataset) + evaluated_dataset = evaluate(predictions=predictions, data=dataset) logger.info( f"Sample input and output for sampler {demo_sampler} and num_demos '{num_demos}':" diff 
--git a/examples/evaluate_image_text_to_text.py b/examples/evaluate_image_text_to_text.py index a4f0dc6c10..1edbe02e62 100644 --- a/examples/evaluate_image_text_to_text.py +++ b/examples/evaluate_image_text_to_text.py @@ -6,19 +6,19 @@ with settings.context( disable_hf_datasets_cache=False, ): - inference_model = HFLlavaInferenceEngine( - model_name="llava-hf/llava-interleave-qwen-0.5b-hf", max_new_tokens=32 - ) - dataset = load_dataset( card="cards.doc_vqa.lmms_eval", template="templates.qa.with_context.title", format="formats.chat_api", - loader_limit=300, + loader_limit=10, augmentor="augmentors.image.grey_scale", split="test", ) + inference_model = HFLlavaInferenceEngine( + model_name="llava-hf/llava-interleave-qwen-0.5b-hf", max_new_tokens=32 + ) + predictions = inference_model.infer(dataset) evaluated_dataset = evaluate(predictions=predictions, data=dataset) diff --git a/prepare/tasks/qa/multiple_choice/tasks.py b/prepare/tasks/qa/multiple_choice/tasks.py index cf8952e295..8fe83cad0b 100644 --- a/prepare/tasks/qa/multiple_choice/tasks.py +++ b/prepare/tasks/qa/multiple_choice/tasks.py @@ -14,6 +14,7 @@ }, reference_fields={"answer": Union[int, str], "choices": List[str]}, prediction_type=str, + augmentable_inputs=["context", "question"], metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.with_context", @@ -26,6 +27,7 @@ input_fields={"topic": str, "question": str, "choices": List[str]}, reference_fields={"answer": Union[int, str], "choices": List[str]}, prediction_type=str, + augmentable_inputs=["topic", "question"], metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.with_topic", @@ -37,6 +39,7 @@ input_fields={"question": str, "choices": List[str]}, reference_fields={"answer": Union[int, str], "choices": List[str]}, prediction_type=str, + augmentable_inputs=["question"], metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.open", @@ -54,6 +57,7 @@ }, reference_fields={"answer": Union[int, str], "choices": List[str]}, prediction_type=str, + augmentable_inputs=["context", "question"], metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.with_context.with_topic", diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/open.json b/src/unitxt/catalog/tasks/qa/multiple_choice/open.json index 1cd21924d0..a6422737f8 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/open.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/open.json @@ -9,6 +9,9 @@ "choices": "List[str]" }, "prediction_type": "str", + "augmentable_inputs": [ + "question" + ], "metrics": [ "metrics.accuracy" ] diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json index a223467784..be5de61a53 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json @@ -11,6 +11,10 @@ "choices": "List[str]" }, "prediction_type": "str", + "augmentable_inputs": [ + "context", + "question" + ], "metrics": [ "metrics.accuracy" ] diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json index 97895cc350..5bb4cbb295 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json @@ -12,6 +12,10 @@ "choices": "List[str]" }, "prediction_type": "str", + "augmentable_inputs": [ + "context", + "question" + ], "metrics": [ "metrics.accuracy" ] diff 
--git a/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json b/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json index 24e86e13ad..da7184663a 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json @@ -10,6 +10,10 @@ "choices": "List[str]" }, "prediction_type": "str", + "augmentable_inputs": [ + "topic", + "question" + ], "metrics": [ "metrics.accuracy" ] diff --git a/src/unitxt/standard.py b/src/unitxt/standard.py index a8dbe46bad..6982b2c7ca 100644 --- a/src/unitxt/standard.py +++ b/src/unitxt/standard.py @@ -3,12 +3,11 @@ from .artifact import fetch_artifact from .augmentors import ( Augmentor, - NullAugmentor, - TaskInputsAugmentor, ) from .card import TaskCard from .collections_operators import GetLength from .dataclass import Field, InternalField, NonPositionalField, OptionalField +from .error_utils import UnitxtError from .formats import Format, SystemFormat from .logging_utils import get_logger from .operator import SequentialOperator, SourceSequentialOperator, StreamingOperator @@ -69,9 +68,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator): demos_field: str = "demos" sampler: Sampler = None - augmentor: Union[Augmentor, List[Augmentor]] = OptionalField( - default_factory=NullAugmentor - ) + augmentor: Union[Augmentor, List[Augmentor]] = OptionalField(default=None) steps: List[StreamingOperator] = InternalField(default_factory=list) @@ -308,11 +305,19 @@ def reset_pipeline(self): self.processing.steps.append(self.task) - if not isinstance(self.augmentor, list): - self.augmentor = [self.augmentor] + if self.augmentor is not None: + if ( + self.card.task.augmentable_inputs is None + or len(self.task.augmentable_inputs) == 0 + ): + raise UnitxtError( + f"You specified augmentor in the recipe but the got task without augmentable_inputs: {self.task}" + ) + + if not isinstance(self.augmentor, list): + self.augmentor = [self.augmentor] - for augmentor in self.augmentor: - if isinstance(augmentor, TaskInputsAugmentor): + for augmentor in self.augmentor: augmentor.set_fields(self.card.task.augmentable_inputs) self.processing.steps.append(augmentor) From b686f95f07c080d4ed790e356e5a3f336cc794aa Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 15:54:05 +0200 Subject: [PATCH 12/26] Change api to provider Signed-off-by: elronbandel --- ...valuate_benchmark_with_custom_provider.py} | 6 ++--- prepare/engines/multi_api/llama3.py | 12 ++-------- .../engines/model/llama_3_8b_instruct.json | 12 ++-------- src/unitxt/inference.py | 24 +++++++++---------- src/unitxt/settings_utils.py | 2 +- 5 files changed, 20 insertions(+), 36 deletions(-) rename examples/{evaluate_benchmark_with_custom_api.py => evaluate_benchmark_with_custom_provider.py} (77%) diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_provider.py similarity index 77% rename from examples/evaluate_benchmark_with_custom_api.py rename to examples/evaluate_benchmark_with_custom_provider.py index 379c6f6050..371f97f517 100644 --- a/examples/evaluate_benchmark_with_custom_api.py +++ b/examples/evaluate_benchmark_with_custom_provider.py @@ -1,5 +1,5 @@ from unitxt import evaluate, load_dataset -from unitxt.inference import MultiAPIInferenceEngine +from unitxt.inference import CrossProviderModel from unitxt.text_utils import print_dict data = load_dataset( @@ -8,8 +8,8 @@ disable_cache=False, ) -model = MultiAPIInferenceEngine( - model="llama-3-8b-instruct", temperature=0.0, 
top_p=1.0, api="watsonx" +model = CrossProviderModel( + model="llama-3-8b-instruct", temperature=0.0, top_p=1.0, provider="watsonx" ) predictions = model.infer(data) diff --git a/prepare/engines/multi_api/llama3.py b/prepare/engines/multi_api/llama3.py index 8ebaa4adf2..8b3ee4494e 100644 --- a/prepare/engines/multi_api/llama3.py +++ b/prepare/engines/multi_api/llama3.py @@ -1,16 +1,8 @@ from unitxt.catalog import add_to_catalog -from unitxt.inference import MultiAPIInferenceEngine +from unitxt.inference import CrossProviderModel -engine = MultiAPIInferenceEngine( +engine = CrossProviderModel( model="llama-3-8b-instruct", - api_model_map={ - "watsonx": { - "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", - }, - "together-ai": { - "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" - }, - }, ) add_to_catalog(engine, "engines.model.llama_3_8b_instruct", overwrite=True) diff --git a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json index a6c2be46c0..ab9eee5364 100644 --- a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json +++ b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json @@ -1,12 +1,4 @@ { - "__type__": "multi_api_inference_engine", - "model": "llama-3-8b-instruct", - "api_model_map": { - "watsonx": { - "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct" - }, - "together-ai": { - "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" - } - } + "__type__": "cross_provider_model", + "model": "llama-3-8b-instruct" } diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 5acb50d01b..15a308c6f2 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1669,8 +1669,8 @@ def _infer( _supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama"] -class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): - """Inference engine capable of dynamically switching between multiple APIs. +class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): + """Inference engine capable of dynamically switching between multiple providers APIs. This class extends the InferenceEngine and OpenAiInferenceEngineParamsMixin to enable seamless integration with various API providers. The supported APIs are @@ -1687,9 +1687,9 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): across different API backends. 
""" - api: Optional[_supported_apis] = None + provider: Optional[_supported_apis] = None - api_model_map: Dict[_supported_apis, Dict[str, str]] = { + provider_model_map: Dict[_supported_apis, Dict[str, str]] = { "watsonx": { "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct", @@ -1708,7 +1708,7 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): }, } - _api_to_base_class = { + _provider_to_base_class = { "watsonx": LiteLLMInferenceEngine, "open-ai": LiteLLMInferenceEngine, "together-ai": LiteLLMInferenceEngine, @@ -1716,14 +1716,14 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "ollama": OllamaInferenceEngine, } - def get_api_name(self): - return self.api if self.api is not None else settings.default_inference_api + def get_provider_name(self): + return self.provider if self.provider is not None else settings.default_provider def prepare_engine(self): - api = self.get_api_name() - cls = self.__class__._api_to_base_class[api] + provider = self.get_provider_name() + cls = self.__class__._provider_to_base_class[provider] args = self.to_dict([StandardAPIParamsMixin]) - args["model"] = self.api_model_map[api][self.model] + args["model"] = self.provider_model_map[provider][self.model] self.engine = cls(**args) def _infer( @@ -1734,8 +1734,8 @@ def _infer( return self.engine._infer(dataset, return_meta_data) def get_engine_id(self): - api = self.get_api_name() - return get_model_and_label_id(self.api_model_map[api][self.model], api) + api = self.get_provider_name() + return get_model_and_label_id(self.provider_model_map[api][self.model], api) class HFOptionSelectingInferenceEngine(InferenceEngine): diff --git a/src/unitxt/settings_utils.py b/src/unitxt/settings_utils.py index 9a03cf81e0..a95cacfe30 100644 --- a/src/unitxt/settings_utils.py +++ b/src/unitxt/settings_utils.py @@ -152,7 +152,7 @@ def __getattr__(self, key): settings.disable_hf_datasets_cache = (bool, True) settings.loader_cache_size = (int, 1) settings.task_data_as_text = (bool, True) - settings.default_inference_api = "watsonx" + settings.default_provider = "watsonx" settings.default_format = None if Constants.is_uninitilized(): From 4c91d5ed3987b976912a05d17f473c5b3801d9ec Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 Nov 2024 16:03:12 +0200 Subject: [PATCH 13/26] Added support for param renaming. Added BAM and improved error messages. 
Signed-off-by: Yoav Katz --- src/unitxt/inference.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 15a308c6f2..852d5aa58c 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -18,6 +18,7 @@ from .artifact import Artifact, fetch_artifact from .dataclass import InternalField, NonPositionalField from .deprecation_utils import deprecation +from .error_utils import UnitxtError from .image_operators import data_url_to_image, extract_images from .logging_utils import get_logger from .operator import PackageRequirementsMixin @@ -1589,7 +1590,7 @@ class LiteLLMInferenceEngine( max_requests_per_second: float = 6 max_retries: int = 5 # Set to 0 to prevent internal retries - requirements: list = ["litellm", "tenacity", "tqdm", "diskcache"] + _requirements_list: list = ["litellm", "tenacity", "tqdm", "diskcache"] def prepare_engine(self): # Initialize the token bucket rate limiter @@ -1693,6 +1694,7 @@ class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): "watsonx": { "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct", + "granite-3-8b-instruct": "watsonx/ibm/granite-3-8b-instruct", }, "together-ai": { "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct", @@ -1706,6 +1708,10 @@ class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): "llama-3-8b-instruct": "llama3:8b", "llama-3-70b-instruct": "llama3:70b", }, + "bam": { + "granite-3-8b-instruct": "ibm/granite-8b-instruct-preview-4k", + "llama-3-8b-instruct": "meta-llama/llama-3-8b-instruct", + }, } _provider_to_base_class = { @@ -1714,6 +1720,11 @@ class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): "together-ai": LiteLLMInferenceEngine, "aws": LiteLLMInferenceEngine, "ollama": OllamaInferenceEngine, + "bam": IbmGenAiInferenceEngine, + } + + _provider_param_renaming = { + "bam": {"max_tokens": "max_new_tokens", "model": "model_name"} } def get_provider_name(self): @@ -1721,9 +1732,26 @@ def get_provider_name(self): def prepare_engine(self): provider = self.get_provider_name() + if provider not in self._provider_to_base_class: + raise UnitxtError( + f"{provider} a known API. Supported apis: {','.join(self.provider_model_map.keys())}" + ) + if self.model not in self.provider_model_map[api]: + raise UnitxtError( + f"{self.model} is not configured for provider {provider}. 
Supported models: {','.join(self.provider_model_map[api].keys())}" + ) cls = self.__class__._provider_to_base_class[provider] args = self.to_dict([StandardAPIParamsMixin]) - args["model"] = self.provider_model_map[provider][self.model] + args["model"] = self.provider_model_map[provider][self.model] + params = list(args.keys()) + if provider in self._provider_param_renaming: + for param in params: + if args[param] is not None: + if param in self._provider_param_renaming[provider]: + args[self._provider_param_renaming[provider][param]] = args[param] + del args[param] + else: + del args[param] self.engine = cls(**args) def _infer( From eaead52cd366dc5201b2670675074dfaf3655cb6 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 Nov 2024 16:23:40 +0200 Subject: [PATCH 14/26] Fix merge issues Signed-off-by: Yoav Katz --- src/unitxt/inference.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 852d5aa58c..5e41946f32 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1736,19 +1736,21 @@ def prepare_engine(self): raise UnitxtError( f"{provider} a known API. Supported apis: {','.join(self.provider_model_map.keys())}" ) - if self.model not in self.provider_model_map[api]: + if self.model not in self.provider_model_map[provider]: raise UnitxtError( - f"{self.model} is not configured for provider {provider}. Supported models: {','.join(self.provider_model_map[api].keys())}" + f"{self.model} is not configured for provider {provider}. Supported models: {','.join(self.provider_model_map[provider].keys())}" ) cls = self.__class__._provider_to_base_class[provider] args = self.to_dict([StandardAPIParamsMixin]) - args["model"] = self.provider_model_map[provider][self.model] + args["model"] = self.provider_model_map[provider][self.model] params = list(args.keys()) if provider in self._provider_param_renaming: for param in params: if args[param] is not None: if param in self._provider_param_renaming[provider]: - args[self._provider_param_renaming[provider][param]] = args[param] + args[self._provider_param_renaming[provider][param]] = args[ + param + ] del args[param] else: del args[param] From 4c5ba45aa65c27fab49f56e443553637ad321565 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 Nov 2024 16:25:35 +0200 Subject: [PATCH 15/26] Updated to CrossProviderModel Signed-off-by: Yoav Katz --- ...luate_batched_multiclass_classification.py | 184 ++++++++++-------- 1 file changed, 98 insertions(+), 86 deletions(-) diff --git a/examples/evaluate_batched_multiclass_classification.py b/examples/evaluate_batched_multiclass_classification.py index 8c3c4382d1..d803ebad51 100644 --- a/examples/evaluate_batched_multiclass_classification.py +++ b/examples/evaluate_batched_multiclass_classification.py @@ -28,7 +28,7 @@ class ParseEnumeratedList(FieldOperator): def process_value(self, text: Any) -> Any: result = [] for x in text.split("\n"): - line_result = re.findall(r"(\d+)\.\s*(\w+)", x) + line_result = re.findall(r"(\d+)\.\s*(.*)", x) if len(line_result) == 1: result.append(line_result[0]) return result @@ -63,96 +63,108 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str: template = InputOutputTemplate( input_format="Classify each of the texts to its corresponding {type_of_class} from one of these options:\n{classes}\nReturn for each index the correspond class in a separate line.\nTexts:\n{texts}", - # target_prefix="Answer:\n", + target_prefix="Answer:\n", output_format="{labels}", - 
postprocessors=[PostProcess(ParseEnumeratedList())], + postprocessors=["processors.lower_case", PostProcess(ParseEnumeratedList())], serializer=MultiTypeSerializer(serializers=[EnumeratedListSerializer()]), ) df = pd.DataFrame( - columns=["model", "batch_size", "num_instances", "f1_micro", "ci_low", "ci_high"] + columns=[ + "provider", + "model", + "batch_size", + "num_instances", + "f1_micro", + "ci_low", + "ci_high", + "hellucinations", + ] ) -for model_name in [ - "ibm/granite-3-8b-instruct", - "meta-llama/llama-3-8b-instruct", +for provider in [ + "watsonx", + "bam", ]: - if model_name.startswith("ibm"): - format = SystemFormat( - demo_format=( - "{instruction}\\N{source}\\N<|end_of_text|>\n" - "<|start_of_role|>assistant<|end_of_role|>{target}\\N<|end_of_text|>\n" - "<|start_of_role|>user<|end_of_role|>" - ), - model_input_format=( - "<|start_of_role|>system<|end_of_role|>{system_prompt}<|end_of_text|>\n" - "<|start_of_role|>user<|end_of_role|>{demos}{instruction}\\N{source}\\N<|end_of_text|>\n" - "<|start_of_role|>assistant<|end_of_role|>" - ), - ) - batch_sizes = [50, 30, 10, 1] - - if model_name.startswith("meta-llama"): - format = "formats.llama3_instruct" - batch_sizes = [100, 50, 10, 1] - - for batch_size in batch_sizes: - card, _ = fetch_artifact("cards.sst2") - card.preprocess_steps.extend( - [ - CollateInstances(batch_size=batch_size), - Rename(field_to_field={"text": "texts", "label": "labels"}), - Copy(field="text_type/0", to_field="text_type"), - Copy(field="classes/0", to_field="classes"), - Copy(field="type_of_class/0", to_field="type_of_class"), + for model_name in [ + "granite-3-8b-instruct", + "llama-3-8b-instruct", + ]: + batch_sizes = [30, 20, 10, 5, 1] + + for batch_size in batch_sizes: + card, _ = fetch_artifact("cards.banking77") + card.preprocess_steps.extend( + [ + CollateInstances(batch_size=batch_size), + Rename(field_to_field={"text": "texts", "label": "labels"}), + Copy(field="text_type/0", to_field="text_type"), + Copy(field="classes/0", to_field="classes"), + Copy(field="type_of_class/0", to_field="type_of_class"), + ] + ) + card.task = task + card.templates = [template] + format = "formats.chat_api" + if provider == "bam" and model_name.startswith("llama"): + format = "formats.llama3_instruct" + if provider == "bam" and model_name.startswith("granite"): + format = SystemFormat( + demo_format=( + "{instruction}\\N{source}\\N<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>{target}\\N<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>" + ), + model_input_format=( + "<|start_of_role|>system<|end_of_role|>{system_prompt}<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>{demos}{instruction}\\N{source}\\N<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>" + ), + ) + + dataset = load_dataset( + card=card, + template_card_index=0, + format=format, + num_demos=1, + demos_pool_size=5, + loader_limit=1000, + max_test_instances=200 / batch_size, + ) + + test_dataset = dataset["test"] + from unitxt.inference import CrossProviderModel + + inference_model = CrossProviderModel( + model=model_name, max_tokens=1024, provider=provider + ) + predictions = inference_model.infer(test_dataset) + + evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + # import pandas as pd + # result_df = pd.json_normalize(evaluated_dataset) + # result_df.to_csv(f"output.csv") + # Print results + print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "processed_prediction", + "processed_references", + ], 
+ ) + + global_scores = evaluated_dataset[0]["score"]["global"] + df.loc[len(df)] = [ + provider, + model_name, + batch_size, + global_scores["num_of_instances"], + global_scores["score"], + global_scores["score_ci_low"], + global_scores["score_ci_high"], + 1.0 - global_scores["in_classes_support"], ] - ) - card.task = task - card.templates = [template] - - dataset = load_dataset( - card=card, - template_card_index=0, - format=format, - num_demos=1, - demos_pool_size=5, - loader_limit=10000, - max_test_instances=1000 / batch_size, - ) - - test_dataset = dataset["test"] - - # inference_model = IbmGenAiInferenceEngine( - # model_name=model_name, max_new_tokens=1024 - # ) - - from unitxt.inference import WMLInferenceEngine - - inference_model = WMLInferenceEngine(model_name=model_name, max_new_tokens=1024) - - predictions = inference_model.infer(test_dataset) - - evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) - - # Print results - print_dict( - evaluated_dataset[0], - keys_to_print=[ - "source", - "prediction", - "processed_prediction", - "processed_references", - ], - ) - - global_scores = evaluated_dataset[0]["score"]["global"] - df.loc[len(df)] = [ - model_name, - batch_size, - global_scores["num_of_instances"], - global_scores["score"], - global_scores["score_ci_low"], - global_scores["score_ci_high"], - ] - - df = df.round(decimals=2) - logger.info(df.to_markdown()) + + df = df.round(decimals=2) + logger.info(df.to_markdown()) From 00dbd304ee38a691bec91aaaf888d504813d45cb Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 18:16:33 +0200 Subject: [PATCH 16/26] Update name back to InferenceEngine terminology Signed-off-by: elronbandel --- examples/evaluate_batched_multiclass_classification.py | 4 ++-- examples/evaluate_benchmark_with_custom_provider.py | 4 ++-- prepare/engines/multi_api/llama3.py | 4 ++-- src/unitxt/catalog/engines/model/llama_3_8b_instruct.json | 2 +- src/unitxt/inference.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/evaluate_batched_multiclass_classification.py b/examples/evaluate_batched_multiclass_classification.py index d803ebad51..9247965943 100644 --- a/examples/evaluate_batched_multiclass_classification.py +++ b/examples/evaluate_batched_multiclass_classification.py @@ -132,9 +132,9 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str: ) test_dataset = dataset["test"] - from unitxt.inference import CrossProviderModel + from unitxt.inference import CrossProviderInferenceEngine - inference_model = CrossProviderModel( + inference_model = CrossProviderInferenceEngine( model=model_name, max_tokens=1024, provider=provider ) predictions = inference_model.infer(test_dataset) diff --git a/examples/evaluate_benchmark_with_custom_provider.py b/examples/evaluate_benchmark_with_custom_provider.py index 371f97f517..6e3e45f577 100644 --- a/examples/evaluate_benchmark_with_custom_provider.py +++ b/examples/evaluate_benchmark_with_custom_provider.py @@ -1,5 +1,5 @@ from unitxt import evaluate, load_dataset -from unitxt.inference import CrossProviderModel +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict data = load_dataset( @@ -8,7 +8,7 @@ disable_cache=False, ) -model = CrossProviderModel( +model = CrossProviderInferenceEngine( model="llama-3-8b-instruct", temperature=0.0, top_p=1.0, provider="watsonx" ) diff --git a/prepare/engines/multi_api/llama3.py b/prepare/engines/multi_api/llama3.py index 8b3ee4494e..8a1272badf 100644 --- 
a/prepare/engines/multi_api/llama3.py +++ b/prepare/engines/multi_api/llama3.py @@ -1,7 +1,7 @@ from unitxt.catalog import add_to_catalog -from unitxt.inference import CrossProviderModel +from unitxt.inference import CrossProviderInferenceEngine -engine = CrossProviderModel( +engine = CrossProviderInferenceEngine( model="llama-3-8b-instruct", ) diff --git a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json index ab9eee5364..ac8e9eac36 100644 --- a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json +++ b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json @@ -1,4 +1,4 @@ { - "__type__": "cross_provider_model", + "__type__": "cross_provider_inference_engine", "model": "llama-3-8b-instruct" } diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 5e41946f32..e3e5f10d19 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1667,10 +1667,10 @@ def _infer( return [response.prediction for response in responses] -_supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama"] +_supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] -class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): +class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): """Inference engine capable of dynamically switching between multiple providers APIs. This class extends the InferenceEngine and OpenAiInferenceEngineParamsMixin From a0373f8f5392760486db917ef86924e85e3f4b96 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 10:48:41 +0200 Subject: [PATCH 17/26] Align all examples with chat api and cross provider engines Signed-off-by: elronbandel --- docs/docs/examples.rst | 9 +-- docs/docs/inference.rst | 0 ..._judge_model_capabilities_on_arena_hard.py | 30 ++++---- examples/evaluate_a_model_using_arena_hard.py | 32 ++++---- ...luate_batched_multiclass_classification.py | 7 ++ examples/evaluate_benchmark.py | 15 +++- ...evaluate_benchmark_with_custom_provider.py | 7 ++ examples/evaluate_bluebench.py | 14 +++- .../evaluate_different_demo_selections.py | 17 +++-- examples/evaluate_different_formats.py | 15 +++- examples/evaluate_different_templates.py | 25 ++++--- ...aluate_existing_dataset_by_llm_as_judge.py | 70 ++++++++++-------- .../evaluate_existing_dataset_with_install.py | 45 ++++------- examples/evaluate_image_text_to_text.py | 4 +- ..._image_text_to_text_lmms_eval_inference.py | 1 + examples/evaluate_rag_response_generation.py | 23 ++++-- ...uate_summarization_dataset_llm_as_judge.py | 74 ++++++++++--------- examples/evaluate_using_metrics_ensemble.py | 19 +++-- examples/qa_evaluation.py | 33 +++++---- examples/run_generic_inference_engine.py | 52 ------------- .../standalone_evaluation_llm_as_judge.py | 39 +++++----- examples/standalone_qa_evaluation.py | 42 ++++------- .../llama_3_arena_hard_template.py | 23 +++++- .../llama_3_ibm_genai_generic_template.py | 68 +++++++++++------ prepare/recipes/bluebench.py | 2 +- .../{watsonx => }/template_arena_hard.json | 4 +- .../template_arena_hard.json | 12 +++ .../generic_single_turn.json | 13 ++++ .../generic_single_turn_with_reference.json | 13 ++++ src/unitxt/inference.py | 40 ++++++---- tests/examples/test_examples.py | 14 ++-- tests/inference/test_inference_engine.py | 20 +++++ 32 files changed, 444 insertions(+), 338 deletions(-) create mode 100644 docs/docs/inference.rst delete mode 100644 examples/run_generic_inference_engine.py rename 
src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/{watsonx => }/template_arena_hard.json (75%) create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct/template_arena_hard.json create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn.json create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn_with_reference.json diff --git a/docs/docs/examples.rst b/docs/docs/examples.rst index b935ecff13..35267d34de 100644 --- a/docs/docs/examples.rst +++ b/docs/docs/examples.rst @@ -1,6 +1,6 @@ .. _examples: ============== -Examples +Examples ============== Here you will find complete coding samples showing how to perform different tasks using Unitxt. @@ -97,16 +97,16 @@ Related documentation: :ref:`Templates tutorial `, :ref:`Format Long Context +++++++++++++++++++++++++++++ -This example explores the effect of long context in classification. +This example explores the effect of long context in classification. It converts a standard multi class classification dataset (sst2 sentiment classification), where single sentence texts are classified one by one, to a dataset -where multiple sentences are classified using a single LLM call. +where multiple sentences are classified using a single LLM call. It compares the f1_micro in both approaches on two models. It uses serializers to verbalize and enumerated list of multiple sentences and labels. `Example code `_ -Related documentation: :ref:`Sst2 dataset card in catalog ` :ref:`Types and Serializers Guide `. +Related documentation: :ref:`Sst2 dataset card in catalog ` :ref:`Types and Serializers Guide `. Construct a benchmark of multiple datasets and obtain the final score +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -265,4 +265,3 @@ This example show how to define new data types as well as the way these data typ Related documentation: :ref:`Types and Serializers Guide `. - diff --git a/docs/docs/inference.rst b/docs/docs/inference.rst new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py b/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py index c8d91f9fa8..4b608596a2 100644 --- a/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py +++ b/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py @@ -1,32 +1,28 @@ from unitxt import evaluate, load_dataset -from unitxt.inference import MockInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict -model_id = "meta-llama/llama-3-70b-instruct" -model_format = "formats.llama3_instruct" - """ -We are evaluating only on a small subset (by using "select(range(4)), in order for the example to finish quickly. +We are evaluating only on a small subset (by using `max_test_instances=4`), in order for the example to finish quickly. The dataset full size if around 40k examples. You should use around 1k-4k in your evaluations. 
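For a fuller evaluation, the same call below only needs a larger cap. A minimal
sketch, where the value 1000 is illustrative rather than taken from this example:

    dataset = load_dataset(
        card="cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge",
        template="templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling",
        format="formats.chat_api",
        max_test_instances=1000,  # raise the cap for a more realistic run
        split="test",
    )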
""" dataset = load_dataset( card="cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge", template="templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", - format=model_format, -)["test"].select(range(4)) + format="formats.chat_api", + max_test_instances=4, + split="test", +) -inference_model = MockInferenceEngine(model_name=model_id) +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" +) """ -We are using a mock inference engine (and model) in order for the example to finish quickly. -In real scenarios you can use model from Huggingface, OpenAi, and IBM, using the following: -from unitxt.inference import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine) -and switch them with the MockInferenceEngine class in the example. -For the arguments these inference engines can receive, please refer to the classes documentation. +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. -Example of using an IBM model: -from unitxt.inference import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin) -params = IbmGenAiInferenceEngineParamsMixin(max_new_tokens=1024, random_seed=42) -inference_model = IbmGenAiInferenceEngine(model_name=model_id, parameters=params) +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. """ predictions = inference_model.infer(dataset) diff --git a/examples/evaluate_a_model_using_arena_hard.py b/examples/evaluate_a_model_using_arena_hard.py index ce42fc38f7..ad4410d29f 100644 --- a/examples/evaluate_a_model_using_arena_hard.py +++ b/examples/evaluate_a_model_using_arena_hard.py @@ -1,35 +1,31 @@ from unitxt import evaluate, load_dataset -from unitxt.inference import MockInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict -model_id = "meta-llama/llama-3-70b-instruct" -model_format = "formats.llama3_instruct" - """ -We are evaluating only on a small subset (by using "select(range(4)), in order for the example to finish quickly. +We are evaluating only on a small subset (by using `max_test_instances=4`), in order for the example to finish quickly. The dataset full size if around 40k examples. You should use around 1k-4k in your evaluations. """ dataset = load_dataset( card="cards.arena_hard.generation.english_gpt_4_0314_reference", template="templates.empty", - format=model_format, + format="formats.chat_api", metrics=[ - "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_8b_instruct_ibm_genai_template_arena_hard_with_shuffling" + "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_8b_instruct.template_arena_hard" ], -)["test"].select(range(4)) + max_test_instances=4, + split="test", +) -inference_model = MockInferenceEngine(model_name=model_id) +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" +) """ -We are using a mock inference engine (and model) in order for the example to finish quickly. -In real scenarios you can use model from Huggingface, OpenAi, and IBM, using the following: -from unitxt.inference import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine) -and switch them with the MockInferenceEngine class in the example. 
-For the arguments these inference engines can receive, please refer to the classes documentation. +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. -Example of using an IBM model: -from unitxt.inference import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin) -params = IbmGenAiInferenceEngineParamsMixin(max_new_tokens=1024, random_seed=42) -inference_model = IbmGenAiInferenceEngine(model_name=model_id, parameters=params) +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. """ predictions = inference_model.infer(dataset) diff --git a/examples/evaluate_batched_multiclass_classification.py b/examples/evaluate_batched_multiclass_classification.py index 9247965943..2a6540dc61 100644 --- a/examples/evaluate_batched_multiclass_classification.py +++ b/examples/evaluate_batched_multiclass_classification.py @@ -137,6 +137,13 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str: inference_model = CrossProviderInferenceEngine( model=model_name, max_tokens=1024, provider=provider ) + """ + We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: + watsonx, bam, openai, azure, aws and more. + + For the arguments these inference engines can receive, please refer to the classes documentation or read + about the the open ai api arguments the CrossProviderInferenceEngine follows. + """ predictions = inference_model.infer(test_dataset) evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) diff --git a/examples/evaluate_benchmark.py b/examples/evaluate_benchmark.py index e92b7e309a..fd09906177 100644 --- a/examples/evaluate_benchmark.py +++ b/examples/evaluate_benchmark.py @@ -1,7 +1,7 @@ from unitxt.api import evaluate from unitxt.benchmark import Benchmark from unitxt.inference import ( - HFPipelineBasedInferenceEngine, + CrossProviderInferenceEngine, ) from unitxt.standard import StandardRecipe from unitxt.text_utils import print_dict @@ -47,10 +47,17 @@ test_dataset = list(benchmark()["test"]) -# Infere using flan t5 base using HF API -inference_model = HFPipelineBasedInferenceEngine( - model_name="google/flan-t5-base", max_new_tokens=32 +# Infere using llama-3-2-1b base using Watsonx API +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" ) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. + +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. 
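A minimal sketch of passing such OpenAI-style arguments directly to the engine;
the decoding values below are illustrative and are not the settings used in this example:

    from unitxt.inference import CrossProviderInferenceEngine

    engine = CrossProviderInferenceEngine(
        model="llama-3-2-1b-instruct",
        provider="watsonx",
        temperature=0.0,
        top_p=1.0,
        max_tokens=256,
        seed=42,
    )
    predictions = engine.infer(dataset)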
+""" predictions = inference_model.infer(test_dataset) evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) diff --git a/examples/evaluate_benchmark_with_custom_provider.py b/examples/evaluate_benchmark_with_custom_provider.py index 6e3e45f577..d1b570b1c7 100644 --- a/examples/evaluate_benchmark_with_custom_provider.py +++ b/examples/evaluate_benchmark_with_custom_provider.py @@ -11,6 +11,13 @@ model = CrossProviderInferenceEngine( model="llama-3-8b-instruct", temperature=0.0, top_p=1.0, provider="watsonx" ) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. + +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. +""" predictions = model.infer(data) diff --git a/examples/evaluate_bluebench.py b/examples/evaluate_bluebench.py index 2268224c67..27efaaf9c8 100644 --- a/examples/evaluate_bluebench.py +++ b/examples/evaluate_bluebench.py @@ -1,6 +1,6 @@ from unitxt import evaluate, load_dataset, settings from unitxt.inference import ( - LiteLLMInferenceEngine, + CrossProviderInferenceEngine, ) from unitxt.text_utils import print_dict @@ -11,11 +11,17 @@ test_dataset = load_dataset("benchmarks.bluebench", split="test") # Infer -inference_model = LiteLLMInferenceEngine( - model="watsonx/meta-llama/llama-3-8b-instruct", +inference_model = CrossProviderInferenceEngine( + model="llama-3-8b-instruct", max_tokens=30, - max_requests_per_second=6, ) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. + +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. +""" predictions = inference_model.infer(test_dataset) evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) diff --git a/examples/evaluate_different_demo_selections.py b/examples/evaluate_different_demo_selections.py index d494b89f43..7c3c948e03 100644 --- a/examples/evaluate_different_demo_selections.py +++ b/examples/evaluate_different_demo_selections.py @@ -1,7 +1,7 @@ import pandas as pd from unitxt import get_logger from unitxt.api import evaluate, load_dataset -from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.splitters import CloseTextSampler, FixedIndicesSampler, RandomSampler from unitxt.text_utils import print_dict @@ -13,10 +13,16 @@ # CloseTextSampler - select the lexically closest amples from the demo pool for each test instance # FixedIndicesSampler - selec the same fixed set of demo examples for all instances -card = "cards.ledgar" -model_name = "google/flan-t5-xxl" -inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", max_tokens=32 +) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. 
+""" df = pd.DataFrame(columns=["num_demos", "sampler", "f1_micro", "ci_low", "ci_high"]) @@ -27,8 +33,9 @@ FixedIndicesSampler(indices=[0, 1]), ]: dataset = load_dataset( - card=card, + card="cards.ledgar", template="templates.classification.multi_class.title", + format="formats.chat_api", num_demos=num_demos, demos_pool_size=50, loader_limit=200, diff --git a/examples/evaluate_different_formats.py b/examples/evaluate_different_formats.py index f650e25c12..7605efdf7d 100644 --- a/examples/evaluate_different_formats.py +++ b/examples/evaluate_different_formats.py @@ -1,14 +1,23 @@ import pandas as pd from unitxt import get_logger from unitxt.api import evaluate, load_dataset -from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict logger = get_logger() -model_name = "meta-llama/llama-3-8b-instruct" -inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) +inference_model = CrossProviderInferenceEngine( + model="llama-3-8b-instruct", max_tokens=32, provider="bam" +) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. + +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. +""" + card = "cards.boolq.classification" template = "templates.classification.multi_class.relation.default" diff --git a/examples/evaluate_different_templates.py b/examples/evaluate_different_templates.py index 15a0d8415a..0ece537a51 100644 --- a/examples/evaluate_different_templates.py +++ b/examples/evaluate_different_templates.py @@ -4,7 +4,7 @@ import pandas as pd from unitxt import add_to_catalog, get_logger, register_local_catalog from unitxt.api import evaluate, load_dataset -from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.templates import InputOutputTemplate from unitxt.text_utils import print_dict @@ -58,10 +58,16 @@ def create_path_and_register_as_local_catalog(path): ) # Run inference on mnli (entailment task) on the two templates with both 0 and 3 shot in context learning. -card = "cards.mnli" -model_name = "google/flan-t5-xxl" -inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", max_tokens=32 +) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. 
+""" df = pd.DataFrame(columns=["template", "num_demos", "f1_micro", "ci_low", "ci_high"]) @@ -71,18 +77,19 @@ def create_path_and_register_as_local_catalog(path): ]: for num_demos in [0, 3]: dataset = load_dataset( - card=card, + card="cards.mnli", template=template, + format="formats.chat_api", num_demos=num_demos, demos_pool_size=100, loader_limit=500, - max_test_instances=300, + max_test_instances=10, + split="test", ) - test_dataset = dataset["test"] + predictions = inference_model.infer(dataset) - predictions = inference_model.infer(test_dataset) - evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + evaluated_dataset = evaluate(predictions=predictions, data=dataset) logger.info( f"Sample input and output for template '{template}' and num_demos '{num_demos}':" diff --git a/examples/evaluate_existing_dataset_by_llm_as_judge.py b/examples/evaluate_existing_dataset_by_llm_as_judge.py index 78e4d8dd24..e1295940fe 100644 --- a/examples/evaluate_existing_dataset_by_llm_as_judge.py +++ b/examples/evaluate_existing_dataset_by_llm_as_judge.py @@ -1,43 +1,51 @@ from unitxt import get_logger, get_settings, load_dataset from unitxt.api import evaluate from unitxt.inference import ( - HFPipelineBasedInferenceEngine, + CrossProviderInferenceEngine, ) from unitxt.text_utils import print_dict logger = get_logger() settings = get_settings() -settings.allow_unverified_code = True -# Use the HF load_dataset API, to load the squad QA dataset using the standard template in the catalog. -# We set loader_limit to 20 to reduce download time. -dataset = load_dataset( - card="cards.squad", - template="templates.qa.with_context.simple", - metrics=[ - "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn" - ], - loader_limit=20, -) -test_dataset = dataset["test"] +with settings.context(allow_unverified_code=True): + # Use the HF load_dataset API, to load the squad QA dataset using the standard template in the catalog. + # We set loader_limit to 20 to reduce download time. + dataset = load_dataset( + card="cards.squad", + template="templates.qa.with_context.simple", + format="formats.chat_api", + metrics=[ + "metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn" + ], + loader_limit=20, + max_test_instances=20, + split="test", + ) -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 -) -predictions = inference_model.infer(test_dataset) + # Infer a model to get predictions. + inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" + ) + """ + We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: + watsonx, bam, openai, azure, aws and more. -# Evaluate the predictions using the defined metric. -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + For the arguments these inference engines can receive, please refer to the classes documentation or read + about the the open ai api arguments the CrossProviderInferenceEngine follows. + """ + predictions = inference_model.infer(dataset) -print_dict( - evaluated_dataset[0], - keys_to_print=[ - "source", - "prediction", - "processed_prediction", - "references", - "score", - ], -) + # Evaluate the predictions using the defined metric. 
+ evaluated_dataset = evaluate(predictions=predictions, data=dataset) + + print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "processed_prediction", + "references", + "score", + ], + ) diff --git a/examples/evaluate_existing_dataset_with_install.py b/examples/evaluate_existing_dataset_with_install.py index 74389fdb23..5ee72ccae1 100644 --- a/examples/evaluate_existing_dataset_with_install.py +++ b/examples/evaluate_existing_dataset_with_install.py @@ -1,49 +1,34 @@ from unitxt.api import evaluate, load_dataset -from unitxt.inference import HFPipelineBasedInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict # Use the Unitxt APIs to load the wnli entailment dataset using the standard template in the catalog for relation task with 2-shot in-context learning. # We set loader_limit to 20 to limit reduce inference time. dataset = load_dataset( card="cards.wnli", + system_prompt="system_prompts.be_concise", template="templates.classification.multi_class.relation.default", + format="formats.chat_api", num_demos=2, demos_pool_size=10, loader_limit=20, + split="test", ) - -test_dataset = dataset["test"] - -# Infer using flan t5 base using HF API, can be replaced with any -# inference code. -# -# change to this to infer with IbmGenAI APIs: -# -# from unitxt.inference import IbmGenAiInferenceEngine -# inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or this to infer using WML APIs: -# -# from unitxt.inference import WMLInferenceEngine -# inference_model = WMLInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or to this to infer using OpenAI APIs: -# -# from unitxt.inference import OpenAiInferenceEngine -# inference_model = OpenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# Note that to run with OpenAI APIs you need to change the loader specification, to -# define that your data can be sent to a public API: -# # loader=LoadFromDictionary(data=data,data_classification_policy=["public"]), -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" ) -predictions = inference_model.infer(test_dataset) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. + +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. 
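The same pipeline can also score a different model from the provider catalog.
A sketch, assuming watsonx access is configured and that "granite-3-8b-instruct"
remains available for that provider:

    from unitxt.inference import CrossProviderInferenceEngine

    engine = CrossProviderInferenceEngine(
        model="granite-3-8b-instruct", provider="watsonx", max_tokens=32
    )
    predictions = engine.infer(dataset)
    results = evaluate(predictions=predictions, data=dataset)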
+""" +predictions = inference_model.infer(dataset) -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results print_dict( diff --git a/examples/evaluate_image_text_to_text.py b/examples/evaluate_image_text_to_text.py index 1edbe02e62..6d8a209f97 100644 --- a/examples/evaluate_image_text_to_text.py +++ b/examples/evaluate_image_text_to_text.py @@ -15,11 +15,11 @@ split="test", ) - inference_model = HFLlavaInferenceEngine( + engine = HFLlavaInferenceEngine( model_name="llava-hf/llava-interleave-qwen-0.5b-hf", max_new_tokens=32 ) - predictions = inference_model.infer(dataset) + predictions = engine.infer(dataset) evaluated_dataset = evaluate(predictions=predictions, data=dataset) print_dict( diff --git a/examples/evaluate_image_text_to_text_lmms_eval_inference.py b/examples/evaluate_image_text_to_text_lmms_eval_inference.py index 2d2ce4be49..458a6c4717 100644 --- a/examples/evaluate_image_text_to_text_lmms_eval_inference.py +++ b/examples/evaluate_image_text_to_text_lmms_eval_inference.py @@ -17,6 +17,7 @@ dataset = load_dataset( card="cards.seed_bench", template="templates.qa.multiple_choice.with_context.lmms_eval", + format="formats.chat_api", loader_limit=30, split="test", ) diff --git a/examples/evaluate_rag_response_generation.py b/examples/evaluate_rag_response_generation.py index 6b74e1cd1c..dd9e9cb496 100644 --- a/examples/evaluate_rag_response_generation.py +++ b/examples/evaluate_rag_response_generation.py @@ -60,18 +60,25 @@ ) # Verbalize the dataset using the template -dataset = load_dataset(card=card, template_card_index="simple") -test_dataset = dataset["test"] +dataset = load_dataset( + card=card, + template_card_index="simple", + format="formats.chat_api", + split="test", + max_test_instances=10, +) -# Infere using flan t5 base using HF API -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] -predictions = inference_model.infer(test_dataset) -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +predictions = engine.infer(dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/examples/evaluate_summarization_dataset_llm_as_judge.py b/examples/evaluate_summarization_dataset_llm_as_judge.py index ab2b6545c0..305966c733 100644 --- a/examples/evaluate_summarization_dataset_llm_as_judge.py +++ b/examples/evaluate_summarization_dataset_llm_as_judge.py @@ -1,6 +1,7 @@ from unitxt import get_logger from unitxt.api import evaluate, load_dataset from unitxt.inference import ( + CrossProviderInferenceEngine, HFPipelineBasedInferenceEngine, ) from unitxt.llm_as_judge import LLMAsJudge @@ -8,15 +9,18 @@ from unitxt.text_utils import print_dict logger = get_logger() + # First, we define the judge template. 
judge_summary_rating_template = InputOutputTemplate( - instruction="Please act as an impartial judge and evaluate if the assistant's summary summarise well the given text.\n" - 'You must respond according the following format: "[[rate]] - explanation".\n' - 'Were the rate is a score between 0 to 10 (10 for great summary, 0 for a very poor one)".\n' - "The explanation describe shortly why you decided to give the rank you chosen.\n" - "Please make sure to start with your rank ([[rank]]) before anything else.\n" - "For example: [[9]] The summary catches the main text ideas." - ".\n\n", + instruction=( + "Please act as an impartial judge and evaluate if the assistant's summary summarise well the given text.\n" + 'You must respond according the following format: "[[rate]] - explanation".\n' + 'Were the rate is a score between 0 to 10 (10 for great summary, 0 for a very poor one)".\n' + "The explanation describe shortly why you decided to give the rank you chosen.\n" + "Please make sure to start with your rank ([[rank]]) before anything else.\n" + "For example: [[9]] The summary catches the main text ideas." + ".\n\n" + ), input_format="[Text:\n{question}\n\n" "Assistant's summary:\n{answer}\n", output_format="[[{rating}]]", postprocessors=[ @@ -24,24 +28,19 @@ ], ) -# Second, we define the inference engine we use for judge, with the preferred model and platform. -platform = "hf" -model_name = "google/flan-t5-large" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=256, use_fp16=True +# Second, we define the inference engine we use for judge, with the preferred model and provider. +# You can change the provider to any of: "watsonx", "together-ai", "open-ai", "aws", "ollama", "bam" +inference_model = CrossProviderInferenceEngine( + model="llama-3-8b-instruct", provider="watsonx" ) -# change to this to infer with IbmGenAI APIs: -# -# platform = 'ibm_gen_ai' -# model_name = 'meta-llama/llama-3-70b-instruct' -# inference_model = IbmGenAiInferenceEngine(model_name="meta-llama/llama-3-70b-instruct", max_new_tokens=512) # Third, We define the metric as LLM as a judge, with the desired platform and model. llm_judge_metric = LLMAsJudge( inference_model=inference_model, template=judge_summary_rating_template, + format="formats.chat_api", task="rating.single_turn", - main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}", + main_score="llm_judge_llama_3_8b", strip_system_prompt_and_format_from_inputs=False, ) @@ -51,19 +50,21 @@ template="templates.summarization.abstractive.formal", metrics=[llm_judge_metric], loader_limit=5, + split="test", ) -test_dataset = dataset["test"] - -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -predictions = inference_model.infer(test_dataset) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] + +predictions = engine.infer(dataset) # Evaluate the predictions using the defined metric. 
-evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results print_dict( @@ -106,7 +107,7 @@ inference_model=inference_model, template=judge_summary_rating_with_reference_template, task="rating.single_turn_with_reference", - main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}", + main_score="llm_judge_llama_3_2_1b_hf", single_reference_per_prediction=True, strip_system_prompt_and_format_from_inputs=False, ) @@ -115,21 +116,24 @@ dataset = load_dataset( card="cards.xsum", template="templates.summarization.abstractive.formal", + format="formats.chat_api", metrics=[llm_judge_with_summary_metric], loader_limit=5, + split="test", ) -test_dataset = dataset["test"] - -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -predictions = inference_model.infer(test_dataset) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] + +predictions = engine.infer(dataset) # Evaluate the predictions using the defined metric. -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results print_dict( diff --git a/examples/evaluate_using_metrics_ensemble.py b/examples/evaluate_using_metrics_ensemble.py index ee99ec8de8..2422824f20 100644 --- a/examples/evaluate_using_metrics_ensemble.py +++ b/examples/evaluate_using_metrics_ensemble.py @@ -21,20 +21,25 @@ dataset = load_dataset( card="cards.squad", template="templates.qa.with_context.simple", + format="formats.chat_api", metrics=[ensemble_metric], loader_limit=20, + max_test_instances=10, + split="test", ) -test_dataset = dataset["test"] -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -predictions = inference_model.infer(test_dataset) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] + +predictions = engine.infer(dataset) # Evaluate the predictions using the defined metric. -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/examples/qa_evaluation.py b/examples/qa_evaluation.py index fb8bbfff78..4771036aa2 100644 --- a/examples/qa_evaluation.py +++ b/examples/qa_evaluation.py @@ -39,25 +39,26 @@ # What is the color of the sky? 
# Answer: # " -dataset = load_dataset(card=card, template="templates.qa.open.title") -test_dataset = dataset["test"] +dataset = load_dataset( + card=card, + template="templates.qa.open.title", + format="formats.chat_api", + split="test", + max_test_instances=5, +) -# Infer using flan t5 base using HF API -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -# change to this to infer with IbmGenAI APIs: -# -# inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or to this to infer using OpenAI APIs: -# -# inference_model = OpenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -predictions = inference_model.infer(test_dataset) -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] + + +predictions = engine.infer(dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/examples/run_generic_inference_engine.py b/examples/run_generic_inference_engine.py deleted file mode 100644 index b234e64677..0000000000 --- a/examples/run_generic_inference_engine.py +++ /dev/null @@ -1,52 +0,0 @@ -from unitxt import get_logger, produce # Import necessary functions from unitxt -from unitxt.inference import GenericInferenceEngine # Import the inference engine class - -if __name__ == "__main__": - # Create an instance of the GenericInferenceEngine with a default engine. - # This means if no engine is specified during inference, it will default to this one. - generic_engine_with_default = GenericInferenceEngine( - default="engines.ibm_gen_ai.llama_3_70b_instruct" - ) - - # Define the recipe for data processing and model selection. - # - card: Specifies the underlying data (from cards.almost_evil). - # - template: Selects the specific template within the card (from templates.qa.open.simple). - # - demos_pool_size and num_demos: Control the number of demonstration examples used (set to 0 here). - recipe = "card=cards.almost_evil,template=templates.qa.open.simple,demos_pool_size=0,num_demos=0" - - # Create a list of instances (data points) for inference. - # Each instance has a "question" and its corresponding "answers". - instances = [ - { - "question": "How many days there are in a week, answer only with numerals", - "answers": ["7"], - }, - { - "question": "If a ate an apple in the morning, and one in the evening, what is the number of apples I have eaten?, answer only with numerals", - "answers": ["2"], - }, - ] - - # Process the instances using the defined recipe. - # This likely formats the data according to the chosen card and template. - dataset = produce(instances, recipe) - - # Perform inference on the processed dataset using the engine with the default model. - predictions = generic_engine_with_default.infer(dataset) - get_logger().info(predictions) # Log the predictions - - # The following code block demonstrates how to use the GenericInferenceEngine without specifying a - # default engine. It expects the engine to be defined in the UNITXT_INFERENCE_ENGINE environment variable. 
- try: - # Attempt to create an instance without a default engine. - generic_engine_without_default = GenericInferenceEngine() - - # Perform inference (will use the engine specified in the environment variable). - predictions = generic_engine_without_default.infer(dataset) - get_logger().info(predictions) # Log the predictions - except: - # Handle the case where the environment variable is not set. - get_logger().error( - "GenericInferenceEngine could not be initialized without a default since " - "UNITXT_INFERENCE_ENGINE environmental variable is not set." - ) diff --git a/examples/standalone_evaluation_llm_as_judge.py b/examples/standalone_evaluation_llm_as_judge.py index 1561d4d296..18b91b9b7b 100644 --- a/examples/standalone_evaluation_llm_as_judge.py +++ b/examples/standalone_evaluation_llm_as_judge.py @@ -57,21 +57,22 @@ ) platform = "hf" -model_name = "google/flan-t5-large" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=256, use_fp16=True +model_name = "meta-llama/Llama-3.2-1B" + +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -# change to this to infer with IbmGenAI APIs: -# -# platform = 'ibm_gen_ai' -# model_name = 'meta-llama/llama-3-70b-instruct' -# inference_model = IbmGenAiInferenceEngine(model_name="meta-llama/llama-3-70b-instruct", max_new_tokens=32) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] # Third, We define the metric as LLM as a judge, with the desired platform and model. llm_judge_metric = LLMAsJudge( - inference_model=inference_model, + inference_model=engine, template=judge_correctness_template, + format="formats.chat_api", task="rating.single_turn", main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}", strip_system_prompt_and_format_from_inputs=False, @@ -98,18 +99,22 @@ ) # Convert card to a dataset -dataset = load_dataset(card=card, template_card_index="simple") -test_dataset = dataset["test"] +dataset = load_dataset( + card=card, + template_card_index="simple", + format="formats.chat_api", + split="test", + max_test_instances=10, +) -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -predictions = inference_model.infer(test_dataset) +predictions = engine.infer(dataset) # Evaluate the predictions using the defined metric. 
-evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/examples/standalone_qa_evaluation.py b/examples/standalone_qa_evaluation.py index 2daf6c96b9..cdbca8838b 100644 --- a/examples/standalone_qa_evaluation.py +++ b/examples/standalone_qa_evaluation.py @@ -39,38 +39,26 @@ postprocessors=["processors.lower_case"], ) # Verbalize the dataset using the template -dataset = load_dataset(card=card, template=template) -test_dataset = dataset["test"] +dataset = load_dataset( + card=card, + template=template, + format="formats.chat_api", + split="test", + max_test_instances=10, +) -# Infere using flan t5 base using HF API -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] -# change to this to infer with IbmGenAI APIs: -# -# from unitxt.inference import IbmGenAiInferenceEngine -# inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or this to infer using WML APIs: -# -# from unitxt.inference import WMLInferenceEngine -# inference_model = WMLInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or to this to infer using OpenAI APIs: -# -# from unitxt.inference import OpenAiInferenceEngine -# inference_model = OpenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# Note that to run with OpenAI APIs you need to change the loader specification, to -# define that your data can be sent to a public API: -# -# loader=LoadFromDictionary(data=data,data_classification_policy=["public"]), -predictions = inference_model.infer(test_dataset) -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +predictions = engine.infer(dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py b/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py index b1ed0ad900..e6e596efad 100644 --- a/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py +++ b/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py @@ -1,8 +1,8 @@ from unitxt import add_to_catalog from unitxt.inference import ( + CrossProviderInferenceEngine, GenericInferenceEngine, IbmGenAiInferenceEngine, - LiteLLMInferenceEngine, WMLInferenceEngine, ) from unitxt.llm_as_judge import LLMAsJudge @@ -64,8 +64,8 @@ add_to_catalog( LLMAsJudge( - inference_model=LiteLLMInferenceEngine( - model="watsonx/meta-llama/llama-3-70b-instruct", + inference_model=CrossProviderInferenceEngine( + model="llama-3-70b-instruct", max_tokens=30, ), template="templates.response_assessment.pairwise_comparative_rating.arena_hard", @@ -73,6 +73,21 @@ format="formats.chat_api", main_score="llama_3_70b_instruct_template_arena_hard", ), - "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.watsonx.template_arena_hard", + 
"metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.template_arena_hard", + overwrite=True, +) + +add_to_catalog( + LLMAsJudge( + inference_model=CrossProviderInferenceEngine( + model="llama-3-8b-instruct", + max_tokens=30, + ), + template="templates.response_assessment.pairwise_comparative_rating.arena_hard", + task="pairwise_comparative_rating.single_turn", + format="formats.chat_api", + main_score="llama_3_70b_instruct_template_arena_hard", + ), + "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_8b_instruct.template_arena_hard", overwrite=True, ) diff --git a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py index 931c17cac0..71d37d916e 100644 --- a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py +++ b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py @@ -1,49 +1,75 @@ from unitxt import add_to_catalog -from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine, IbmGenAiInferenceEngine from unitxt.llm_as_judge import LLMAsJudge from unitxt.random_utils import get_seed -model = "meta-llama/llama-3-70b-instruct" -format = "formats.llama3_instruct" -template = "templates.response_assessment.rating.generic_single_turn" - inference_model = IbmGenAiInferenceEngine( - model_name=model, max_new_tokens=252, random_seed=get_seed() + model_name="meta-llama/llama-3-70b-instruct", + max_new_tokens=252, + random_seed=get_seed(), ) -model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower() -model_label = f"{model_label}_ibm_genai" -template_label = template.split(".")[-1] -metric_label = f"{model_label}_template_{template_label}" + metric = LLMAsJudge( inference_model=inference_model, - template=template, + template="templates.response_assessment.rating.generic_single_turn", task="rating.single_turn", - format=format, - main_score=metric_label, + format="formats.llama3_instruct", + main_score="llama_3_70b_instruct_ibm_genai_template_generic_single_turn", prediction_type=str, ) add_to_catalog( metric, - f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}", + "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn", + overwrite=True, +) + +metric = LLMAsJudge( + inference_model=inference_model, + template="templates.response_assessment.rating.generic_single_turn_with_reference", + task="rating.single_turn_with_reference", + format="formats.llama3_instruct", + single_reference_per_prediction=True, + main_score="llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference", +) + +add_to_catalog( + metric, + "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference", overwrite=True, ) -template = "templates.response_assessment.rating.generic_single_turn_with_reference" -template_label = template.split(".")[-1] -metric_label = f"{model_label}_template_{template_label}" +inference_model = CrossProviderInferenceEngine( + model="llama-3-70b-instruct", max_tokens=252 +) + +metric = LLMAsJudge( + inference_model=inference_model, + template="templates.response_assessment.rating.generic_single_turn", + task="rating.single_turn", + format="formats.chat_api", + main_score="llama_3_70b_instruct_template_generic_single_turn", + prediction_type=str, +) + +add_to_catalog( + metric, + "metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn", + 
overwrite=True, +) + metric = LLMAsJudge( inference_model=inference_model, - template=template, + template="templates.response_assessment.rating.generic_single_turn_with_reference", task="rating.single_turn_with_reference", - format=format, + format="formats.chat_api", single_reference_per_prediction=True, - main_score=metric_label, + main_score="llama_3_70b_instruct_template_generic_single_turn_with_reference", ) add_to_catalog( metric, - f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}", + "metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn_with_reference", overwrite=True, ) diff --git a/prepare/recipes/bluebench.py b/prepare/recipes/bluebench.py index fe513f09b9..c51a31aa85 100644 --- a/prepare/recipes/bluebench.py +++ b/prepare/recipes/bluebench.py @@ -129,7 +129,7 @@ def prepare_recipe(default_args, specific_args): "num_demos": 0, "template": "templates.empty", "metrics": [ - "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.watsonx.template_arena_hard" + "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.template_arena_hard" ], } recipe = prepare_recipe(default_args, ingridients) diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/watsonx/template_arena_hard.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/template_arena_hard.json similarity index 75% rename from src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/watsonx/template_arena_hard.json rename to src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/template_arena_hard.json index 8e70b18cb0..e60b9b895d 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/watsonx/template_arena_hard.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/template_arena_hard.json @@ -1,8 +1,8 @@ { "__type__": "llm_as_judge", "inference_model": { - "__type__": "lite_llm_inference_engine", - "model": "watsonx/meta-llama/llama-3-70b-instruct", + "__type__": "cross_provider_inference_engine", + "model": "llama-3-70b-instruct", "max_tokens": 30 }, "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct/template_arena_hard.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct/template_arena_hard.json new file mode 100644 index 0000000000..beec420a7f --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct/template_arena_hard.json @@ -0,0 +1,12 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "cross_provider_inference_engine", + "model": "llama-3-8b-instruct", + "max_tokens": 30 + }, + "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "task": "pairwise_comparative_rating.single_turn", + "format": "formats.chat_api", + "main_score": "llama_3_70b_instruct_template_arena_hard" +} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn.json new file mode 100644 index 0000000000..ff1f9e2169 --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn.json @@ 
-0,0 +1,13 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "cross_provider_inference_engine", + "model": "llama-3-70b-instruct", + "max_tokens": 252 + }, + "template": "templates.response_assessment.rating.generic_single_turn", + "task": "rating.single_turn", + "format": "formats.chat_api", + "main_score": "llama_3_70b_instruct_template_generic_single_turn", + "prediction_type": "str" +} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn_with_reference.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn_with_reference.json new file mode 100644 index 0000000000..24b17f145f --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn_with_reference.json @@ -0,0 +1,13 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "cross_provider_inference_engine", + "model": "llama-3-70b-instruct", + "max_tokens": 252 + }, + "template": "templates.response_assessment.rating.generic_single_turn_with_reference", + "task": "rating.single_turn_with_reference", + "format": "formats.chat_api", + "single_reference_per_prediction": true, + "main_score": "llama_3_70b_instruct_template_generic_single_turn_with_reference" +} diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index e3e5f10d19..9a5d616714 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -221,6 +221,7 @@ class HFPipelineBasedInferenceEngine( model_name: str max_new_tokens: int use_fp16: bool = True + batch_size: int = 1 _requirements_list = { "transformers": "Install huggingface package using 'pip install --upgrade transformers" @@ -229,9 +230,20 @@ class HFPipelineBasedInferenceEngine( def get_engine_id(self): return get_model_and_label_id(self.model_name, "hf_pipeline") + def _get_task(self): + from transformers import AutoConfig + + return ( + "text2text-generation" + if AutoConfig.from_pretrained( + self.model_name, trust_remote_code=True + ).is_encoder_decoder + else "text-generation" + ) + def _prepare_pipeline(self): import torch - from transformers import AutoConfig, pipeline + from transformers import pipeline model_args: Dict[str, Any] = ( {"torch_dtype": torch.float16} if self.use_fp16 else {} @@ -254,13 +266,7 @@ def _prepare_pipeline(self): else: model_args.update({"device": device}) - task = ( - "text2text-generation" - if AutoConfig.from_pretrained( - self.model_name, trust_remote_code=True - ).is_encoder_decoder - else "text-generation" - ) + task = self._get_task() if task == "text-generation": model_args.update({"return_full_text": False}) @@ -281,13 +287,16 @@ def _infer( dataset: Union[List[Dict[str, Any]], DatasetDict], return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: - self.verify_not_chat_api(dataset) + if self._get_task() == "text2text-generation": + self.verify_not_chat_api(dataset) if not self._is_loaded(): self._prepare_pipeline() outputs = [] - for output in self.model([instance["source"] for instance in dataset]): + for output in self.model( + [instance["source"] for instance in dataset], batch_size=self.batch_size + ): if isinstance(output, list): output = output[0] outputs.append(output["generated_text"]) @@ -1649,7 +1658,7 @@ async def _infer_async( ] # Use tqdm_asyncio.gather to display progress bar return await tqdm_asyncio.gather( - *tasks, desc="LiteLLM Inference", total=len(tasks) + *tasks, desc=f"LiteLLM Inference ({self.model})", total=len(tasks) ) def 
_infer( @@ -1681,9 +1690,9 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): user requests. Attributes: - api: Optional; Specifies the current API in use. Must be one of the + provider: Optional; Specifies the current API in use. Must be one of the literals in `_supported_apis`. - api_model_map: Dictionary mapping each supported API to a corresponding + provider_model_map: Dictionary mapping each supported API to a corresponding model identifier string. This mapping allows consistent access to models across different API backends. """ @@ -1695,10 +1704,13 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct", "granite-3-8b-instruct": "watsonx/ibm/granite-3-8b-instruct", + "flan-t5-xxl": "watsonx/google/flan-t5-xxl", + "llama-3-2-1b-instruct": "watsonx/meta-llama/llama-3-2-1b-instruct", }, "together-ai": { "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct", "llama-3-70b-instruct": "together_ai/togethercomputer/llama-3-70b-instruct", + "llama-3-2-1b-instruct": "together_ai/togethercomputer/llama-3-2-1b-instruct", }, "aws": { "llama-3-8b-instruct": "bedrock/meta.llama3-8b-instruct-v1:0", @@ -1711,6 +1723,8 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "bam": { "granite-3-8b-instruct": "ibm/granite-8b-instruct-preview-4k", "llama-3-8b-instruct": "meta-llama/llama-3-8b-instruct", + "llama-3-2-1b-instruct": "meta-llama/llama-3-2-1b-instruct", + "flan-t5-xxl": "google/flan-t5-xxl", }, } diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py index eedf4f98be..49e21fffed 100644 --- a/tests/examples/test_examples.py +++ b/tests/examples/test_examples.py @@ -24,11 +24,11 @@ # "use_llm_as_judge_metric.py", # "standalone_evaluation_llm_as_judge.py", # "evaluate_summarization_dataset_llm_as_judge.py", - "evaluate_different_formats.py", - "evaluate_different_templates.py", - "evaluate_different_demo_selections.py", - "evaluate_a_judge_model_capabilities_on_arena_hard.py", - "evaluate_a_model_using_arena_hard.py", + # "evaluate_different_formats.py", + # "evaluate_different_templates.py", + # "evaluate_different_demo_selections.py", + # "evaluate_a_judge_model_capabilities_on_arena_hard.py", + # "evaluate_a_model_using_arena_hard.py", # "evaluate_llm_as_judge.py", "evaluate_using_metrics_ensemble.py", "evaluate_existing_dataset_no_install.py", @@ -43,8 +43,8 @@ # "robustness_testing_for_vision_text_models.py", "evaluate_bluebench.py", "custom_type.py", - "evaluate_different_templates_num_demos.py", - "evaluate_existing_dataset_with_install.py", + # "evaluate_different_templates_num_demos.py", + # "evaluate_existing_dataset_with_install.py", "evaluate_batched_multiclass_classification.py", ] diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py index a1f81495a8..9ad5113e4a 100644 --- a/tests/inference/test_inference_engine.py +++ b/tests/inference/test_inference_engine.py @@ -232,3 +232,23 @@ def test_option_selecting_inference_engine_chat_api(self): self.assertEqual(predictions[0], "hello friend") self.assertEqual(predictions[1], "white.") + + def test_hugginface_pipeline_inference_engine_chat_api(self): + dataset = [ + { + "source": [{"role": "user", "content": "hi you!"}], + }, + { + "source": [{"role": "user", "content": "black or white?"}], + }, + ] + + engine = HFPipelineBasedInferenceEngine( + 
model_name="Qwen/Qwen2.5-0.5B-Instruct", + batch_size=1, + max_new_tokens=1, + ) + predictions = engine.infer(dataset) + + self.assertEqual(predictions[0], "Hello") + self.assertEqual(predictions[1], "As") From 4fa6f8e9a5a0ea8c78bb5977d59ae4d7ebb577ac Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 10:59:17 +0200 Subject: [PATCH 18/26] Add vllm inference engine Signed-off-by: elronbandel --- src/unitxt/inference.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 9a5d616714..39c3d1e1d5 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1556,6 +1556,37 @@ def _infer( return optimal_responses +class VLLMInferenceEngine( + InferenceEngine, PackageRequirementsMixin, StandardAPIParamsMixin +): + def prepare_engine(self): + from vllm import LLM, SamplingParams + + args = self.to_dict([StandardAPIParamsMixin]) + self.sampling_params = SamplingParams(**args) + self.llm = LLM(model=self.model) + + def _infer( + self, + dataset: Union[List[Dict[str, Any]], DatasetDict], + return_meta_data: bool = False, + ) -> Union[List[str], List[TextGenerationInferenceOutput]]: + inputs = [] + for instance in dataset: + inputs.append(instance["source"]) + + if isinstance(inputs[0], list): + outputs = self.llm.chat(inputs, self.sampling_params) + else: + outputs = self.llm.generate(inputs, self.sampling_params) + + predictions = [] + for output in outputs: + predictions.append(output.outputs[0].text) + + return predictions + + class AsyncTokenBucket: def __init__(self, rate, capacity): self.rate = rate # Tokens added per second From 81150912c319371acb88549ec7c7f32718830f02 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 11:00:55 +0200 Subject: [PATCH 19/26] Fix blue bench to use cross provider engine Signed-off-by: elronbandel --- .../arena_hard_generation_english_gpt_4_0314_reference.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unitxt/catalog/recipes/bluebench/chatbot_abilities/arena_hard_generation_english_gpt_4_0314_reference.json b/src/unitxt/catalog/recipes/bluebench/chatbot_abilities/arena_hard_generation_english_gpt_4_0314_reference.json index 64871f5486..577d51e38d 100644 --- a/src/unitxt/catalog/recipes/bluebench/chatbot_abilities/arena_hard_generation_english_gpt_4_0314_reference.json +++ b/src/unitxt/catalog/recipes/bluebench/chatbot_abilities/arena_hard_generation_english_gpt_4_0314_reference.json @@ -9,7 +9,7 @@ "card": "cards.arena_hard.generation.english_gpt_4_0314_reference", "template": "templates.empty", "metrics": [ - "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.watsonx.template_arena_hard" + "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.template_arena_hard" ], "format": "formats.chat_api" } From 728fcc301d315b649abe30ac52a1aef98b460d48 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Tue, 19 Nov 2024 11:42:42 +0200 Subject: [PATCH 20/26] Added watsonx-sdk to MultiProviderInferenceEngine Add example to evaluate same datasets and models with multiple providers and formats Signed-off-by: Yoav Katz --- ...sets_and_models_with_multiple_providers.py | 101 ++++++++++++++++++ src/unitxt/inference.py | 13 ++- 2 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 examples/evaluate_same_datasets_and_models_with_multiple_providers.py diff --git a/examples/evaluate_same_datasets_and_models_with_multiple_providers.py 
b/examples/evaluate_same_datasets_and_models_with_multiple_providers.py new file mode 100644 index 0000000000..6f25b6e9a8 --- /dev/null +++ b/examples/evaluate_same_datasets_and_models_with_multiple_providers.py @@ -0,0 +1,101 @@ +import pandas as pd +from unitxt import get_logger +from unitxt.api import evaluate, load_dataset +from unitxt.artifact import fetch_artifact +from unitxt.formats import SystemFormat +from unitxt.text_utils import print_dict + +logger = get_logger() + +df = pd.DataFrame( + columns=[ + "provider", + "model", + "format_as_chat_api", + "num_instances", + "score_name", + "score", + "ci_low", + "ci_high", + ] +) + +for provider in [ + "watsonx-sdk", + "watsonx", +]: + for model_name in [ + "granite-3-8b-instruct", + "llama-3-8b-instruct", + ]: + for format_as_chat_api in [True, False]: + if format_as_chat_api and provider == "watsonx-sdk": + continue + if format_as_chat_api: + format = "formats.chat_api" + else: + if model_name.startswith("llama"): + format = "formats.llama3_instruct" + if model_name.startswith("granite"): + format = SystemFormat( + demo_format=( + "{instruction}\\N{source}\\N<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>{target}\\N<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>" + ), + model_input_format=( + "<|start_of_role|>system<|end_of_role|>{system_prompt}<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>{demos}{instruction}\\N{source}\\N<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>" + ), + ) + card, _ = fetch_artifact("cards.sst2") + + dataset = load_dataset( + card=card, + template_card_index=0, + format=format, + num_demos=1, + demos_pool_size=100, + loader_limit=1000, + max_test_instances=500, + disable_cache=False, + ) + + test_dataset = dataset["test"] + from unitxt.inference import CrossProviderInferenceEngine + + inference_model = CrossProviderInferenceEngine( + model=model_name, max_tokens=1024, provider=provider + ) + predictions = inference_model.infer(test_dataset) + + evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + # import pandas as pd + # result_df = pd.json_normalize(evaluated_dataset) + # result_df.to_csv(f"output.csv") + # Print results + print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "processed_prediction", + "processed_references", + ], + ) + + global_scores = evaluated_dataset[0]["score"]["global"] + df.loc[len(df)] = [ + provider, + model_name, + format_as_chat_api, + global_scores["num_of_instances"], + global_scores["score_name"], + global_scores["score"], + global_scores["score_ci_low"], + global_scores["score_ci_high"], + ] + + df = df.round(decimals=2) + logger.info(df.to_markdown()) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 1748f71f3d..f1f0b7c739 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1708,7 +1708,9 @@ def _infer( return [response.prediction for response in responses] -_supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] +_supported_apis = Literal[ + "watsonx", "together-ai", "open-ai", "aws", "ollama", "bam", "watsonx-sdk" +] class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): @@ -1739,6 +1741,11 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "flan-t5-xxl": "watsonx/google/flan-t5-xxl", "llama-3-2-1b-instruct": "watsonx/meta-llama/llama-3-2-1b-instruct", }, + "watsonx-sdk": { + "llama-3-8b-instruct": "meta-llama/llama-3-8b-instruct", + 
"llama-3-70b-instruct": "meta-llama/llama-3-70b-instruct", + "granite-3-8b-instruct": "ibm/granite-3-8b-instruct", + }, "together-ai": { "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct", "llama-3-70b-instruct": "together_ai/togethercomputer/llama-3-70b-instruct", @@ -1767,10 +1774,12 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "aws": LiteLLMInferenceEngine, "ollama": OllamaInferenceEngine, "bam": IbmGenAiInferenceEngine, + "watsonx-sdk": WMLInferenceEngine, } _provider_param_renaming = { - "bam": {"max_tokens": "max_new_tokens", "model": "model_name"} + "bam": {"max_tokens": "max_new_tokens", "model": "model_name"}, + "watsonx-sdk": {"max_tokens": "max_new_tokens", "model": "model_name"}, } def get_provider_name(self): From 9414f54e690d838a0c3c33e92fc226674d73648e Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 14:15:56 +0200 Subject: [PATCH 21/26] Make hf tests deterministic Signed-off-by: elronbandel --- src/unitxt/inference.py | 5 ++++- tests/inference/test_inference_engine.py | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index f1f0b7c739..793331d6cc 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -223,6 +223,7 @@ class HFPipelineBasedInferenceEngine( max_new_tokens: int use_fp16: bool = True batch_size: int = 1 + top_k: Optional[int] = None _requirements_list = { "transformers": "Install huggingface package using 'pip install --upgrade transformers" @@ -296,7 +297,9 @@ def _infer( outputs = [] for output in self.model( - [instance["source"] for instance in dataset], batch_size=self.batch_size + [instance["source"] for instance in dataset], + batch_size=self.batch_size, + top_k=self.top_k, ): if isinstance(output, list): output = output[0] diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py index 9ad5113e4a..c6f1eee1b9 100644 --- a/tests/inference/test_inference_engine.py +++ b/tests/inference/test_inference_engine.py @@ -234,6 +234,8 @@ def test_option_selecting_inference_engine_chat_api(self): self.assertEqual(predictions[1], "white.") def test_hugginface_pipeline_inference_engine_chat_api(self): + from transformers import set_seed + dataset = [ { "source": [{"role": "user", "content": "hi you!"}], @@ -243,10 +245,13 @@ def test_hugginface_pipeline_inference_engine_chat_api(self): }, ] + set_seed(0, deterministic=True) + engine = HFPipelineBasedInferenceEngine( model_name="Qwen/Qwen2.5-0.5B-Instruct", batch_size=1, max_new_tokens=1, + top_k=1, ) predictions = engine.infer(dataset) From 69388b5dcabbdbfe2d3c8f59220b895ededfeb68 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 14:59:25 +0200 Subject: [PATCH 22/26] Fix llmaj with chat api Signed-off-by: elronbandel --- .../standalone_evaluation_llm_as_judge.py | 4 +-- src/unitxt/llm_as_judge.py | 26 ++++++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/examples/standalone_evaluation_llm_as_judge.py b/examples/standalone_evaluation_llm_as_judge.py index 18b91b9b7b..b17b3330fa 100644 --- a/examples/standalone_evaluation_llm_as_judge.py +++ b/examples/standalone_evaluation_llm_as_judge.py @@ -61,7 +61,7 @@ # Infer using Llama-3.2-1B base using HF API engine = HFPipelineBasedInferenceEngine( - model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 + model_name="Qwen/Qwen1.5-0.5B-Chat", max_new_tokens=32 ) # Change to this to infer with external APIs: # 
CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") @@ -109,7 +109,7 @@ # Infer using Llama-3.2-1B base using HF API engine = HFPipelineBasedInferenceEngine( - model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 + model_name="Qwen/Qwen1.5-0.5B-Chat", max_new_tokens=32 ) predictions = engine.infer(dataset) diff --git a/src/unitxt/llm_as_judge.py b/src/unitxt/llm_as_judge.py index 225a284cc3..190659ba01 100644 --- a/src/unitxt/llm_as_judge.py +++ b/src/unitxt/llm_as_judge.py @@ -176,6 +176,26 @@ def _get_input_instances(self, task_data: List[Dict]) -> List: def _get_instance_for_judge_model( self, input_instances: List[str], predictions: List, references: List ) -> List[Dict]: + string_input_instances = [] + + for input_instance in input_instances: + if isinstance(input_instance, str): + string_input_instances.append(input_instance) + if isinstance(input_instance, list): # chat api + if len(input_instance) == 1: # only user + string_input_instances.append(input_instance[0]["content"]) + if len(input_instance) == 2: # only system and user + string_input_instances.append( + input_instance[0]["content"] + + "\n" + + input_instance[1]["content"] + ) + else: # num demos > 0 + turns = [] + for turn in input_instance: + turns.append(f'{turn["role"]}: {turn["content"]}') + string_input_instances.append("\n".join(turns)) + if self.task == "rating.single_turn": instances = [ { @@ -183,7 +203,7 @@ def _get_instance_for_judge_model( "answer": prediction, } for input_instance, prediction, reference in zip( - input_instances, predictions, references + string_input_instances, predictions, references ) ] elif self.task == "rating.single_turn_with_reference": @@ -194,7 +214,7 @@ def _get_instance_for_judge_model( "reference_answer": reference[0], } for input_instance, prediction, reference in zip( - input_instances, predictions, references + string_input_instances, predictions, references ) ] elif self.task == "pairwise_comparative_rating.single_turn": @@ -207,7 +227,7 @@ def _get_instance_for_judge_model( "model_b": "baseline_model", } for input_instance, prediction, reference in zip( - input_instances, predictions, references + string_input_instances, predictions, references ) ] else: From e921b01401a45ba06a91fc9b1744eafe2778c26e Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 15:48:30 +0200 Subject: [PATCH 23/26] Add inference documentation Signed-off-by: elronbandel --- docs/docs/examples.rst | 32 +++++------ docs/docs/inference.rst | 114 ++++++++++++++++++++++++++++++++++++++++ docs/docs/tutorials.rst | 1 + 3 files changed, 131 insertions(+), 16 deletions(-) diff --git a/docs/docs/examples.rst b/docs/docs/examples.rst index 35267d34de..f562b807f2 100644 --- a/docs/docs/examples.rst +++ b/docs/docs/examples.rst @@ -18,7 +18,7 @@ This example demonstrates how to evaluate an existing entailment dataset (wnli) `Example code `_ -Related documentation: :ref:`Evaluating datasets `, :ref:`WNLI dataset card in catalog `, :ref:`Relation template in catalog `. +Related documentation: :ref:`Evaluating datasets `, :ref:`WNLI dataset card in catalog `, :ref:`Relation template in catalog `, :ref:`Inference Engines `. Evaluate an existing dataset from the Unitxt catalog (with Unitxt installation) +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -28,7 +28,7 @@ This approach is faster than using Huggingface APIs. 
`Example code `_ -Related documentation: :ref:`Installation ` , :ref:`WNLI dataset card in catalog `, :ref:`Relation template in catalog `. +Related documentation: :ref:`Installation ` , :ref:`WNLI dataset card in catalog `, :ref:`Relation template in catalog `, :ref:`Inference Engines `. Evaluate a custom dataset @@ -48,7 +48,7 @@ It also shows how to use preprocessing steps to align the raw input of the datas `Example code `_ -Related documentation: :ref:`Add new dataset tutorial `, :ref:`Open QA task in catalog `, :ref:`Open QA template in catalog `. +Related documentation: :ref:`Add new dataset tutorial `, :ref:`Open QA task in catalog `, :ref:`Open QA template in catalog `, :ref:`Inference Engines `. Evaluation usecases @@ -62,7 +62,7 @@ It also shows how to register assets into a local catalog and reuse them. `Example code `_ -Related documentation: :ref:`Templates tutorial `, :ref:`Formatting tutorial `, :ref:`Using the Catalog `. +Related documentation: :ref:`Templates tutorial `, :ref:`Formatting tutorial `, :ref:`Using the Catalog `, :ref:`Inference Engines `. Evaluate the impact of different formats and system prompts ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -92,7 +92,7 @@ This example demonstrates how to evaluate a dataset using a pool of templates an `Example code `_ -Related documentation: :ref:`Templates tutorial `, :ref:`Formatting tutorial `, :ref:`Using the Catalog `. +Related documentation: :ref:`Templates tutorial `, :ref:`Formatting tutorial `, :ref:`Using the Catalog `, :ref:`Inference Engines `. Long Context +++++++++++++++++++++++++++++ @@ -115,7 +115,7 @@ This example shows how to construct a benchmark that includes multiple datasets, `Example code `_ -Related documentation: :ref:`Benchmarks tutorial `, :ref:`Formatting tutorial `, :ref:`Using the Catalog `. +Related documentation: :ref:`Benchmarks tutorial `, :ref:`Formatting tutorial `, :ref:`Using the Catalog `, :ref:`Inference Engines `. LLM as Judges -------------- @@ -127,7 +127,7 @@ This example demonstrates how to evaluate an existing QA dataset (squad) using t `Example code `_ -Related documentation: :ref:`Evaluating datasets `, :ref:`LLM as a Judge Metrics Guide `. +Related documentation: :ref:`Evaluating datasets `, :ref:`LLM as a Judge Metrics Guide `, :ref:`Inference Engines `. Evaluate a custom dataset using a custom LLM as Judge +++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -160,7 +160,7 @@ while the 70b model performs much better. `Example code `_ -Related documentation: :ref:`LLM as a Judge Metrics Guide `. +Related documentation: :ref:`LLM as a Judge Metrics Guide `, :ref:`Inference Engines `. Evaluate your model on the Arena Hard benchmark using a custom LLMaJ @@ -170,7 +170,7 @@ This example demonstrates how to evaluate a user model on the Arena Hard benchma `Example code `_ -Related documentation: :ref:`Evaluate a Model on Arena Hard Benchmark `. +Related documentation: :ref:`Evaluate a Model on Arena Hard Benchmark `, :ref:`Inference Engines `. Evaluate a judge model performance judging the Arena Hard Benchmark ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -180,7 +180,7 @@ The model is evaluated on its capability to give a judgment that is in correlati `Example code `_ -Related documentation: :ref:`Evaluate a Model on Arena Hard Benchmark `. +Related documentation: :ref:`Evaluate a Model on Arena Hard Benchmark `, :ref:`Inference Engines `. 
Evaluate using ensemble of LLM as a judge metrics ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -190,7 +190,7 @@ The example shows how to ensemble two judges which uses different templates. `Example code `_ -Related documentation: :ref:`LLM as a Judge Metrics Guide `. +Related documentation: :ref:`LLM as a Judge Metrics Guide `, :ref:`Inference Engines `. Evaluate predictions of models using pre-trained ensemble of LLM as judges ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -206,7 +206,7 @@ Groundedness: Every substantial claim in the response of the model is derivable IDK: Does the model response say I don't know? `Example code ` -Related documentation: :ref:`LLM as a Judge Metrics Guide `. +Related documentation: :ref:`LLM as a Judge Metrics Guide `, :ref:`Inference Engines `. RAG --- @@ -222,7 +222,7 @@ and use the existing metrics to evaluate model results. `Example code `_ -Related documentation: :ref:`RAG Guide `. :ref:`Response generation task `. +Related documentation: :ref:`RAG Guide `, :ref:`Response generation task `, :ref:`Inference Engines `. Multi-Modality -------------- @@ -243,7 +243,7 @@ This approach can be adapted for various image-text to text tasks, such as image `Example code `_ -Related documentation: :ref:`Multi-Modality Guide `. +Related documentation: :ref:`Multi-Modality Guide `, :ref:`Inference Engines `. Evaluate Image-Text to Text Model With Different Templates @@ -251,7 +251,7 @@ Evaluate Image-Text to Text Model With Different Templates Evaluate Image-Text to Text Models with different templates and explore the sensitivity of the model to different textual variations. `Example code `_ -Related documentation: :ref:`Multi-Modality Guide `. +Related documentation: :ref:`Multi-Modality Guide `, :ref:`Inference Engines `. Types and Serializers ---------------------------- @@ -263,5 +263,5 @@ This example show how to define new data types as well as the way these data typ `Example code `_ -Related documentation: :ref:`Types and Serializers Guide `. +Related documentation: :ref:`Types and Serializers Guide `, :ref:`Inference Engines `. diff --git a/docs/docs/inference.rst b/docs/docs/inference.rst index e69de29bb2..53d0913eff 100644 --- a/docs/docs/inference.rst +++ b/docs/docs/inference.rst @@ -0,0 +1,114 @@ +.. _inference: + +============== +Inference +============== + +.. note:: + + This tutorial requires a :ref:`Unitxt installation `. + +Introduction +------------ +Unitxt offers a wide array of :class:`Inference Engines ` for running models either locally (using HuggingFace, Ollama, and VLLM) or by making API requests to services like WatsonX, AWS, and Together AI. + +Unitxt inference engines serve two main purposes: + + 1. Running a full end-to-end evaluation pipeline with inference. + 2. Using models for intermediate steps, such as evaluating other models (e.g., LLMs as judges) or for data augmentation. + +Running Models Locally +----------------------- +You can run models locally with inference engines like: + + - :class:`HFPipelineBasedInferenceEngine ` + - :class:`VLLMInferenceEngine ` + - :class:`OllamaInferenceEngine ` + +To get started, prepare your engine: + +.. code-block:: python + + engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 + ) + +Then load the data: + +.. 
code-block:: python + + dataset = load_dataset( + card="cards.xsum", + template="templates.summarization.abstractive.formal", + format="formats.chat_api", + metrics=[llm_judge_with_summary_metric], + loader_limit=5, + split="test", + ) + +Notice: we create the data with `format="formats.chat_api"` which produce data as list of chat turns: + +.. code-block:: python + + [ + {"role": "system", "content": "Summarize the following Document."}, + {"role": "user", "content": "Document: <...>"} + ] + +Now run inference on the dataset: + +.. code-block:: python + + predictions = engine.infer(dataset) + +Finally, evaluate the predictions and obtain final scores: + +.. code-block:: python + + evaluate(predictions=predictions, data=dataset) + +Calling Models Through APIs +--------------------------- +Calling models through an API is even simpler and is primarily done using one class: :class:`CrossProviderInferenceEngine `. + +You can create a :class:`CrossProviderInferenceEngine` as follows: + +.. code-block:: python + + engine = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" + ) + +This engine supports providers such as ``watsonx``, ``together-ai``, ``open-ai``, ``aws``, ``ollama``, ``bam``, and ``watsonx-sdk``. + +It can be used with all supported models listed here: :class:`supported models `. + +Running inference follows the same pattern as before: + +.. code-block:: python + + predictions = engine.infer(dataset) + +Creating a Cross-API Engine +--------------------------- +Alternatively, you can create an engine without specifying a provider: + +.. code-block:: python + + engine = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct" + ) + +You can set the provider later by: + +.. code-block:: python + + import unitxt + + unitxt.settings.default_provider = "watsonx" + +Or by setting an environment variable: + +.. code-block:: bash + + export UNITXT_DEFAULT_PROVIDER="watsonx" \ No newline at end of file diff --git a/docs/docs/tutorials.rst b/docs/docs/tutorials.rst index fd01823d2d..8889e9928f 100644 --- a/docs/docs/tutorials.rst +++ b/docs/docs/tutorials.rst @@ -19,6 +19,7 @@ Tutorials ✨ multimodality operators saving_and_loading_from_catalog + inference production debugging helm From 9946ab6f7ceb004cbfdafe179ee20f3b4392fce6 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 15:51:27 +0200 Subject: [PATCH 24/26] Fix examples Signed-off-by: elronbandel --- examples/evaluate_different_templates_num_demos.py | 9 +++++---- examples/evaluate_existing_dataset_with_install.py | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/evaluate_different_templates_num_demos.py b/examples/evaluate_different_templates_num_demos.py index 9f27470268..645a8c3b32 100644 --- a/examples/evaluate_different_templates_num_demos.py +++ b/examples/evaluate_different_templates_num_demos.py @@ -11,14 +11,15 @@ ], num_demos=[0, 5], group_by=["template", "num_demos", ["template", "num_demos"]], - demos_pool_size=100, + demos_pool_size=10, loader_limit=200, + max_test_instances=10, + split="test", ) -test = dataset["test"].to_list() -predictions = ["entailment" for _ in test] +predictions = ["entailment" for _ in dataset] -results = evaluate(predictions=predictions, data=test) +results = evaluate(predictions=predictions, data=dataset) # Print the resulting scores per group. 
logger.info(results[0]["score"]["groups"]) diff --git a/examples/evaluate_existing_dataset_with_install.py b/examples/evaluate_existing_dataset_with_install.py index 5ee72ccae1..9c74d50e80 100644 --- a/examples/evaluate_existing_dataset_with_install.py +++ b/examples/evaluate_existing_dataset_with_install.py @@ -6,7 +6,6 @@ # We set loader_limit to 20 to limit reduce inference time. dataset = load_dataset( card="cards.wnli", - system_prompt="system_prompts.be_concise", template="templates.classification.multi_class.relation.default", format="formats.chat_api", num_demos=2, From ececf85285be05b93a6c228ad14d7513911719fa Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 17:18:42 +0200 Subject: [PATCH 25/26] Fix examples Signed-off-by: elronbandel --- ...valuate_a_judge_model_capabilities_on_arena_hard.py | 4 ++-- ...same_datasets_and_models_with_multiple_providers.py | 10 +++++----- tests/examples/test_examples.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py b/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py index 4b608596a2..a94fa1ccaf 100644 --- a/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py +++ b/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py @@ -10,9 +10,9 @@ card="cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge", template="templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", format="formats.chat_api", - max_test_instances=4, + max_test_instances=None, split="test", -) +).select(range(5)) inference_model = CrossProviderInferenceEngine( model="llama-3-2-1b-instruct", provider="watsonx" diff --git a/examples/evaluate_same_datasets_and_models_with_multiple_providers.py b/examples/evaluate_same_datasets_and_models_with_multiple_providers.py index 6f25b6e9a8..c56aedb3c6 100644 --- a/examples/evaluate_same_datasets_and_models_with_multiple_providers.py +++ b/examples/evaluate_same_datasets_and_models_with_multiple_providers.py @@ -56,21 +56,21 @@ template_card_index=0, format=format, num_demos=1, - demos_pool_size=100, + demos_pool_size=10, loader_limit=1000, - max_test_instances=500, + max_test_instances=10, disable_cache=False, + split="test", ) - test_dataset = dataset["test"] from unitxt.inference import CrossProviderInferenceEngine inference_model = CrossProviderInferenceEngine( model=model_name, max_tokens=1024, provider=provider ) - predictions = inference_model.infer(test_dataset) + predictions = inference_model.infer(dataset) - evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + evaluated_dataset = evaluate(predictions=predictions, data=dataset) # import pandas as pd # result_df = pd.json_normalize(evaluated_dataset) # result_df.to_csv(f"output.csv") diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py index 49e21fffed..cdacefea00 100644 --- a/tests/examples/test_examples.py +++ b/tests/examples/test_examples.py @@ -27,7 +27,7 @@ # "evaluate_different_formats.py", # "evaluate_different_templates.py", # "evaluate_different_demo_selections.py", - # "evaluate_a_judge_model_capabilities_on_arena_hard.py", + "evaluate_a_judge_model_capabilities_on_arena_hard.py", # "evaluate_a_model_using_arena_hard.py", # "evaluate_llm_as_judge.py", "evaluate_using_metrics_ensemble.py", From 71365e70451fb1622c377c5465604058583f738b Mon Sep 17 00:00:00 2001 From: Elron Bandel Date: Tue, 19 Nov 2024 17:19:23 +0200 Subject: [PATCH 26/26] Update 
docs/docs/inference.rst Co-authored-by: Yoav Katz <68273864+yoavkatz@users.noreply.github.com> --- docs/docs/inference.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/inference.rst b/docs/docs/inference.rst index 53d0913eff..99c02669c9 100644 --- a/docs/docs/inference.rst +++ b/docs/docs/inference.rst @@ -10,7 +10,7 @@ Inference Introduction ------------ -Unitxt offers a wide array of :class:`Inference Engines ` for running models either locally (using HuggingFace, Ollama, and VLLM) or by making API requests to services like WatsonX, AWS, and Together AI. +Unitxt offers a wide array of :class:`Inference Engines ` for running models either locally (using HuggingFace, Ollama, and VLLM) or by making API requests to services like WatsonX, OpenAI, AWS, and Together AI. Unitxt inference engines serve two main purposes:
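Editor's note: to close, here is a minimal end-to-end sketch that ties the pieces of this patch series together, in the spirit of the snippets added to ``docs/docs/inference.rst``. The card, template, provider, and model names are borrowed from the examples in these patches and are illustrative, not prescriptive.

.. code-block:: python

    from unitxt import evaluate, load_dataset
    from unitxt.inference import CrossProviderInferenceEngine

    # Load a catalog dataset with chat-formatted sources.
    dataset = load_dataset(
        card="cards.wnli",
        template="templates.classification.multi_class.relation.default",
        format="formats.chat_api",
        loader_limit=20,
        split="test",
    )

    # Route the requests through a provider-agnostic engine.
    engine = CrossProviderInferenceEngine(
        model="llama-3-8b-instruct", provider="watsonx", max_tokens=32
    )

    predictions = engine.infer(dataset)
    results = evaluate(predictions=predictions, data=dataset)

    print(results[0]["score"]["global"])

Because ``CrossProviderInferenceEngine`` resolves the model identifier per provider, switching ``provider`` (or setting ``unitxt.settings.default_provider``) changes the backend without touching the rest of the pipeline.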