From de868ab67cf1d67d293f8621f6f0ac6811462bef Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 12 Nov 2024 17:10:53 +0200 Subject: [PATCH 01/20] Add multi api inference engine Signed-off-by: elronbandel --- .../evaluate_benchmark_with_custom_api.py | 30 ++++++ prepare/engines/multi_api/llama3.py | 16 +++ .../engines/model/llama_3_8b_instruct.json | 12 +++ src/unitxt/inference.py | 100 +++++++++++++++--- src/unitxt/settings_utils.py | 2 + src/unitxt/standard.py | 12 ++- 6 files changed, 155 insertions(+), 17 deletions(-) create mode 100644 examples/evaluate_benchmark_with_custom_api.py create mode 100644 prepare/engines/multi_api/llama3.py create mode 100644 src/unitxt/catalog/engines/model/llama_3_8b_instruct.json diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_api.py new file mode 100644 index 000000000..5db2e418d --- /dev/null +++ b/examples/evaluate_benchmark_with_custom_api.py @@ -0,0 +1,30 @@ +import unitxt +from unitxt import evaluate, get_from_catalog, load_dataset +from unitxt.text_utils import print_dict + +with unitxt.settings.context( + default_inference_api="watsonx", # option a to define your home api + default_format="formats.chat_api", + disable_hf_datasets_cache=False, +): + data = load_dataset("benchmarks.glue[max_samples_per_subset=5]", split="test") + + model = get_from_catalog( + "engines.model.llama_3_8b_instruct[api=watsonx]" + ) # option b to define your home api + + predictions = model.infer(data) + + evaluated_dataset = evaluate(predictions=predictions, data=data) + + print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "subset", + ], + ) + print_dict( + evaluated_dataset[0]["score"]["subsets"], + ) diff --git a/prepare/engines/multi_api/llama3.py b/prepare/engines/multi_api/llama3.py new file mode 100644 index 000000000..8ebaa4adf --- /dev/null +++ b/prepare/engines/multi_api/llama3.py @@ -0,0 +1,16 @@ +from unitxt.catalog import add_to_catalog +from unitxt.inference import MultiAPIInferenceEngine + +engine = MultiAPIInferenceEngine( + model="llama-3-8b-instruct", + api_model_map={ + "watsonx": { + "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", + }, + "together-ai": { + "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" + }, + }, +) + +add_to_catalog(engine, "engines.model.llama_3_8b_instruct", overwrite=True) diff --git a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json new file mode 100644 index 000000000..a6c2be46c --- /dev/null +++ b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json @@ -0,0 +1,12 @@ +{ + "__type__": "multi_api_inference_engine", + "model": "llama-3-8b-instruct", + "api_model_map": { + "watsonx": { + "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct" + }, + "together-ai": { + "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" + } + } +} diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 7604fb341..9f80b220d 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1121,9 +1121,9 @@ def _infer( model, params = self._load_model_and_params() result = [] - for instance in dataset: + for source in dataset["source"]: instance_result = model.generate( - prompt=instance["source"], + prompt=source, params=self.to_dict([WMLInferenceEngineParamsMixin], keep_empty=False), ) prediction = instance_result["results"][0]["generated_text"] @@ -1364,9 +1364,7 @@ class 
LMMSEvalBaseInferenceEngine( batch_size: int = 1 image_token = "" - _requirements_list = { - "lmms_eval": "Install llms-eval package using 'pip install lmms-eval==0.2.4'", - } + _requirements_list = ["lmms-eval==0.2.4"] def prepare_engine(self): if not self.lazy_load: @@ -1413,6 +1411,7 @@ def _infer( dataset: Union[List[Dict[str, Any]], DatasetDict], return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: + self.verify_not_chat_api(dataset) if not self._is_loaded(): self._prepare_engine() @@ -1562,12 +1561,26 @@ async def acquire(self, tokens=1): await asyncio.sleep(time_until_next_token) -class LiteLLMInferenceEngine(InferenceEngine, PackageRequirementsMixin): +class StandardAPIParamsMixin(Artifact): model: str - max_tokens: int = 256 - seed: int = 1 - temperature: float = 0.0 - top_p: float = 1.0 + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + max_tokens: Optional[int] = None + seed: Optional[int] = None + stop: Union[Optional[str], List[str]] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + top_logprobs: Optional[int] = 20 + logit_bias: Optional[Dict[str, int]] = None + logprobs: Optional[bool] = True + n: Optional[int] = None + parallel_tool_calls: Optional[bool] = None + service_tier: Optional[Literal["auto", "default"]] = None + + +class LiteLLMInferenceEngine( + InferenceEngine, StandardAPIParamsMixin, PackageRequirementsMixin +): max_requests_per_second: float = 6 max_retries: int = 5 # Set to 0 to prevent internal retries @@ -1599,15 +1612,12 @@ async def _infer_instance( # Introduce a slight delay to prevent burstiness await asyncio.sleep(0.01) messages = self.to_messages(instance) + kwargs = self.to_dict([StandardAPIParamsMixin]) response = await self._completion( - model=self.model, messages=messages, - seed=self.seed, - max_tokens=self.max_tokens, - temperature=self.temperature, - top_p=self.top_p, max_retries=self.max_retries, caching=True, + **kwargs, ) usage = response.get("usage", {}) return TextGenerationInferenceOutput( @@ -1643,3 +1653,63 @@ def _infer( return responses return [response.prediction for response in responses] + + +_supported_apis = Literal["watsonx", "together-ai", "open-ai"] + + +class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): + """Inference engine capable of dynamically switching between multiple APIs. + + This class extends the InferenceEngine and OpenAiInferenceEngineParamsMixin + to enable seamless integration with various API providers. The supported APIs are + specified in `_supported_apis`, allowing users to interact with multiple models + from different sources. The `api_model_map` dictionary maps each API to + specific model identifiers, enabling automatic configuration based on + user requests. + + Attributes: + api: Optional; Specifies the current API in use. Must be one of the + literals in `_supported_apis`. + api_model_map: Dictionary mapping each supported API to a corresponding + model identifier string. This mapping allows consistent access to models + across different API backends. 
+ """ + + api: Optional[_supported_apis] = None + + api_model_map: Dict[_supported_apis, Dict[str, str]] = { + "watsonx": { + "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", + }, + "together-ai": { + "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" + }, + } + + _api_to_base_class = { + "watsonx": LiteLLMInferenceEngine, + "open-ai": LiteLLMInferenceEngine, + "together-ai": LiteLLMInferenceEngine, + } + + def get_api_name(self): + return self.api if self.api is not None else settings.default_inference_api + + def prepare_engine(self): + api = self.get_api_name() + cls = self.__class__._api_to_base_class[api] + args = self.to_dict([OpenAiInferenceEngineParamsMixin]) + args["model"] = self.api_model_map[api][self.model] + self.engine = cls(**args) + + def _infer( + self, + dataset: List[Dict[str, Any]] | DatasetDict, + return_meta_data: bool = False, + ) -> Union[List[str], List[TextGenerationInferenceOutput]]: + return self.engine._infer(dataset, return_meta_data) + + def get_engine_id(self): + api = self.get_api_name() + return get_model_and_label_id(self.api_model_map[api][self.model], api) diff --git a/src/unitxt/settings_utils.py b/src/unitxt/settings_utils.py index 6bde1718c..47ec7faf7 100644 --- a/src/unitxt/settings_utils.py +++ b/src/unitxt/settings_utils.py @@ -151,6 +151,8 @@ def __getattr__(self, key): settings.disable_hf_datasets_cache = (bool, True) settings.loader_cache_size = (int, 1) settings.task_data_as_text = (bool, True) + settings.default_inference_api = "watsonx" + settings.default_format = None if Constants.is_uninitilized(): constants = Constants() diff --git a/src/unitxt/standard.py b/src/unitxt/standard.py index b9989d82b..ba2607e31 100644 --- a/src/unitxt/standard.py +++ b/src/unitxt/standard.py @@ -1,5 +1,6 @@ from typing import List, Optional, Union +from .artifact import fetch_artifact from .augmentors import ( Augmentor, FinalStateInputsAugmentor, @@ -16,7 +17,7 @@ from .recipe import Recipe from .schema import FinalizeDataset from .serializers import SingleTypeSerializer -from .settings_utils import get_constants +from .settings_utils import get_constants, get_settings from .splitters import ConstantSizeSample, RandomSizeSample, Sampler, SeparateSplit from .stream import MultiStream from .system_prompts import EmptySystemPrompt, SystemPrompt @@ -25,6 +26,7 @@ from .utils import LRUCache constants = get_constants() +settings = get_settings() logger = get_logger() @@ -39,7 +41,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator): task: Task = None template: Union[Template, List[Template], TemplatesList] = None system_prompt: SystemPrompt = Field(default_factory=EmptySystemPrompt) - format: Format = Field(default_factory=SystemFormat) + format: Format = None serializer: Union[SingleTypeSerializer, List[SingleTypeSerializer]] = None # Additional parameters @@ -263,6 +265,12 @@ def produce(self, task_instances): return list(multi_stream[constants.inference_stream]) def reset_pipeline(self): + if self.format is None: + if settings.default_format is not None: + self.format, _ = fetch_artifact(settings.default_format) + else: + self.format = SystemFormat() + if self.card and self.card.preprocess_steps is None: self.card.preprocess_steps = [] From c40d87dbb2eb98acb3d8c4262994abb75bfa8c96 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 12 Nov 2024 17:20:10 +0200 Subject: [PATCH 02/20] Fix Signed-off-by: elronbandel --- src/unitxt/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/unitxt/inference.py b/src/unitxt/inference.py index 9f80b220d..d9494482b 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1705,7 +1705,7 @@ def prepare_engine(self): def _infer( self, - dataset: List[Dict[str, Any]] | DatasetDict, + dataset: Union[List[Dict[str, Any]], DatasetDict], return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: return self.engine._infer(dataset, return_meta_data) From d53eb69007ddfd8dcef9a7bef77f91a4acf1f784 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 12 Nov 2024 17:24:10 +0200 Subject: [PATCH 03/20] Set to greedy decoding Signed-off-by: elronbandel --- examples/evaluate_benchmark_with_custom_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_api.py index 5db2e418d..fd232cea0 100644 --- a/examples/evaluate_benchmark_with_custom_api.py +++ b/examples/evaluate_benchmark_with_custom_api.py @@ -10,7 +10,7 @@ data = load_dataset("benchmarks.glue[max_samples_per_subset=5]", split="test") model = get_from_catalog( - "engines.model.llama_3_8b_instruct[api=watsonx]" + "engines.model.llama_3_8b_instruct[api=watsonx,top_k=1]" ) # option b to define your home api predictions = model.infer(data) From b36b7ab8f56fd871a6a6fae1e8685d740b810bfe Mon Sep 17 00:00:00 2001 From: elronbandel Date: Sun, 17 Nov 2024 21:40:50 +0200 Subject: [PATCH 04/20] Some fixes Signed-off-by: elronbandel --- .../evaluate_benchmark_with_custom_api.py | 45 ++++++------ src/unitxt/benchmark.py | 25 +++++-- src/unitxt/inference.py | 72 +++++++++++-------- src/unitxt/standard.py | 3 + tests/library/test_benchmark.py | 34 +++++++++ 5 files changed, 121 insertions(+), 58 deletions(-) diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_api.py index fd232cea0..12e2bbc57 100644 --- a/examples/evaluate_benchmark_with_custom_api.py +++ b/examples/evaluate_benchmark_with_custom_api.py @@ -1,30 +1,27 @@ -import unitxt -from unitxt import evaluate, get_from_catalog, load_dataset +from unitxt import evaluate, load_dataset +from unitxt.inference import MultiAPIInferenceEngine from unitxt.text_utils import print_dict -with unitxt.settings.context( - default_inference_api="watsonx", # option a to define your home api - default_format="formats.chat_api", - disable_hf_datasets_cache=False, -): - data = load_dataset("benchmarks.glue[max_samples_per_subset=5]", split="test") +data = load_dataset( + "benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api]", + split="test", + disable_cache=False, +) - model = get_from_catalog( - "engines.model.llama_3_8b_instruct[api=watsonx,top_k=1]" - ) # option b to define your home api +model = MultiAPIInferenceEngine(model="llama-3-8b-instruct", top_k=1, api="watsonx") - predictions = model.infer(data) +predictions = model.infer(data) - evaluated_dataset = evaluate(predictions=predictions, data=data) +evaluated_dataset = evaluate(predictions=predictions, data=data) - print_dict( - evaluated_dataset[0], - keys_to_print=[ - "source", - "prediction", - "subset", - ], - ) - print_dict( - evaluated_dataset[0]["score"]["subsets"], - ) +print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "subset", + ], +) +print_dict( + evaluated_dataset[0]["score"]["subsets"], +) diff --git a/src/unitxt/benchmark.py b/src/unitxt/benchmark.py index 33f65d011..7678dc175 100644 --- a/src/unitxt/benchmark.py +++ b/src/unitxt/benchmark.py @@ 
-1,3 +1,4 @@ +from abc import abstractmethod from typing import Dict, Union from .dataclass import NonPositionalField @@ -15,6 +16,10 @@ class BaseBenchmark(SourceOperator): system_prompt: SystemPrompt = NonPositionalField(default=None) loader_limit: int = NonPositionalField(default=None) + @abstractmethod + def reset(self): + pass + class Benchmark(BaseBenchmark): subsets: Dict[str, Union[StandardRecipe, BaseBenchmark]] @@ -23,16 +28,20 @@ class Benchmark(BaseBenchmark): max_samples_per_subset: int = None def verify(self): + super().verify() if ( self.max_total_samples is not None and self.max_samples_per_subset is not None ): raise ValueError("Set either max_total_samples or max_samples_per_subset") - def prepare(self): - for subset in self.subsets.values(): - subset.loader_limit = self.loader_limit - if self.format is not None or self.num_demos is not None: + def reset(self): + if ( + self.format is not None + or self.num_demos is not None + or self.system_prompt is not None + or self.loader_limit is not None + ): for subset in self.subsets.values(): if self.num_demos is not None: subset.num_demos = self.num_demos @@ -42,7 +51,13 @@ def prepare(self): subset.system_prompt = self.system_prompt if self.loader_limit is not None: subset.loader_limit = self.loader_limit - subset.prepare() + + subset.reset() + + def prepare(self): + super().prepare() + + self.reset() def process( self, diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index b87116ca7..02085327e 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -28,6 +28,24 @@ logger = get_logger() +class StandardAPIParamsMixin(Artifact): + model: str + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + max_tokens: Optional[int] = None + seed: Optional[int] = None + stop: Union[Optional[str], List[str]] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + top_logprobs: Optional[int] = 20 + logit_bias: Optional[Dict[str, int]] = None + logprobs: Optional[bool] = True + n: Optional[int] = None + parallel_tool_calls: Optional[bool] = None + service_tier: Optional[Literal["auto", "default"]] = None + + def get_model_and_label_id(model_name, label): model_id = model_name.split("/")[-1].replace("-", "_").replace(".", ",").lower() return f"{model_id}_{label}" @@ -372,16 +390,17 @@ def _infer( return self.engine._infer(dataset) -class OllamaInferenceEngine(InferenceEngine, PackageRequirementsMixin): +class OllamaInferenceEngine( + InferenceEngine, StandardAPIParamsMixin, PackageRequirementsMixin +): label: str = "ollama" - model_name: str _requirements_list = { "ollama": "Install ollama package using 'pip install --upgrade ollama" } data_classification_policy = ["public", "proprietary"] def get_engine_id(self): - return get_model_and_label_id(self.model_name, self.label) + return get_model_and_label_id(self.model, self.label) def prepare_engine(self): pass @@ -393,13 +412,16 @@ def _infer( ) -> Union[List[str], List[TextGenerationInferenceOutput]]: import ollama + args = self.to_dict([StandardAPIParamsMixin]) + results = [] for instance in dataset: messages = self.to_messages(instance) response = ollama.chat( - model=self.model_name, + model=self.model, messages=messages, + **args, ) results.append(response) @@ -1562,23 +1584,6 @@ async def acquire(self, tokens=1): await asyncio.sleep(time_until_next_token) -class StandardAPIParamsMixin(Artifact): - model: str - frequency_penalty: Optional[float] = None - presence_penalty: 
Optional[float] = None - max_tokens: Optional[int] = None - seed: Optional[int] = None - stop: Union[Optional[str], List[str]] = None - temperature: Optional[float] = None - top_p: Optional[float] = None - top_logprobs: Optional[int] = 20 - logit_bias: Optional[Dict[str, int]] = None - logprobs: Optional[bool] = True - n: Optional[int] = None - parallel_tool_calls: Optional[bool] = None - service_tier: Optional[Literal["auto", "default"]] = None - - class LiteLLMInferenceEngine( InferenceEngine, StandardAPIParamsMixin, PackageRequirementsMixin ): @@ -1616,7 +1621,6 @@ async def _infer_instance( kwargs = self.to_dict([StandardAPIParamsMixin]) try: response = await self._completion( - model=self.model, messages=messages, max_retries=self.max_retries, caching=True, @@ -1663,8 +1667,7 @@ def _infer( return [response.prediction for response in responses] - -_supported_apis = Literal["watsonx", "together-ai", "open-ai"] +_supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama"] class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): @@ -1690,9 +1693,19 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): api_model_map: Dict[_supported_apis, Dict[str, str]] = { "watsonx": { "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", + "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct", }, "together-ai": { - "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" + "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct", + "llama-3-70b-instruct": "together_ai/togethercomputer/llama-3-70b-instruct", + }, + "aws": { + "llama-3-8b-instruct": "bedrock/meta.llama3-8b-instruct-v1:0", + "llama-3-70b-instruct": "bedrock/meta.llama3-70b-instruct-v1:0", + }, + "ollama": { + "llama-3-8b-instruct": "llama3:8b", + "llama-3-70b-instruct": "llama3:70b", }, } @@ -1700,6 +1713,8 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "watsonx": LiteLLMInferenceEngine, "open-ai": LiteLLMInferenceEngine, "together-ai": LiteLLMInferenceEngine, + "aws": LiteLLMInferenceEngine, + "ollama": OllamaInferenceEngine, } def get_api_name(self): @@ -1708,10 +1723,10 @@ def get_api_name(self): def prepare_engine(self): api = self.get_api_name() cls = self.__class__._api_to_base_class[api] - args = self.to_dict([OpenAiInferenceEngineParamsMixin]) + args = self.to_dict([StandardAPIParamsMixin]) args["model"] = self.api_model_map[api][self.model] self.engine = cls(**args) - + def _infer( self, dataset: Union[List[Dict[str, Any]], DatasetDict], @@ -1723,6 +1738,7 @@ def get_engine_id(self): api = self.get_api_name() return get_model_and_label_id(self.api_model_map[api][self.model], api) + class HFOptionSelectingInferenceEngine(InferenceEngine): """HuggingFace based class for inference engines that calculate log probabilities. 
@@ -1797,13 +1813,11 @@ def get_log_probs(self, texts): return log_probs - def _infer( self, dataset: Union[List[Dict[str, Any]], DatasetDict], return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: - inputs = [] for instance in dataset: diff --git a/src/unitxt/standard.py b/src/unitxt/standard.py index 5a3c6017f..9f0d8add6 100644 --- a/src/unitxt/standard.py +++ b/src/unitxt/standard.py @@ -264,6 +264,9 @@ def produce(self, task_instances): multi_stream = self.inference(multi_stream) return list(multi_stream[constants.inference_stream]) + def reset(self): + self.reset_pipeline() + def reset_pipeline(self): if self.format is None: if settings.default_format is not None: diff --git a/tests/library/test_benchmark.py b/tests/library/test_benchmark.py index a4efb82ad..21579d97e 100644 --- a/tests/library/test_benchmark.py +++ b/tests/library/test_benchmark.py @@ -97,3 +97,37 @@ def test_benchmark(self): }, ], ) + + def test_benchmark_format_trickling(self): + benchmark = Benchmark( + format="formats.chat_api", + max_samples_per_subset=2, + loader_limit=30, + subsets={ + "cola": Benchmark( + format="formats.user_agent", + max_samples_per_subset=1, + loader_limit=300, + subsets={ + "cola": StandardRecipe( + card="cards.cola", + template="templates.classification.multi_class.instruction", + ), + "wnli": StandardRecipe( + card="cards.wnli", + format="formats.empty", + template="templates.classification.multi_class.relation.default", + ), + }, + ), + "wnli": StandardRecipe( + card="cards.wnli", + template="templates.classification.multi_class.relation.default", + ), + }, + ) + + test_dataset = list(benchmark()["test"]) + + for instance in test_dataset: + self.assertTrue(instance["source"].startswith('[{"role": ')) From 059378812898d1dd0329b07d0008c3b333d243c3 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 09:35:03 +0200 Subject: [PATCH 05/20] Fix consistency and preparation Signed-off-by: elronbandel --- prepare/engines/ollama/llama2.py | 2 +- src/unitxt/catalog/engines/ollama/llama2.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare/engines/ollama/llama2.py b/prepare/engines/ollama/llama2.py index e53999e2c..b89e90099 100644 --- a/prepare/engines/ollama/llama2.py +++ b/prepare/engines/ollama/llama2.py @@ -1,5 +1,5 @@ from unitxt.catalog import add_to_catalog from unitxt.inference import OllamaInferenceEngine -inference_model = OllamaInferenceEngine(model_name="llama2") +inference_model = OllamaInferenceEngine(model="llama2") add_to_catalog(inference_model, "engines.ollama.llama2", overwrite=True) diff --git a/src/unitxt/catalog/engines/ollama/llama2.json b/src/unitxt/catalog/engines/ollama/llama2.json index 3c5c39cc9..9aec1ded5 100644 --- a/src/unitxt/catalog/engines/ollama/llama2.json +++ b/src/unitxt/catalog/engines/ollama/llama2.json @@ -1,4 +1,4 @@ { "__type__": "ollama_inference_engine", - "model_name": "llama2" + "model": "llama2" } From 28bafa2c301d97955bcc8f449751bff9f349ea0a Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 11:27:16 +0200 Subject: [PATCH 06/20] Update Signed-off-by: elronbandel --- examples/evaluate_benchmark_with_custom_api.py | 6 ++++-- ...evaluate_image_text_to_text_with_different_templates.py | 2 +- prepare/system_prompts/general/be_concise.py | 7 +++++++ pyproject.toml | 2 +- src/unitxt/catalog/system_prompts/general/be_concise.json | 4 ++++ src/unitxt/inference.py | 1 - 6 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 
prepare/system_prompts/general/be_concise.py create mode 100644 src/unitxt/catalog/system_prompts/general/be_concise.json diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_api.py index 12e2bbc57..379c6f605 100644 --- a/examples/evaluate_benchmark_with_custom_api.py +++ b/examples/evaluate_benchmark_with_custom_api.py @@ -3,12 +3,14 @@ from unitxt.text_utils import print_dict data = load_dataset( - "benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api]", + "benchmarks.glue[max_samples_per_subset=5, format=formats.chat_api, system_prompt=system_prompts.general.be_concise]", split="test", disable_cache=False, ) -model = MultiAPIInferenceEngine(model="llama-3-8b-instruct", top_k=1, api="watsonx") +model = MultiAPIInferenceEngine( + model="llama-3-8b-instruct", temperature=0.0, top_p=1.0, api="watsonx" +) predictions = model.infer(data) diff --git a/examples/evaluate_image_text_to_text_with_different_templates.py b/examples/evaluate_image_text_to_text_with_different_templates.py index bf425b782..6e2d132d4 100644 --- a/examples/evaluate_image_text_to_text_with_different_templates.py +++ b/examples/evaluate_image_text_to_text_with_different_templates.py @@ -48,5 +48,5 @@ for subset in dataset.subsets: logger.info( - f"{subset.title()}: ", results[0]["score"]["subsets"][subset]["score"] + f'{subset.title()}: {results[0]["score"]["subsets"][subset]["score"]}' ) diff --git a/prepare/system_prompts/general/be_concise.py b/prepare/system_prompts/general/be_concise.py new file mode 100644 index 000000000..6ac906822 --- /dev/null +++ b/prepare/system_prompts/general/be_concise.py @@ -0,0 +1,7 @@ +from unitxt.catalog import add_to_catalog +from unitxt.system_prompts import TextualSystemPrompt + +system_prompt = TextualSystemPrompt( + "be concise. at every point give the shortest acceptable answer." +) +add_to_catalog(system_prompt, "system_prompts.general.be_concise", overwrite=True) diff --git a/pyproject.toml b/pyproject.toml index 4159c8c85..1c575e03e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,7 +114,7 @@ watsonx = [ "ibm-watsonx-ai==1.1.14" ] inference-tests = [ - "litellm @ git+https://github.com/BerriAI/litellm.git@main", + "litellm==v1.52.9", "tenacity", "diskcache", "numpy==1.26.4" diff --git a/src/unitxt/catalog/system_prompts/general/be_concise.json b/src/unitxt/catalog/system_prompts/general/be_concise.json new file mode 100644 index 000000000..01e54b862 --- /dev/null +++ b/src/unitxt/catalog/system_prompts/general/be_concise.json @@ -0,0 +1,4 @@ +{ + "__type__": "textual_system_prompt", + "text": "be concise. at every point give the shortest acceptable answer." 
+} diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 02085327e..079e32cd4 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -37,7 +37,6 @@ class StandardAPIParamsMixin(Artifact): stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None top_p: Optional[float] = None - top_k: Optional[int] = None top_logprobs: Optional[int] = 20 logit_bias: Optional[Dict[str, int]] = None logprobs: Optional[bool] = True From 3c861fbee0be3e0ce005b405c14458854deb23fd Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 13:58:06 +0200 Subject: [PATCH 07/20] Fix test Signed-off-by: elronbandel --- tests/inference/test_inference_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py index 12909913b..a1f81495a 100644 --- a/tests/inference/test_inference_engine.py +++ b/tests/inference/test_inference_engine.py @@ -79,7 +79,7 @@ def test_dataset_verification_inference_engine(self): f"data with classification '{inference_model.data_classification_policy}'. To " f"enable this either change the 'data_classification_policy' attribute of the " f"artifact, or modify the environment variable 'UNITXT_DATA_CLASSIFICATION_POLICY' " - f"accordingly.", + f"accordingly.\nFor more information: see https://www.unitxt.ai/en/latest//docs/data_classification_policy.html \n", ) def test_llava_inference_engine(self): From f9cd539c848284b27bb172c2ce055811aaaaa883 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 14:17:31 +0200 Subject: [PATCH 08/20] Make all args None Signed-off-by: elronbandel --- src/unitxt/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 079e32cd4..5acb50d01 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -37,9 +37,9 @@ class StandardAPIParamsMixin(Artifact): stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None top_p: Optional[float] = None - top_logprobs: Optional[int] = 20 + top_logprobs: Optional[int] = None logit_bias: Optional[Dict[str, int]] = None - logprobs: Optional[bool] = True + logprobs: Optional[bool] = None n: Optional[int] = None parallel_tool_calls: Optional[bool] = None service_tier: Optional[Literal["auto", "default"]] = None From 4165c783aef6214a8ee5756d676bb2f4ed263293 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 14:51:14 +0200 Subject: [PATCH 09/20] Try Signed-off-by: elronbandel --- src/unitxt/operators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unitxt/operators.py b/src/unitxt/operators.py index 8edf8c32b..b761deecc 100644 --- a/src/unitxt/operators.py +++ b/src/unitxt/operators.py @@ -400,7 +400,7 @@ def verify_field_definition(self): ), f"the from and to fields must be defined or implied from the other inputs got: {self._field_to_field}" assert ( len(self._field_to_field) > 0 - ), f"'input argument 'field_to_field' should convey at least one field to process. Got {self.field_to_field}" + ), f"'input argument '{self.__class__.__name__}.field_to_field' should convey at least one field to process. 
Got {self.field_to_field}" # self._field_to_field is built explicitly by pairs, or copied from argument 'field_to_field' if self.field_to_field is None: return From f202c3a7947d5a3033bd6c92e50bcfc3b9a8fdd7 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 15:10:44 +0200 Subject: [PATCH 10/20] Fix grammar Signed-off-by: elronbandel --- prepare/system_prompts/general/be_concise.py | 2 +- src/unitxt/catalog/system_prompts/general/be_concise.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare/system_prompts/general/be_concise.py b/prepare/system_prompts/general/be_concise.py index 6ac906822..4b5d69aed 100644 --- a/prepare/system_prompts/general/be_concise.py +++ b/prepare/system_prompts/general/be_concise.py @@ -2,6 +2,6 @@ from unitxt.system_prompts import TextualSystemPrompt system_prompt = TextualSystemPrompt( - "be concise. at every point give the shortest acceptable answer." + "Be concise. At every point give the shortest acceptable answer." ) add_to_catalog(system_prompt, "system_prompts.general.be_concise", overwrite=True) diff --git a/src/unitxt/catalog/system_prompts/general/be_concise.json b/src/unitxt/catalog/system_prompts/general/be_concise.json index 01e54b862..48f28b5bb 100644 --- a/src/unitxt/catalog/system_prompts/general/be_concise.json +++ b/src/unitxt/catalog/system_prompts/general/be_concise.json @@ -1,4 +1,4 @@ { "__type__": "textual_system_prompt", - "text": "be concise. at every point give the shortest acceptable answer." + "text": "Be concise. At every point give the shortest acceptable answer." } From bd8e176fda74a4c233c85a7cab0e63acbbb6bc6c Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 15:25:01 +0200 Subject: [PATCH 11/20] Fix Signed-off-by: elronbandel --- examples/evaluate_benchmark.py | 3 +-- .../evaluate_different_demo_selections.py | 9 ++++---- examples/evaluate_image_text_to_text.py | 10 ++++---- prepare/tasks/qa/multiple_choice/tasks.py | 4 ++++ .../tasks/qa/multiple_choice/open.json | 3 +++ .../qa/multiple_choice/with_context.json | 4 ++++ .../with_context/with_topic.json | 4 ++++ .../tasks/qa/multiple_choice/with_topic.json | 4 ++++ src/unitxt/standard.py | 23 +++++++++++-------- 9 files changed, 43 insertions(+), 21 deletions(-) diff --git a/examples/evaluate_benchmark.py b/examples/evaluate_benchmark.py index dca439f16..e92b7e309 100644 --- a/examples/evaluate_benchmark.py +++ b/examples/evaluate_benchmark.py @@ -48,9 +48,8 @@ # Infere using flan t5 base using HF API -model_name = "google/flan-t5-base" inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 + model_name="google/flan-t5-base", max_new_tokens=32 ) predictions = inference_model.infer(test_dataset) diff --git a/examples/evaluate_different_demo_selections.py b/examples/evaluate_different_demo_selections.py index 61d7d6837..d494b89f4 100644 --- a/examples/evaluate_different_demo_selections.py +++ b/examples/evaluate_different_demo_selections.py @@ -32,14 +32,13 @@ num_demos=num_demos, demos_pool_size=50, loader_limit=200, - max_test_instances=100, + max_test_instances=10, sampler=demo_sampler, + split="test", ) - test_dataset = dataset["test"] - - predictions = inference_model.infer(test_dataset) - evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + predictions = inference_model.infer(dataset) + evaluated_dataset = evaluate(predictions=predictions, data=dataset) logger.info( f"Sample input and output for sampler {demo_sampler} and num_demos '{num_demos}':" diff --git 
a/examples/evaluate_image_text_to_text.py b/examples/evaluate_image_text_to_text.py index a4f0dc6c1..1edbe02e6 100644 --- a/examples/evaluate_image_text_to_text.py +++ b/examples/evaluate_image_text_to_text.py @@ -6,19 +6,19 @@ with settings.context( disable_hf_datasets_cache=False, ): - inference_model = HFLlavaInferenceEngine( - model_name="llava-hf/llava-interleave-qwen-0.5b-hf", max_new_tokens=32 - ) - dataset = load_dataset( card="cards.doc_vqa.lmms_eval", template="templates.qa.with_context.title", format="formats.chat_api", - loader_limit=300, + loader_limit=10, augmentor="augmentors.image.grey_scale", split="test", ) + inference_model = HFLlavaInferenceEngine( + model_name="llava-hf/llava-interleave-qwen-0.5b-hf", max_new_tokens=32 + ) + predictions = inference_model.infer(dataset) evaluated_dataset = evaluate(predictions=predictions, data=dataset) diff --git a/prepare/tasks/qa/multiple_choice/tasks.py b/prepare/tasks/qa/multiple_choice/tasks.py index cf8952e29..8fe83cad0 100644 --- a/prepare/tasks/qa/multiple_choice/tasks.py +++ b/prepare/tasks/qa/multiple_choice/tasks.py @@ -14,6 +14,7 @@ }, reference_fields={"answer": Union[int, str], "choices": List[str]}, prediction_type=str, + augmentable_inputs=["context", "question"], metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.with_context", @@ -26,6 +27,7 @@ input_fields={"topic": str, "question": str, "choices": List[str]}, reference_fields={"answer": Union[int, str], "choices": List[str]}, prediction_type=str, + augmentable_inputs=["topic", "question"], metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.with_topic", @@ -37,6 +39,7 @@ input_fields={"question": str, "choices": List[str]}, reference_fields={"answer": Union[int, str], "choices": List[str]}, prediction_type=str, + augmentable_inputs=["question"], metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.open", @@ -54,6 +57,7 @@ }, reference_fields={"answer": Union[int, str], "choices": List[str]}, prediction_type=str, + augmentable_inputs=["context", "question"], metrics=["metrics.accuracy"], ), "tasks.qa.multiple_choice.with_context.with_topic", diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/open.json b/src/unitxt/catalog/tasks/qa/multiple_choice/open.json index 1cd21924d..a6422737f 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/open.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/open.json @@ -9,6 +9,9 @@ "choices": "List[str]" }, "prediction_type": "str", + "augmentable_inputs": [ + "question" + ], "metrics": [ "metrics.accuracy" ] diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json index a22346778..be5de61a5 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context.json @@ -11,6 +11,10 @@ "choices": "List[str]" }, "prediction_type": "str", + "augmentable_inputs": [ + "context", + "question" + ], "metrics": [ "metrics.accuracy" ] diff --git a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json index 97895cc35..5bb4cbb29 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/with_context/with_topic.json @@ -12,6 +12,10 @@ "choices": "List[str]" }, "prediction_type": "str", + "augmentable_inputs": [ + "context", + "question" + ], "metrics": [ "metrics.accuracy" ] diff --git 
a/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json b/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json index 24e86e13a..da7184663 100644 --- a/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json +++ b/src/unitxt/catalog/tasks/qa/multiple_choice/with_topic.json @@ -10,6 +10,10 @@ "choices": "List[str]" }, "prediction_type": "str", + "augmentable_inputs": [ + "topic", + "question" + ], "metrics": [ "metrics.accuracy" ] diff --git a/src/unitxt/standard.py b/src/unitxt/standard.py index a8dbe46ba..6982b2c7c 100644 --- a/src/unitxt/standard.py +++ b/src/unitxt/standard.py @@ -3,12 +3,11 @@ from .artifact import fetch_artifact from .augmentors import ( Augmentor, - NullAugmentor, - TaskInputsAugmentor, ) from .card import TaskCard from .collections_operators import GetLength from .dataclass import Field, InternalField, NonPositionalField, OptionalField +from .error_utils import UnitxtError from .formats import Format, SystemFormat from .logging_utils import get_logger from .operator import SequentialOperator, SourceSequentialOperator, StreamingOperator @@ -69,9 +68,7 @@ class BaseRecipe(Recipe, SourceSequentialOperator): demos_field: str = "demos" sampler: Sampler = None - augmentor: Union[Augmentor, List[Augmentor]] = OptionalField( - default_factory=NullAugmentor - ) + augmentor: Union[Augmentor, List[Augmentor]] = OptionalField(default=None) steps: List[StreamingOperator] = InternalField(default_factory=list) @@ -308,11 +305,19 @@ def reset_pipeline(self): self.processing.steps.append(self.task) - if not isinstance(self.augmentor, list): - self.augmentor = [self.augmentor] + if self.augmentor is not None: + if ( + self.card.task.augmentable_inputs is None + or len(self.task.augmentable_inputs) == 0 + ): + raise UnitxtError( + f"You specified augmentor in the recipe but the got task without augmentable_inputs: {self.task}" + ) + + if not isinstance(self.augmentor, list): + self.augmentor = [self.augmentor] - for augmentor in self.augmentor: - if isinstance(augmentor, TaskInputsAugmentor): + for augmentor in self.augmentor: augmentor.set_fields(self.card.task.augmentable_inputs) self.processing.steps.append(augmentor) From b686f95f07c080d4ed790e356e5a3f336cc794aa Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 15:54:05 +0200 Subject: [PATCH 12/20] Change api to provider Signed-off-by: elronbandel --- ...valuate_benchmark_with_custom_provider.py} | 6 ++--- prepare/engines/multi_api/llama3.py | 12 ++-------- .../engines/model/llama_3_8b_instruct.json | 12 ++-------- src/unitxt/inference.py | 24 +++++++++---------- src/unitxt/settings_utils.py | 2 +- 5 files changed, 20 insertions(+), 36 deletions(-) rename examples/{evaluate_benchmark_with_custom_api.py => evaluate_benchmark_with_custom_provider.py} (77%) diff --git a/examples/evaluate_benchmark_with_custom_api.py b/examples/evaluate_benchmark_with_custom_provider.py similarity index 77% rename from examples/evaluate_benchmark_with_custom_api.py rename to examples/evaluate_benchmark_with_custom_provider.py index 379c6f605..371f97f51 100644 --- a/examples/evaluate_benchmark_with_custom_api.py +++ b/examples/evaluate_benchmark_with_custom_provider.py @@ -1,5 +1,5 @@ from unitxt import evaluate, load_dataset -from unitxt.inference import MultiAPIInferenceEngine +from unitxt.inference import CrossProviderModel from unitxt.text_utils import print_dict data = load_dataset( @@ -8,8 +8,8 @@ disable_cache=False, ) -model = MultiAPIInferenceEngine( - model="llama-3-8b-instruct", temperature=0.0, top_p=1.0, 
api="watsonx" +model = CrossProviderModel( + model="llama-3-8b-instruct", temperature=0.0, top_p=1.0, provider="watsonx" ) predictions = model.infer(data) diff --git a/prepare/engines/multi_api/llama3.py b/prepare/engines/multi_api/llama3.py index 8ebaa4adf..8b3ee4494 100644 --- a/prepare/engines/multi_api/llama3.py +++ b/prepare/engines/multi_api/llama3.py @@ -1,16 +1,8 @@ from unitxt.catalog import add_to_catalog -from unitxt.inference import MultiAPIInferenceEngine +from unitxt.inference import CrossProviderModel -engine = MultiAPIInferenceEngine( +engine = CrossProviderModel( model="llama-3-8b-instruct", - api_model_map={ - "watsonx": { - "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", - }, - "together-ai": { - "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" - }, - }, ) add_to_catalog(engine, "engines.model.llama_3_8b_instruct", overwrite=True) diff --git a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json index a6c2be46c..ab9eee536 100644 --- a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json +++ b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json @@ -1,12 +1,4 @@ { - "__type__": "multi_api_inference_engine", - "model": "llama-3-8b-instruct", - "api_model_map": { - "watsonx": { - "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct" - }, - "together-ai": { - "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct" - } - } + "__type__": "cross_provider_model", + "model": "llama-3-8b-instruct" } diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 5acb50d01..15a308c6f 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1669,8 +1669,8 @@ def _infer( _supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama"] -class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): - """Inference engine capable of dynamically switching between multiple APIs. +class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): + """Inference engine capable of dynamically switching between multiple providers APIs. This class extends the InferenceEngine and OpenAiInferenceEngineParamsMixin to enable seamless integration with various API providers. The supported APIs are @@ -1687,9 +1687,9 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): across different API backends. 
""" - api: Optional[_supported_apis] = None + provider: Optional[_supported_apis] = None - api_model_map: Dict[_supported_apis, Dict[str, str]] = { + provider_model_map: Dict[_supported_apis, Dict[str, str]] = { "watsonx": { "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct", @@ -1708,7 +1708,7 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): }, } - _api_to_base_class = { + _provider_to_base_class = { "watsonx": LiteLLMInferenceEngine, "open-ai": LiteLLMInferenceEngine, "together-ai": LiteLLMInferenceEngine, @@ -1716,14 +1716,14 @@ class MultiAPIInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "ollama": OllamaInferenceEngine, } - def get_api_name(self): - return self.api if self.api is not None else settings.default_inference_api + def get_provider_name(self): + return self.provider if self.provider is not None else settings.default_provider def prepare_engine(self): - api = self.get_api_name() - cls = self.__class__._api_to_base_class[api] + provider = self.get_provider_name() + cls = self.__class__._provider_to_base_class[provider] args = self.to_dict([StandardAPIParamsMixin]) - args["model"] = self.api_model_map[api][self.model] + args["model"] = self.provider_model_map[provider][self.model] self.engine = cls(**args) def _infer( @@ -1734,8 +1734,8 @@ def _infer( return self.engine._infer(dataset, return_meta_data) def get_engine_id(self): - api = self.get_api_name() - return get_model_and_label_id(self.api_model_map[api][self.model], api) + api = self.get_provider_name() + return get_model_and_label_id(self.provider_model_map[api][self.model], api) class HFOptionSelectingInferenceEngine(InferenceEngine): diff --git a/src/unitxt/settings_utils.py b/src/unitxt/settings_utils.py index 9a03cf81e..a95cacfe3 100644 --- a/src/unitxt/settings_utils.py +++ b/src/unitxt/settings_utils.py @@ -152,7 +152,7 @@ def __getattr__(self, key): settings.disable_hf_datasets_cache = (bool, True) settings.loader_cache_size = (int, 1) settings.task_data_as_text = (bool, True) - settings.default_inference_api = "watsonx" + settings.default_provider = "watsonx" settings.default_format = None if Constants.is_uninitilized(): From 4c91d5ed3987b976912a05d17f473c5b3801d9ec Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 Nov 2024 16:03:12 +0200 Subject: [PATCH 13/20] Added support for param renaming. Added BAM and improved error messages. 
Signed-off-by: Yoav Katz --- src/unitxt/inference.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 15a308c6f..852d5aa58 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -18,6 +18,7 @@ from .artifact import Artifact, fetch_artifact from .dataclass import InternalField, NonPositionalField from .deprecation_utils import deprecation +from .error_utils import UnitxtError from .image_operators import data_url_to_image, extract_images from .logging_utils import get_logger from .operator import PackageRequirementsMixin @@ -1589,7 +1590,7 @@ class LiteLLMInferenceEngine( max_requests_per_second: float = 6 max_retries: int = 5 # Set to 0 to prevent internal retries - requirements: list = ["litellm", "tenacity", "tqdm", "diskcache"] + _requirements_list: list = ["litellm", "tenacity", "tqdm", "diskcache"] def prepare_engine(self): # Initialize the token bucket rate limiter @@ -1693,6 +1694,7 @@ class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): "watsonx": { "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct", + "granite-3-8b-instruct": "watsonx/ibm/granite-3-8b-instruct", }, "together-ai": { "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct", @@ -1706,6 +1708,10 @@ class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): "llama-3-8b-instruct": "llama3:8b", "llama-3-70b-instruct": "llama3:70b", }, + "bam": { + "granite-3-8b-instruct": "ibm/granite-8b-instruct-preview-4k", + "llama-3-8b-instruct": "meta-llama/llama-3-8b-instruct", + }, } _provider_to_base_class = { @@ -1714,6 +1720,11 @@ class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): "together-ai": LiteLLMInferenceEngine, "aws": LiteLLMInferenceEngine, "ollama": OllamaInferenceEngine, + "bam": IbmGenAiInferenceEngine, + } + + _provider_param_renaming = { + "bam": {"max_tokens": "max_new_tokens", "model": "model_name"} } def get_provider_name(self): @@ -1721,9 +1732,26 @@ def get_provider_name(self): def prepare_engine(self): provider = self.get_provider_name() + if provider not in self._provider_to_base_class: + raise UnitxtError( + f"{provider} a known API. Supported apis: {','.join(self.provider_model_map.keys())}" + ) + if self.model not in self.provider_model_map[api]: + raise UnitxtError( + f"{self.model} is not configured for provider {provider}. 
Supported models: {','.join(self.provider_model_map[api].keys())}" + ) cls = self.__class__._provider_to_base_class[provider] args = self.to_dict([StandardAPIParamsMixin]) - args["model"] = self.provider_model_map[provider][self.model] + args["model"] = self.provider_model_map[provider][self.model] + params = list(args.keys()) + if provider in self._provider_param_renaming: + for param in params: + if args[param] is not None: + if param in self._provider_param_renaming[provider]: + args[self._provider_param_renaming[provider][param]] = args[param] + del args[param] + else: + del args[param] self.engine = cls(**args) def _infer( From eaead52cd366dc5201b2670675074dfaf3655cb6 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 Nov 2024 16:23:40 +0200 Subject: [PATCH 14/20] Fix merge issues Signed-off-by: Yoav Katz --- src/unitxt/inference.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 852d5aa58..5e41946f3 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1736,19 +1736,21 @@ def prepare_engine(self): raise UnitxtError( f"{provider} a known API. Supported apis: {','.join(self.provider_model_map.keys())}" ) - if self.model not in self.provider_model_map[api]: + if self.model not in self.provider_model_map[provider]: raise UnitxtError( - f"{self.model} is not configured for provider {provider}. Supported models: {','.join(self.provider_model_map[api].keys())}" + f"{self.model} is not configured for provider {provider}. Supported models: {','.join(self.provider_model_map[provider].keys())}" ) cls = self.__class__._provider_to_base_class[provider] args = self.to_dict([StandardAPIParamsMixin]) - args["model"] = self.provider_model_map[provider][self.model] + args["model"] = self.provider_model_map[provider][self.model] params = list(args.keys()) if provider in self._provider_param_renaming: for param in params: if args[param] is not None: if param in self._provider_param_renaming[provider]: - args[self._provider_param_renaming[provider][param]] = args[param] + args[self._provider_param_renaming[provider][param]] = args[ + param + ] del args[param] else: del args[param] From 4c5ba45aa65c27fab49f56e443553637ad321565 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Mon, 18 Nov 2024 16:25:35 +0200 Subject: [PATCH 15/20] Updated to CrossProviderModel Signed-off-by: Yoav Katz --- ...luate_batched_multiclass_classification.py | 184 ++++++++++-------- 1 file changed, 98 insertions(+), 86 deletions(-) diff --git a/examples/evaluate_batched_multiclass_classification.py b/examples/evaluate_batched_multiclass_classification.py index 8c3c4382d..d803ebad5 100644 --- a/examples/evaluate_batched_multiclass_classification.py +++ b/examples/evaluate_batched_multiclass_classification.py @@ -28,7 +28,7 @@ class ParseEnumeratedList(FieldOperator): def process_value(self, text: Any) -> Any: result = [] for x in text.split("\n"): - line_result = re.findall(r"(\d+)\.\s*(\w+)", x) + line_result = re.findall(r"(\d+)\.\s*(.*)", x) if len(line_result) == 1: result.append(line_result[0]) return result @@ -63,96 +63,108 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str: template = InputOutputTemplate( input_format="Classify each of the texts to its corresponding {type_of_class} from one of these options:\n{classes}\nReturn for each index the correspond class in a separate line.\nTexts:\n{texts}", - # target_prefix="Answer:\n", + target_prefix="Answer:\n", output_format="{labels}", - 
postprocessors=[PostProcess(ParseEnumeratedList())], + postprocessors=["processors.lower_case", PostProcess(ParseEnumeratedList())], serializer=MultiTypeSerializer(serializers=[EnumeratedListSerializer()]), ) df = pd.DataFrame( - columns=["model", "batch_size", "num_instances", "f1_micro", "ci_low", "ci_high"] + columns=[ + "provider", + "model", + "batch_size", + "num_instances", + "f1_micro", + "ci_low", + "ci_high", + "hellucinations", + ] ) -for model_name in [ - "ibm/granite-3-8b-instruct", - "meta-llama/llama-3-8b-instruct", +for provider in [ + "watsonx", + "bam", ]: - if model_name.startswith("ibm"): - format = SystemFormat( - demo_format=( - "{instruction}\\N{source}\\N<|end_of_text|>\n" - "<|start_of_role|>assistant<|end_of_role|>{target}\\N<|end_of_text|>\n" - "<|start_of_role|>user<|end_of_role|>" - ), - model_input_format=( - "<|start_of_role|>system<|end_of_role|>{system_prompt}<|end_of_text|>\n" - "<|start_of_role|>user<|end_of_role|>{demos}{instruction}\\N{source}\\N<|end_of_text|>\n" - "<|start_of_role|>assistant<|end_of_role|>" - ), - ) - batch_sizes = [50, 30, 10, 1] - - if model_name.startswith("meta-llama"): - format = "formats.llama3_instruct" - batch_sizes = [100, 50, 10, 1] - - for batch_size in batch_sizes: - card, _ = fetch_artifact("cards.sst2") - card.preprocess_steps.extend( - [ - CollateInstances(batch_size=batch_size), - Rename(field_to_field={"text": "texts", "label": "labels"}), - Copy(field="text_type/0", to_field="text_type"), - Copy(field="classes/0", to_field="classes"), - Copy(field="type_of_class/0", to_field="type_of_class"), + for model_name in [ + "granite-3-8b-instruct", + "llama-3-8b-instruct", + ]: + batch_sizes = [30, 20, 10, 5, 1] + + for batch_size in batch_sizes: + card, _ = fetch_artifact("cards.banking77") + card.preprocess_steps.extend( + [ + CollateInstances(batch_size=batch_size), + Rename(field_to_field={"text": "texts", "label": "labels"}), + Copy(field="text_type/0", to_field="text_type"), + Copy(field="classes/0", to_field="classes"), + Copy(field="type_of_class/0", to_field="type_of_class"), + ] + ) + card.task = task + card.templates = [template] + format = "formats.chat_api" + if provider == "bam" and model_name.startswith("llama"): + format = "formats.llama3_instruct" + if provider == "bam" and model_name.startswith("granite"): + format = SystemFormat( + demo_format=( + "{instruction}\\N{source}\\N<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>{target}\\N<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>" + ), + model_input_format=( + "<|start_of_role|>system<|end_of_role|>{system_prompt}<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>{demos}{instruction}\\N{source}\\N<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>" + ), + ) + + dataset = load_dataset( + card=card, + template_card_index=0, + format=format, + num_demos=1, + demos_pool_size=5, + loader_limit=1000, + max_test_instances=200 / batch_size, + ) + + test_dataset = dataset["test"] + from unitxt.inference import CrossProviderModel + + inference_model = CrossProviderModel( + model=model_name, max_tokens=1024, provider=provider + ) + predictions = inference_model.infer(test_dataset) + + evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + # import pandas as pd + # result_df = pd.json_normalize(evaluated_dataset) + # result_df.to_csv(f"output.csv") + # Print results + print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "processed_prediction", + "processed_references", + ], 
+ ) + + global_scores = evaluated_dataset[0]["score"]["global"] + df.loc[len(df)] = [ + provider, + model_name, + batch_size, + global_scores["num_of_instances"], + global_scores["score"], + global_scores["score_ci_low"], + global_scores["score_ci_high"], + 1.0 - global_scores["in_classes_support"], ] - ) - card.task = task - card.templates = [template] - - dataset = load_dataset( - card=card, - template_card_index=0, - format=format, - num_demos=1, - demos_pool_size=5, - loader_limit=10000, - max_test_instances=1000 / batch_size, - ) - - test_dataset = dataset["test"] - - # inference_model = IbmGenAiInferenceEngine( - # model_name=model_name, max_new_tokens=1024 - # ) - - from unitxt.inference import WMLInferenceEngine - - inference_model = WMLInferenceEngine(model_name=model_name, max_new_tokens=1024) - - predictions = inference_model.infer(test_dataset) - - evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) - - # Print results - print_dict( - evaluated_dataset[0], - keys_to_print=[ - "source", - "prediction", - "processed_prediction", - "processed_references", - ], - ) - - global_scores = evaluated_dataset[0]["score"]["global"] - df.loc[len(df)] = [ - model_name, - batch_size, - global_scores["num_of_instances"], - global_scores["score"], - global_scores["score_ci_low"], - global_scores["score_ci_high"], - ] - - df = df.round(decimals=2) - logger.info(df.to_markdown()) + + df = df.round(decimals=2) + logger.info(df.to_markdown()) From 00dbd304ee38a691bec91aaaf888d504813d45cb Mon Sep 17 00:00:00 2001 From: elronbandel Date: Mon, 18 Nov 2024 18:16:33 +0200 Subject: [PATCH 16/20] Update name back to InferenceEngine terminology Signed-off-by: elronbandel --- examples/evaluate_batched_multiclass_classification.py | 4 ++-- examples/evaluate_benchmark_with_custom_provider.py | 4 ++-- prepare/engines/multi_api/llama3.py | 4 ++-- src/unitxt/catalog/engines/model/llama_3_8b_instruct.json | 2 +- src/unitxt/inference.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/evaluate_batched_multiclass_classification.py b/examples/evaluate_batched_multiclass_classification.py index d803ebad5..924796594 100644 --- a/examples/evaluate_batched_multiclass_classification.py +++ b/examples/evaluate_batched_multiclass_classification.py @@ -132,9 +132,9 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str: ) test_dataset = dataset["test"] - from unitxt.inference import CrossProviderModel + from unitxt.inference import CrossProviderInferenceEngine - inference_model = CrossProviderModel( + inference_model = CrossProviderInferenceEngine( model=model_name, max_tokens=1024, provider=provider ) predictions = inference_model.infer(test_dataset) diff --git a/examples/evaluate_benchmark_with_custom_provider.py b/examples/evaluate_benchmark_with_custom_provider.py index 371f97f51..6e3e45f57 100644 --- a/examples/evaluate_benchmark_with_custom_provider.py +++ b/examples/evaluate_benchmark_with_custom_provider.py @@ -1,5 +1,5 @@ from unitxt import evaluate, load_dataset -from unitxt.inference import CrossProviderModel +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict data = load_dataset( @@ -8,7 +8,7 @@ disable_cache=False, ) -model = CrossProviderModel( +model = CrossProviderInferenceEngine( model="llama-3-8b-instruct", temperature=0.0, top_p=1.0, provider="watsonx" ) diff --git a/prepare/engines/multi_api/llama3.py b/prepare/engines/multi_api/llama3.py index 8b3ee4494..8a1272bad 100644 --- 
a/prepare/engines/multi_api/llama3.py +++ b/prepare/engines/multi_api/llama3.py @@ -1,7 +1,7 @@ from unitxt.catalog import add_to_catalog -from unitxt.inference import CrossProviderModel +from unitxt.inference import CrossProviderInferenceEngine -engine = CrossProviderModel( +engine = CrossProviderInferenceEngine( model="llama-3-8b-instruct", ) diff --git a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json index ab9eee536..ac8e9eac3 100644 --- a/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json +++ b/src/unitxt/catalog/engines/model/llama_3_8b_instruct.json @@ -1,4 +1,4 @@ { - "__type__": "cross_provider_model", + "__type__": "cross_provider_inference_engine", "model": "llama-3-8b-instruct" } diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 5e41946f3..e3e5f10d1 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1667,10 +1667,10 @@ def _infer( return [response.prediction for response in responses] -_supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama"] +_supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] -class CrossProviderModel(InferenceEngine, StandardAPIParamsMixin): +class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): """Inference engine capable of dynamically switching between multiple providers APIs. This class extends the InferenceEngine and OpenAiInferenceEngineParamsMixin From a0373f8f5392760486db917ef86924e85e3f4b96 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 10:48:41 +0200 Subject: [PATCH 17/20] Align all examples with chat api and cross provider engines Signed-off-by: elronbandel --- docs/docs/examples.rst | 9 +-- docs/docs/inference.rst | 0 ..._judge_model_capabilities_on_arena_hard.py | 30 ++++---- examples/evaluate_a_model_using_arena_hard.py | 32 ++++---- ...luate_batched_multiclass_classification.py | 7 ++ examples/evaluate_benchmark.py | 15 +++- ...evaluate_benchmark_with_custom_provider.py | 7 ++ examples/evaluate_bluebench.py | 14 +++- .../evaluate_different_demo_selections.py | 17 +++-- examples/evaluate_different_formats.py | 15 +++- examples/evaluate_different_templates.py | 25 ++++--- ...aluate_existing_dataset_by_llm_as_judge.py | 70 ++++++++++-------- .../evaluate_existing_dataset_with_install.py | 45 ++++------- examples/evaluate_image_text_to_text.py | 4 +- ..._image_text_to_text_lmms_eval_inference.py | 1 + examples/evaluate_rag_response_generation.py | 23 ++++-- ...uate_summarization_dataset_llm_as_judge.py | 74 ++++++++++--------- examples/evaluate_using_metrics_ensemble.py | 19 +++-- examples/qa_evaluation.py | 33 +++++---- examples/run_generic_inference_engine.py | 52 ------------- .../standalone_evaluation_llm_as_judge.py | 39 +++++----- examples/standalone_qa_evaluation.py | 42 ++++------- .../llama_3_arena_hard_template.py | 23 +++++- .../llama_3_ibm_genai_generic_template.py | 68 +++++++++++------ prepare/recipes/bluebench.py | 2 +- .../{watsonx => }/template_arena_hard.json | 4 +- .../template_arena_hard.json | 12 +++ .../generic_single_turn.json | 13 ++++ .../generic_single_turn_with_reference.json | 13 ++++ src/unitxt/inference.py | 40 ++++++---- tests/examples/test_examples.py | 14 ++-- tests/inference/test_inference_engine.py | 20 +++++ 32 files changed, 444 insertions(+), 338 deletions(-) create mode 100644 docs/docs/inference.rst delete mode 100644 examples/run_generic_inference_engine.py rename 
src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/{watsonx => }/template_arena_hard.json (75%) create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct/template_arena_hard.json create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn.json create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn_with_reference.json diff --git a/docs/docs/examples.rst b/docs/docs/examples.rst index b935ecff1..35267d34d 100644 --- a/docs/docs/examples.rst +++ b/docs/docs/examples.rst @@ -1,6 +1,6 @@ .. _examples: ============== -Examples +Examples ============== Here you will find complete coding samples showing how to perform different tasks using Unitxt. @@ -97,16 +97,16 @@ Related documentation: :ref:`Templates tutorial `, :ref:`Format Long Context +++++++++++++++++++++++++++++ -This example explores the effect of long context in classification. +This example explores the effect of long context in classification. It converts a standard multi class classification dataset (sst2 sentiment classification), where single sentence texts are classified one by one, to a dataset -where multiple sentences are classified using a single LLM call. +where multiple sentences are classified using a single LLM call. It compares the f1_micro in both approaches on two models. It uses serializers to verbalize and enumerated list of multiple sentences and labels. `Example code `_ -Related documentation: :ref:`Sst2 dataset card in catalog ` :ref:`Types and Serializers Guide `. +Related documentation: :ref:`Sst2 dataset card in catalog ` :ref:`Types and Serializers Guide `. Construct a benchmark of multiple datasets and obtain the final score +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -265,4 +265,3 @@ This example show how to define new data types as well as the way these data typ Related documentation: :ref:`Types and Serializers Guide `. - diff --git a/docs/docs/inference.rst b/docs/docs/inference.rst new file mode 100644 index 000000000..e69de29bb diff --git a/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py b/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py index c8d91f9fa..4b608596a 100644 --- a/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py +++ b/examples/evaluate_a_judge_model_capabilities_on_arena_hard.py @@ -1,32 +1,28 @@ from unitxt import evaluate, load_dataset -from unitxt.inference import MockInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict -model_id = "meta-llama/llama-3-70b-instruct" -model_format = "formats.llama3_instruct" - """ -We are evaluating only on a small subset (by using "select(range(4)), in order for the example to finish quickly. +We are evaluating only on a small subset (by using `max_test_instances=4`), in order for the example to finish quickly. The dataset full size if around 40k examples. You should use around 1k-4k in your evaluations. 
""" dataset = load_dataset( card="cards.arena_hard.response_assessment.pairwise_comparative_rating.both_games_gpt_4_judge", template="templates.response_assessment.pairwise_comparative_rating.arena_hard_with_shuffling", - format=model_format, -)["test"].select(range(4)) + format="formats.chat_api", + max_test_instances=4, + split="test", +) -inference_model = MockInferenceEngine(model_name=model_id) +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" +) """ -We are using a mock inference engine (and model) in order for the example to finish quickly. -In real scenarios you can use model from Huggingface, OpenAi, and IBM, using the following: -from unitxt.inference import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine) -and switch them with the MockInferenceEngine class in the example. -For the arguments these inference engines can receive, please refer to the classes documentation. +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. -Example of using an IBM model: -from unitxt.inference import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin) -params = IbmGenAiInferenceEngineParamsMixin(max_new_tokens=1024, random_seed=42) -inference_model = IbmGenAiInferenceEngine(model_name=model_id, parameters=params) +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. """ predictions = inference_model.infer(dataset) diff --git a/examples/evaluate_a_model_using_arena_hard.py b/examples/evaluate_a_model_using_arena_hard.py index ce42fc38f..ad4410d29 100644 --- a/examples/evaluate_a_model_using_arena_hard.py +++ b/examples/evaluate_a_model_using_arena_hard.py @@ -1,35 +1,31 @@ from unitxt import evaluate, load_dataset -from unitxt.inference import MockInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict -model_id = "meta-llama/llama-3-70b-instruct" -model_format = "formats.llama3_instruct" - """ -We are evaluating only on a small subset (by using "select(range(4)), in order for the example to finish quickly. +We are evaluating only on a small subset (by using `max_test_instances=4`), in order for the example to finish quickly. The dataset full size if around 40k examples. You should use around 1k-4k in your evaluations. """ dataset = load_dataset( card="cards.arena_hard.generation.english_gpt_4_0314_reference", template="templates.empty", - format=model_format, + format="formats.chat_api", metrics=[ - "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_8b_instruct_ibm_genai_template_arena_hard_with_shuffling" + "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_8b_instruct.template_arena_hard" ], -)["test"].select(range(4)) + max_test_instances=4, + split="test", +) -inference_model = MockInferenceEngine(model_name=model_id) +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" +) """ -We are using a mock inference engine (and model) in order for the example to finish quickly. -In real scenarios you can use model from Huggingface, OpenAi, and IBM, using the following: -from unitxt.inference import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine) -and switch them with the MockInferenceEngine class in the example. 
-For the arguments these inference engines can receive, please refer to the classes documentation.
+We are using CrossProviderInferenceEngine, an inference engine that supplies API access to providers such as:
+watsonx, bam, openai, azure, aws and more.
 
-Example of using an IBM model:
-from unitxt.inference import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin)
-params = IbmGenAiInferenceEngineParamsMixin(max_new_tokens=1024, random_seed=42)
-inference_model = IbmGenAiInferenceEngine(model_name=model_id, parameters=params)
+For the arguments these inference engines can receive, please refer to the class documentation or read
+about the OpenAI API arguments that the CrossProviderInferenceEngine follows.
 """
 
 predictions = inference_model.infer(dataset)
diff --git a/examples/evaluate_batched_multiclass_classification.py b/examples/evaluate_batched_multiclass_classification.py
index 924796594..2a6540dc6 100644
--- a/examples/evaluate_batched_multiclass_classification.py
+++ b/examples/evaluate_batched_multiclass_classification.py
@@ -137,6 +137,13 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str:
             inference_model = CrossProviderInferenceEngine(
                 model=model_name, max_tokens=1024, provider=provider
             )
+            """
+            We are using CrossProviderInferenceEngine, an inference engine that supplies API access to providers such as:
+            watsonx, bam, openai, azure, aws and more.
+
+            For the arguments these inference engines can receive, please refer to the class documentation or read
+            about the OpenAI API arguments that the CrossProviderInferenceEngine follows.
+            """
             predictions = inference_model.infer(test_dataset)
 
             evaluated_dataset = evaluate(predictions=predictions, data=test_dataset)
diff --git a/examples/evaluate_benchmark.py b/examples/evaluate_benchmark.py
index e92b7e309..fd0990617 100644
--- a/examples/evaluate_benchmark.py
+++ b/examples/evaluate_benchmark.py
@@ -1,7 +1,7 @@
 from unitxt.api import evaluate
 from unitxt.benchmark import Benchmark
 from unitxt.inference import (
-    HFPipelineBasedInferenceEngine,
+    CrossProviderInferenceEngine,
 )
 from unitxt.standard import StandardRecipe
 from unitxt.text_utils import print_dict
@@ -47,10 +47,17 @@
 
 test_dataset = list(benchmark()["test"])
 
-# Infere using flan t5 base using HF API
-inference_model = HFPipelineBasedInferenceEngine(
-    model_name="google/flan-t5-base", max_new_tokens=32
+# Infer using llama-3-2-1b-instruct via the Watsonx API
+inference_model = CrossProviderInferenceEngine(
+    model="llama-3-2-1b-instruct", provider="watsonx"
 )
+"""
+We are using CrossProviderInferenceEngine, an inference engine that supplies API access to providers such as:
+watsonx, bam, openai, azure, aws and more.
+
+For the arguments these inference engines can receive, please refer to the class documentation or read
+about the OpenAI API arguments that the CrossProviderInferenceEngine follows.
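+
+For illustration only (a sketch, not needed for this benchmark run), the engine accepts those
+OpenAI-style generation arguments directly, e.g. with arbitrary illustrative values:
+
+    CrossProviderInferenceEngine(
+        model="llama-3-2-1b-instruct",
+        provider="watsonx",
+        max_tokens=32,
+        temperature=0.0,
+        top_p=1.0,
+        seed=42,
+    )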
+""" predictions = inference_model.infer(test_dataset) evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) diff --git a/examples/evaluate_benchmark_with_custom_provider.py b/examples/evaluate_benchmark_with_custom_provider.py index 6e3e45f57..d1b570b1c 100644 --- a/examples/evaluate_benchmark_with_custom_provider.py +++ b/examples/evaluate_benchmark_with_custom_provider.py @@ -11,6 +11,13 @@ model = CrossProviderInferenceEngine( model="llama-3-8b-instruct", temperature=0.0, top_p=1.0, provider="watsonx" ) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. + +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. +""" predictions = model.infer(data) diff --git a/examples/evaluate_bluebench.py b/examples/evaluate_bluebench.py index 2268224c6..27efaaf9c 100644 --- a/examples/evaluate_bluebench.py +++ b/examples/evaluate_bluebench.py @@ -1,6 +1,6 @@ from unitxt import evaluate, load_dataset, settings from unitxt.inference import ( - LiteLLMInferenceEngine, + CrossProviderInferenceEngine, ) from unitxt.text_utils import print_dict @@ -11,11 +11,17 @@ test_dataset = load_dataset("benchmarks.bluebench", split="test") # Infer -inference_model = LiteLLMInferenceEngine( - model="watsonx/meta-llama/llama-3-8b-instruct", +inference_model = CrossProviderInferenceEngine( + model="llama-3-8b-instruct", max_tokens=30, - max_requests_per_second=6, ) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. + +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. +""" predictions = inference_model.infer(test_dataset) evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) diff --git a/examples/evaluate_different_demo_selections.py b/examples/evaluate_different_demo_selections.py index d494b89f4..7c3c948e0 100644 --- a/examples/evaluate_different_demo_selections.py +++ b/examples/evaluate_different_demo_selections.py @@ -1,7 +1,7 @@ import pandas as pd from unitxt import get_logger from unitxt.api import evaluate, load_dataset -from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.splitters import CloseTextSampler, FixedIndicesSampler, RandomSampler from unitxt.text_utils import print_dict @@ -13,10 +13,16 @@ # CloseTextSampler - select the lexically closest amples from the demo pool for each test instance # FixedIndicesSampler - selec the same fixed set of demo examples for all instances -card = "cards.ledgar" -model_name = "google/flan-t5-xxl" -inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", max_tokens=32 +) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. 
+""" df = pd.DataFrame(columns=["num_demos", "sampler", "f1_micro", "ci_low", "ci_high"]) @@ -27,8 +33,9 @@ FixedIndicesSampler(indices=[0, 1]), ]: dataset = load_dataset( - card=card, + card="cards.ledgar", template="templates.classification.multi_class.title", + format="formats.chat_api", num_demos=num_demos, demos_pool_size=50, loader_limit=200, diff --git a/examples/evaluate_different_formats.py b/examples/evaluate_different_formats.py index f650e25c1..7605efdf7 100644 --- a/examples/evaluate_different_formats.py +++ b/examples/evaluate_different_formats.py @@ -1,14 +1,23 @@ import pandas as pd from unitxt import get_logger from unitxt.api import evaluate, load_dataset -from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict logger = get_logger() -model_name = "meta-llama/llama-3-8b-instruct" -inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) +inference_model = CrossProviderInferenceEngine( + model="llama-3-8b-instruct", max_tokens=32, provider="bam" +) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. + +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. +""" + card = "cards.boolq.classification" template = "templates.classification.multi_class.relation.default" diff --git a/examples/evaluate_different_templates.py b/examples/evaluate_different_templates.py index 15a0d8415..0ece537a5 100644 --- a/examples/evaluate_different_templates.py +++ b/examples/evaluate_different_templates.py @@ -4,7 +4,7 @@ import pandas as pd from unitxt import add_to_catalog, get_logger, register_local_catalog from unitxt.api import evaluate, load_dataset -from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.templates import InputOutputTemplate from unitxt.text_utils import print_dict @@ -58,10 +58,16 @@ def create_path_and_register_as_local_catalog(path): ) # Run inference on mnli (entailment task) on the two templates with both 0 and 3 shot in context learning. -card = "cards.mnli" -model_name = "google/flan-t5-xxl" -inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", max_tokens=32 +) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. 
+""" df = pd.DataFrame(columns=["template", "num_demos", "f1_micro", "ci_low", "ci_high"]) @@ -71,18 +77,19 @@ def create_path_and_register_as_local_catalog(path): ]: for num_demos in [0, 3]: dataset = load_dataset( - card=card, + card="cards.mnli", template=template, + format="formats.chat_api", num_demos=num_demos, demos_pool_size=100, loader_limit=500, - max_test_instances=300, + max_test_instances=10, + split="test", ) - test_dataset = dataset["test"] + predictions = inference_model.infer(dataset) - predictions = inference_model.infer(test_dataset) - evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + evaluated_dataset = evaluate(predictions=predictions, data=dataset) logger.info( f"Sample input and output for template '{template}' and num_demos '{num_demos}':" diff --git a/examples/evaluate_existing_dataset_by_llm_as_judge.py b/examples/evaluate_existing_dataset_by_llm_as_judge.py index 78e4d8dd2..e1295940f 100644 --- a/examples/evaluate_existing_dataset_by_llm_as_judge.py +++ b/examples/evaluate_existing_dataset_by_llm_as_judge.py @@ -1,43 +1,51 @@ from unitxt import get_logger, get_settings, load_dataset from unitxt.api import evaluate from unitxt.inference import ( - HFPipelineBasedInferenceEngine, + CrossProviderInferenceEngine, ) from unitxt.text_utils import print_dict logger = get_logger() settings = get_settings() -settings.allow_unverified_code = True -# Use the HF load_dataset API, to load the squad QA dataset using the standard template in the catalog. -# We set loader_limit to 20 to reduce download time. -dataset = load_dataset( - card="cards.squad", - template="templates.qa.with_context.simple", - metrics=[ - "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn" - ], - loader_limit=20, -) -test_dataset = dataset["test"] +with settings.context(allow_unverified_code=True): + # Use the HF load_dataset API, to load the squad QA dataset using the standard template in the catalog. + # We set loader_limit to 20 to reduce download time. + dataset = load_dataset( + card="cards.squad", + template="templates.qa.with_context.simple", + format="formats.chat_api", + metrics=[ + "metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn" + ], + loader_limit=20, + max_test_instances=20, + split="test", + ) -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 -) -predictions = inference_model.infer(test_dataset) + # Infer a model to get predictions. + inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" + ) + """ + We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: + watsonx, bam, openai, azure, aws and more. -# Evaluate the predictions using the defined metric. -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + For the arguments these inference engines can receive, please refer to the classes documentation or read + about the the open ai api arguments the CrossProviderInferenceEngine follows. + """ + predictions = inference_model.infer(dataset) -print_dict( - evaluated_dataset[0], - keys_to_print=[ - "source", - "prediction", - "processed_prediction", - "references", - "score", - ], -) + # Evaluate the predictions using the defined metric. 
+ evaluated_dataset = evaluate(predictions=predictions, data=dataset) + + print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "processed_prediction", + "references", + "score", + ], + ) diff --git a/examples/evaluate_existing_dataset_with_install.py b/examples/evaluate_existing_dataset_with_install.py index 74389fdb2..5ee72ccae 100644 --- a/examples/evaluate_existing_dataset_with_install.py +++ b/examples/evaluate_existing_dataset_with_install.py @@ -1,49 +1,34 @@ from unitxt.api import evaluate, load_dataset -from unitxt.inference import HFPipelineBasedInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine from unitxt.text_utils import print_dict # Use the Unitxt APIs to load the wnli entailment dataset using the standard template in the catalog for relation task with 2-shot in-context learning. # We set loader_limit to 20 to limit reduce inference time. dataset = load_dataset( card="cards.wnli", + system_prompt="system_prompts.be_concise", template="templates.classification.multi_class.relation.default", + format="formats.chat_api", num_demos=2, demos_pool_size=10, loader_limit=20, + split="test", ) - -test_dataset = dataset["test"] - -# Infer using flan t5 base using HF API, can be replaced with any -# inference code. -# -# change to this to infer with IbmGenAI APIs: -# -# from unitxt.inference import IbmGenAiInferenceEngine -# inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or this to infer using WML APIs: -# -# from unitxt.inference import WMLInferenceEngine -# inference_model = WMLInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or to this to infer using OpenAI APIs: -# -# from unitxt.inference import OpenAiInferenceEngine -# inference_model = OpenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# Note that to run with OpenAI APIs you need to change the loader specification, to -# define that your data can be sent to a public API: -# # loader=LoadFromDictionary(data=data,data_classification_policy=["public"]), -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +inference_model = CrossProviderInferenceEngine( + model="llama-3-2-1b-instruct", provider="watsonx" ) -predictions = inference_model.infer(test_dataset) +""" +We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as: +watsonx, bam, openai, azure, aws and more. + +For the arguments these inference engines can receive, please refer to the classes documentation or read +about the the open ai api arguments the CrossProviderInferenceEngine follows. 
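+
+A sketch of obtaining generation metadata instead of plain strings (assuming infer() exposes
+the same return_meta_data flag as the engines' _infer methods):
+
+    outputs = inference_model.infer(dataset, return_meta_data=True)
+    first_prediction = outputs[0].prediction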
+""" +predictions = inference_model.infer(dataset) -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results print_dict( diff --git a/examples/evaluate_image_text_to_text.py b/examples/evaluate_image_text_to_text.py index 1edbe02e6..6d8a209f9 100644 --- a/examples/evaluate_image_text_to_text.py +++ b/examples/evaluate_image_text_to_text.py @@ -15,11 +15,11 @@ split="test", ) - inference_model = HFLlavaInferenceEngine( + engine = HFLlavaInferenceEngine( model_name="llava-hf/llava-interleave-qwen-0.5b-hf", max_new_tokens=32 ) - predictions = inference_model.infer(dataset) + predictions = engine.infer(dataset) evaluated_dataset = evaluate(predictions=predictions, data=dataset) print_dict( diff --git a/examples/evaluate_image_text_to_text_lmms_eval_inference.py b/examples/evaluate_image_text_to_text_lmms_eval_inference.py index 2d2ce4be4..458a6c471 100644 --- a/examples/evaluate_image_text_to_text_lmms_eval_inference.py +++ b/examples/evaluate_image_text_to_text_lmms_eval_inference.py @@ -17,6 +17,7 @@ dataset = load_dataset( card="cards.seed_bench", template="templates.qa.multiple_choice.with_context.lmms_eval", + format="formats.chat_api", loader_limit=30, split="test", ) diff --git a/examples/evaluate_rag_response_generation.py b/examples/evaluate_rag_response_generation.py index 6b74e1cd1..dd9e9cb49 100644 --- a/examples/evaluate_rag_response_generation.py +++ b/examples/evaluate_rag_response_generation.py @@ -60,18 +60,25 @@ ) # Verbalize the dataset using the template -dataset = load_dataset(card=card, template_card_index="simple") -test_dataset = dataset["test"] +dataset = load_dataset( + card=card, + template_card_index="simple", + format="formats.chat_api", + split="test", + max_test_instances=10, +) -# Infere using flan t5 base using HF API -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] -predictions = inference_model.infer(test_dataset) -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +predictions = engine.infer(dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/examples/evaluate_summarization_dataset_llm_as_judge.py b/examples/evaluate_summarization_dataset_llm_as_judge.py index ab2b6545c..305966c73 100644 --- a/examples/evaluate_summarization_dataset_llm_as_judge.py +++ b/examples/evaluate_summarization_dataset_llm_as_judge.py @@ -1,6 +1,7 @@ from unitxt import get_logger from unitxt.api import evaluate, load_dataset from unitxt.inference import ( + CrossProviderInferenceEngine, HFPipelineBasedInferenceEngine, ) from unitxt.llm_as_judge import LLMAsJudge @@ -8,15 +9,18 @@ from unitxt.text_utils import print_dict logger = get_logger() + # First, we define the judge template. 
judge_summary_rating_template = InputOutputTemplate( - instruction="Please act as an impartial judge and evaluate if the assistant's summary summarise well the given text.\n" - 'You must respond according the following format: "[[rate]] - explanation".\n' - 'Were the rate is a score between 0 to 10 (10 for great summary, 0 for a very poor one)".\n' - "The explanation describe shortly why you decided to give the rank you chosen.\n" - "Please make sure to start with your rank ([[rank]]) before anything else.\n" - "For example: [[9]] The summary catches the main text ideas." - ".\n\n", + instruction=( + "Please act as an impartial judge and evaluate if the assistant's summary summarise well the given text.\n" + 'You must respond according the following format: "[[rate]] - explanation".\n' + 'Were the rate is a score between 0 to 10 (10 for great summary, 0 for a very poor one)".\n' + "The explanation describe shortly why you decided to give the rank you chosen.\n" + "Please make sure to start with your rank ([[rank]]) before anything else.\n" + "For example: [[9]] The summary catches the main text ideas." + ".\n\n" + ), input_format="[Text:\n{question}\n\n" "Assistant's summary:\n{answer}\n", output_format="[[{rating}]]", postprocessors=[ @@ -24,24 +28,19 @@ ], ) -# Second, we define the inference engine we use for judge, with the preferred model and platform. -platform = "hf" -model_name = "google/flan-t5-large" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=256, use_fp16=True +# Second, we define the inference engine we use for judge, with the preferred model and provider. +# You can change the provider to any of: "watsonx", "together-ai", "open-ai", "aws", "ollama", "bam" +inference_model = CrossProviderInferenceEngine( + model="llama-3-8b-instruct", provider="watsonx" ) -# change to this to infer with IbmGenAI APIs: -# -# platform = 'ibm_gen_ai' -# model_name = 'meta-llama/llama-3-70b-instruct' -# inference_model = IbmGenAiInferenceEngine(model_name="meta-llama/llama-3-70b-instruct", max_new_tokens=512) # Third, We define the metric as LLM as a judge, with the desired platform and model. llm_judge_metric = LLMAsJudge( inference_model=inference_model, template=judge_summary_rating_template, + format="formats.chat_api", task="rating.single_turn", - main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}", + main_score="llm_judge_llama_3_8b", strip_system_prompt_and_format_from_inputs=False, ) @@ -51,19 +50,21 @@ template="templates.summarization.abstractive.formal", metrics=[llm_judge_metric], loader_limit=5, + split="test", ) -test_dataset = dataset["test"] - -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -predictions = inference_model.infer(test_dataset) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] + +predictions = engine.infer(dataset) # Evaluate the predictions using the defined metric. 
-evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results print_dict( @@ -106,7 +107,7 @@ inference_model=inference_model, template=judge_summary_rating_with_reference_template, task="rating.single_turn_with_reference", - main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}", + main_score="llm_judge_llama_3_2_1b_hf", single_reference_per_prediction=True, strip_system_prompt_and_format_from_inputs=False, ) @@ -115,21 +116,24 @@ dataset = load_dataset( card="cards.xsum", template="templates.summarization.abstractive.formal", + format="formats.chat_api", metrics=[llm_judge_with_summary_metric], loader_limit=5, + split="test", ) -test_dataset = dataset["test"] - -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -predictions = inference_model.infer(test_dataset) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] + +predictions = engine.infer(dataset) # Evaluate the predictions using the defined metric. -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results print_dict( diff --git a/examples/evaluate_using_metrics_ensemble.py b/examples/evaluate_using_metrics_ensemble.py index ee99ec8de..2422824f2 100644 --- a/examples/evaluate_using_metrics_ensemble.py +++ b/examples/evaluate_using_metrics_ensemble.py @@ -21,20 +21,25 @@ dataset = load_dataset( card="cards.squad", template="templates.qa.with_context.simple", + format="formats.chat_api", metrics=[ensemble_metric], loader_limit=20, + max_test_instances=10, + split="test", ) -test_dataset = dataset["test"] -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -predictions = inference_model.infer(test_dataset) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] + +predictions = engine.infer(dataset) # Evaluate the predictions using the defined metric. -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/examples/qa_evaluation.py b/examples/qa_evaluation.py index fb8bbfff7..4771036aa 100644 --- a/examples/qa_evaluation.py +++ b/examples/qa_evaluation.py @@ -39,25 +39,26 @@ # What is the color of the sky? 
# Answer: # " -dataset = load_dataset(card=card, template="templates.qa.open.title") -test_dataset = dataset["test"] +dataset = load_dataset( + card=card, + template="templates.qa.open.title", + format="formats.chat_api", + split="test", + max_test_instances=5, +) -# Infer using flan t5 base using HF API -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -# change to this to infer with IbmGenAI APIs: -# -# inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or to this to infer using OpenAI APIs: -# -# inference_model = OpenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -predictions = inference_model.infer(test_dataset) -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] + + +predictions = engine.infer(dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/examples/run_generic_inference_engine.py b/examples/run_generic_inference_engine.py deleted file mode 100644 index b234e6467..000000000 --- a/examples/run_generic_inference_engine.py +++ /dev/null @@ -1,52 +0,0 @@ -from unitxt import get_logger, produce # Import necessary functions from unitxt -from unitxt.inference import GenericInferenceEngine # Import the inference engine class - -if __name__ == "__main__": - # Create an instance of the GenericInferenceEngine with a default engine. - # This means if no engine is specified during inference, it will default to this one. - generic_engine_with_default = GenericInferenceEngine( - default="engines.ibm_gen_ai.llama_3_70b_instruct" - ) - - # Define the recipe for data processing and model selection. - # - card: Specifies the underlying data (from cards.almost_evil). - # - template: Selects the specific template within the card (from templates.qa.open.simple). - # - demos_pool_size and num_demos: Control the number of demonstration examples used (set to 0 here). - recipe = "card=cards.almost_evil,template=templates.qa.open.simple,demos_pool_size=0,num_demos=0" - - # Create a list of instances (data points) for inference. - # Each instance has a "question" and its corresponding "answers". - instances = [ - { - "question": "How many days there are in a week, answer only with numerals", - "answers": ["7"], - }, - { - "question": "If a ate an apple in the morning, and one in the evening, what is the number of apples I have eaten?, answer only with numerals", - "answers": ["2"], - }, - ] - - # Process the instances using the defined recipe. - # This likely formats the data according to the chosen card and template. - dataset = produce(instances, recipe) - - # Perform inference on the processed dataset using the engine with the default model. - predictions = generic_engine_with_default.infer(dataset) - get_logger().info(predictions) # Log the predictions - - # The following code block demonstrates how to use the GenericInferenceEngine without specifying a - # default engine. It expects the engine to be defined in the UNITXT_INFERENCE_ENGINE environment variable. 
- try: - # Attempt to create an instance without a default engine. - generic_engine_without_default = GenericInferenceEngine() - - # Perform inference (will use the engine specified in the environment variable). - predictions = generic_engine_without_default.infer(dataset) - get_logger().info(predictions) # Log the predictions - except: - # Handle the case where the environment variable is not set. - get_logger().error( - "GenericInferenceEngine could not be initialized without a default since " - "UNITXT_INFERENCE_ENGINE environmental variable is not set." - ) diff --git a/examples/standalone_evaluation_llm_as_judge.py b/examples/standalone_evaluation_llm_as_judge.py index 1561d4d29..18b91b9b7 100644 --- a/examples/standalone_evaluation_llm_as_judge.py +++ b/examples/standalone_evaluation_llm_as_judge.py @@ -57,21 +57,22 @@ ) platform = "hf" -model_name = "google/flan-t5-large" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=256, use_fp16=True +model_name = "meta-llama/Llama-3.2-1B" + +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -# change to this to infer with IbmGenAI APIs: -# -# platform = 'ibm_gen_ai' -# model_name = 'meta-llama/llama-3-70b-instruct' -# inference_model = IbmGenAiInferenceEngine(model_name="meta-llama/llama-3-70b-instruct", max_new_tokens=32) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] # Third, We define the metric as LLM as a judge, with the desired platform and model. llm_judge_metric = LLMAsJudge( - inference_model=inference_model, + inference_model=engine, template=judge_correctness_template, + format="formats.chat_api", task="rating.single_turn", main_score=f"llm_judge_{model_name.split('/')[1].replace('-', '_')}_{platform}", strip_system_prompt_and_format_from_inputs=False, @@ -98,18 +99,22 @@ ) # Convert card to a dataset -dataset = load_dataset(card=card, template_card_index="simple") -test_dataset = dataset["test"] +dataset = load_dataset( + card=card, + template_card_index="simple", + format="formats.chat_api", + split="test", + max_test_instances=10, +) -# Infer a model to get predictions. -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) -predictions = inference_model.infer(test_dataset) +predictions = engine.infer(dataset) # Evaluate the predictions using the defined metric. 
-evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/examples/standalone_qa_evaluation.py b/examples/standalone_qa_evaluation.py index 2daf6c96b..cdbca8838 100644 --- a/examples/standalone_qa_evaluation.py +++ b/examples/standalone_qa_evaluation.py @@ -39,38 +39,26 @@ postprocessors=["processors.lower_case"], ) # Verbalize the dataset using the template -dataset = load_dataset(card=card, template=template) -test_dataset = dataset["test"] +dataset = load_dataset( + card=card, + template=template, + format="formats.chat_api", + split="test", + max_test_instances=10, +) -# Infere using flan t5 base using HF API -model_name = "google/flan-t5-base" -inference_model = HFPipelineBasedInferenceEngine( - model_name=model_name, max_new_tokens=32 +# Infer using Llama-3.2-1B base using HF API +engine = HFPipelineBasedInferenceEngine( + model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32 ) +# Change to this to infer with external APIs: +# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx") +# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] -# change to this to infer with IbmGenAI APIs: -# -# from unitxt.inference import IbmGenAiInferenceEngine -# inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or this to infer using WML APIs: -# -# from unitxt.inference import WMLInferenceEngine -# inference_model = WMLInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# or to this to infer using OpenAI APIs: -# -# from unitxt.inference import OpenAiInferenceEngine -# inference_model = OpenAiInferenceEngine(model_name=model_name, max_new_tokens=32) -# -# Note that to run with OpenAI APIs you need to change the loader specification, to -# define that your data can be sent to a public API: -# -# loader=LoadFromDictionary(data=data,data_classification_policy=["public"]), -predictions = inference_model.infer(test_dataset) -evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) +predictions = engine.infer(dataset) +evaluated_dataset = evaluate(predictions=predictions, data=dataset) # Print results for instance in evaluated_dataset: diff --git a/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py b/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py index b1ed0ad90..e6e596efa 100644 --- a/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py +++ b/prepare/metrics/llm_as_judge/pairwise_rating/llama_3_arena_hard_template.py @@ -1,8 +1,8 @@ from unitxt import add_to_catalog from unitxt.inference import ( + CrossProviderInferenceEngine, GenericInferenceEngine, IbmGenAiInferenceEngine, - LiteLLMInferenceEngine, WMLInferenceEngine, ) from unitxt.llm_as_judge import LLMAsJudge @@ -64,8 +64,8 @@ add_to_catalog( LLMAsJudge( - inference_model=LiteLLMInferenceEngine( - model="watsonx/meta-llama/llama-3-70b-instruct", + inference_model=CrossProviderInferenceEngine( + model="llama-3-70b-instruct", max_tokens=30, ), template="templates.response_assessment.pairwise_comparative_rating.arena_hard", @@ -73,6 +73,21 @@ format="formats.chat_api", main_score="llama_3_70b_instruct_template_arena_hard", ), - "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.watsonx.template_arena_hard", + 
"metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.template_arena_hard", + overwrite=True, +) + +add_to_catalog( + LLMAsJudge( + inference_model=CrossProviderInferenceEngine( + model="llama-3-8b-instruct", + max_tokens=30, + ), + template="templates.response_assessment.pairwise_comparative_rating.arena_hard", + task="pairwise_comparative_rating.single_turn", + format="formats.chat_api", + main_score="llama_3_70b_instruct_template_arena_hard", + ), + "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_8b_instruct.template_arena_hard", overwrite=True, ) diff --git a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py index 931c17cac..71d37d916 100644 --- a/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py +++ b/prepare/metrics/llm_as_judge/rating/llama_3_ibm_genai_generic_template.py @@ -1,49 +1,75 @@ from unitxt import add_to_catalog -from unitxt.inference import IbmGenAiInferenceEngine +from unitxt.inference import CrossProviderInferenceEngine, IbmGenAiInferenceEngine from unitxt.llm_as_judge import LLMAsJudge from unitxt.random_utils import get_seed -model = "meta-llama/llama-3-70b-instruct" -format = "formats.llama3_instruct" -template = "templates.response_assessment.rating.generic_single_turn" - inference_model = IbmGenAiInferenceEngine( - model_name=model, max_new_tokens=252, random_seed=get_seed() + model_name="meta-llama/llama-3-70b-instruct", + max_new_tokens=252, + random_seed=get_seed(), ) -model_label = model.split("/")[1].replace("-", "_").replace(".", ",").lower() -model_label = f"{model_label}_ibm_genai" -template_label = template.split(".")[-1] -metric_label = f"{model_label}_template_{template_label}" + metric = LLMAsJudge( inference_model=inference_model, - template=template, + template="templates.response_assessment.rating.generic_single_turn", task="rating.single_turn", - format=format, - main_score=metric_label, + format="formats.llama3_instruct", + main_score="llama_3_70b_instruct_ibm_genai_template_generic_single_turn", prediction_type=str, ) add_to_catalog( metric, - f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}", + "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn", + overwrite=True, +) + +metric = LLMAsJudge( + inference_model=inference_model, + template="templates.response_assessment.rating.generic_single_turn_with_reference", + task="rating.single_turn_with_reference", + format="formats.llama3_instruct", + single_reference_per_prediction=True, + main_score="llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference", +) + +add_to_catalog( + metric, + "metrics.llm_as_judge.rating.llama_3_70b_instruct_ibm_genai_template_generic_single_turn_with_reference", overwrite=True, ) -template = "templates.response_assessment.rating.generic_single_turn_with_reference" -template_label = template.split(".")[-1] -metric_label = f"{model_label}_template_{template_label}" +inference_model = CrossProviderInferenceEngine( + model="llama-3-70b-instruct", max_tokens=252 +) + +metric = LLMAsJudge( + inference_model=inference_model, + template="templates.response_assessment.rating.generic_single_turn", + task="rating.single_turn", + format="formats.chat_api", + main_score="llama_3_70b_instruct_template_generic_single_turn", + prediction_type=str, +) + +add_to_catalog( + metric, + "metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn", + 
overwrite=True, +) + metric = LLMAsJudge( inference_model=inference_model, - template=template, + template="templates.response_assessment.rating.generic_single_turn_with_reference", task="rating.single_turn_with_reference", - format=format, + format="formats.chat_api", single_reference_per_prediction=True, - main_score=metric_label, + main_score="llama_3_70b_instruct_template_generic_single_turn_with_reference", ) add_to_catalog( metric, - f"metrics.llm_as_judge.rating.{model_label}_template_{template_label}", + "metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn_with_reference", overwrite=True, ) diff --git a/prepare/recipes/bluebench.py b/prepare/recipes/bluebench.py index fe513f09b..c51a31aa8 100644 --- a/prepare/recipes/bluebench.py +++ b/prepare/recipes/bluebench.py @@ -129,7 +129,7 @@ def prepare_recipe(default_args, specific_args): "num_demos": 0, "template": "templates.empty", "metrics": [ - "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.watsonx.template_arena_hard" + "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.template_arena_hard" ], } recipe = prepare_recipe(default_args, ingridients) diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/watsonx/template_arena_hard.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/template_arena_hard.json similarity index 75% rename from src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/watsonx/template_arena_hard.json rename to src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/template_arena_hard.json index 8e70b18cb..e60b9b895 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/watsonx/template_arena_hard.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_70b_instruct/template_arena_hard.json @@ -1,8 +1,8 @@ { "__type__": "llm_as_judge", "inference_model": { - "__type__": "lite_llm_inference_engine", - "model": "watsonx/meta-llama/llama-3-70b-instruct", + "__type__": "cross_provider_inference_engine", + "model": "llama-3-70b-instruct", "max_tokens": 30 }, "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard", diff --git a/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct/template_arena_hard.json b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct/template_arena_hard.json new file mode 100644 index 000000000..beec420a7 --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/pairwise_comparative_rating/llama_3_8b_instruct/template_arena_hard.json @@ -0,0 +1,12 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "cross_provider_inference_engine", + "model": "llama-3-8b-instruct", + "max_tokens": 30 + }, + "template": "templates.response_assessment.pairwise_comparative_rating.arena_hard", + "task": "pairwise_comparative_rating.single_turn", + "format": "formats.chat_api", + "main_score": "llama_3_70b_instruct_template_arena_hard" +} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn.json new file mode 100644 index 000000000..ff1f9e216 --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn.json @@ -0,0 
+1,13 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "cross_provider_inference_engine", + "model": "llama-3-70b-instruct", + "max_tokens": 252 + }, + "template": "templates.response_assessment.rating.generic_single_turn", + "task": "rating.single_turn", + "format": "formats.chat_api", + "main_score": "llama_3_70b_instruct_template_generic_single_turn", + "prediction_type": "str" +} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn_with_reference.json b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn_with_reference.json new file mode 100644 index 000000000..24b17f145 --- /dev/null +++ b/src/unitxt/catalog/metrics/llm_as_judge/rating/llama_3_70b_instruct/generic_single_turn_with_reference.json @@ -0,0 +1,13 @@ +{ + "__type__": "llm_as_judge", + "inference_model": { + "__type__": "cross_provider_inference_engine", + "model": "llama-3-70b-instruct", + "max_tokens": 252 + }, + "template": "templates.response_assessment.rating.generic_single_turn_with_reference", + "task": "rating.single_turn_with_reference", + "format": "formats.chat_api", + "single_reference_per_prediction": true, + "main_score": "llama_3_70b_instruct_template_generic_single_turn_with_reference" +} diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index e3e5f10d1..9a5d61671 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -221,6 +221,7 @@ class HFPipelineBasedInferenceEngine( model_name: str max_new_tokens: int use_fp16: bool = True + batch_size: int = 1 _requirements_list = { "transformers": "Install huggingface package using 'pip install --upgrade transformers" @@ -229,9 +230,20 @@ class HFPipelineBasedInferenceEngine( def get_engine_id(self): return get_model_and_label_id(self.model_name, "hf_pipeline") + def _get_task(self): + from transformers import AutoConfig + + return ( + "text2text-generation" + if AutoConfig.from_pretrained( + self.model_name, trust_remote_code=True + ).is_encoder_decoder + else "text-generation" + ) + def _prepare_pipeline(self): import torch - from transformers import AutoConfig, pipeline + from transformers import pipeline model_args: Dict[str, Any] = ( {"torch_dtype": torch.float16} if self.use_fp16 else {} @@ -254,13 +266,7 @@ def _prepare_pipeline(self): else: model_args.update({"device": device}) - task = ( - "text2text-generation" - if AutoConfig.from_pretrained( - self.model_name, trust_remote_code=True - ).is_encoder_decoder - else "text-generation" - ) + task = self._get_task() if task == "text-generation": model_args.update({"return_full_text": False}) @@ -281,13 +287,16 @@ def _infer( dataset: Union[List[Dict[str, Any]], DatasetDict], return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: - self.verify_not_chat_api(dataset) + if self._get_task() == "text2text-generation": + self.verify_not_chat_api(dataset) if not self._is_loaded(): self._prepare_pipeline() outputs = [] - for output in self.model([instance["source"] for instance in dataset]): + for output in self.model( + [instance["source"] for instance in dataset], batch_size=self.batch_size + ): if isinstance(output, list): output = output[0] outputs.append(output["generated_text"]) @@ -1649,7 +1658,7 @@ async def _infer_async( ] # Use tqdm_asyncio.gather to display progress bar return await tqdm_asyncio.gather( - *tasks, desc="LiteLLM Inference", total=len(tasks) + *tasks, desc=f"LiteLLM Inference ({self.model})", total=len(tasks) ) def _infer( @@ 
-1681,9 +1690,9 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): user requests. Attributes: - api: Optional; Specifies the current API in use. Must be one of the + provider: Optional; Specifies the current API in use. Must be one of the literals in `_supported_apis`. - api_model_map: Dictionary mapping each supported API to a corresponding + provider_model_map: Dictionary mapping each supported API to a corresponding model identifier string. This mapping allows consistent access to models across different API backends. """ @@ -1695,10 +1704,13 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct", "granite-3-8b-instruct": "watsonx/ibm/granite-3-8b-instruct", + "flan-t5-xxl": "watsonx/google/flan-t5-xxl", + "llama-3-2-1b-instruct": "watsonx/meta-llama/llama-3-2-1b-instruct", }, "together-ai": { "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct", "llama-3-70b-instruct": "together_ai/togethercomputer/llama-3-70b-instruct", + "llama-3-2-1b-instruct": "together_ai/togethercomputer/llama-3-2-1b-instruct", }, "aws": { "llama-3-8b-instruct": "bedrock/meta.llama3-8b-instruct-v1:0", @@ -1711,6 +1723,8 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "bam": { "granite-3-8b-instruct": "ibm/granite-8b-instruct-preview-4k", "llama-3-8b-instruct": "meta-llama/llama-3-8b-instruct", + "llama-3-2-1b-instruct": "meta-llama/llama-3-2-1b-instruct", + "flan-t5-xxl": "google/flan-t5-xxl", }, } diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py index eedf4f98b..49e21fffe 100644 --- a/tests/examples/test_examples.py +++ b/tests/examples/test_examples.py @@ -24,11 +24,11 @@ # "use_llm_as_judge_metric.py", # "standalone_evaluation_llm_as_judge.py", # "evaluate_summarization_dataset_llm_as_judge.py", - "evaluate_different_formats.py", - "evaluate_different_templates.py", - "evaluate_different_demo_selections.py", - "evaluate_a_judge_model_capabilities_on_arena_hard.py", - "evaluate_a_model_using_arena_hard.py", + # "evaluate_different_formats.py", + # "evaluate_different_templates.py", + # "evaluate_different_demo_selections.py", + # "evaluate_a_judge_model_capabilities_on_arena_hard.py", + # "evaluate_a_model_using_arena_hard.py", # "evaluate_llm_as_judge.py", "evaluate_using_metrics_ensemble.py", "evaluate_existing_dataset_no_install.py", @@ -43,8 +43,8 @@ # "robustness_testing_for_vision_text_models.py", "evaluate_bluebench.py", "custom_type.py", - "evaluate_different_templates_num_demos.py", - "evaluate_existing_dataset_with_install.py", + # "evaluate_different_templates_num_demos.py", + # "evaluate_existing_dataset_with_install.py", "evaluate_batched_multiclass_classification.py", ] diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py index a1f81495a..9ad5113e4 100644 --- a/tests/inference/test_inference_engine.py +++ b/tests/inference/test_inference_engine.py @@ -232,3 +232,23 @@ def test_option_selecting_inference_engine_chat_api(self): self.assertEqual(predictions[0], "hello friend") self.assertEqual(predictions[1], "white.") + + def test_hugginface_pipeline_inference_engine_chat_api(self): + dataset = [ + { + "source": [{"role": "user", "content": "hi you!"}], + }, + { + "source": [{"role": "user", "content": "black or white?"}], + }, + ] + + engine = HFPipelineBasedInferenceEngine( + 
model_name="Qwen/Qwen2.5-0.5B-Instruct", + batch_size=1, + max_new_tokens=1, + ) + predictions = engine.infer(dataset) + + self.assertEqual(predictions[0], "Hello") + self.assertEqual(predictions[1], "As") From 4fa6f8e9a5a0ea8c78bb5977d59ae4d7ebb577ac Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 10:59:17 +0200 Subject: [PATCH 18/20] Add vllm inference engine Signed-off-by: elronbandel --- src/unitxt/inference.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 9a5d61671..39c3d1e1d 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1556,6 +1556,37 @@ def _infer( return optimal_responses +class VLLMInferenceEngine( + InferenceEngine, PackageRequirementsMixin, StandardAPIParamsMixin +): + def prepare_engine(self): + from vllm import LLM, SamplingParams + + args = self.to_dict([StandardAPIParamsMixin]) + self.sampling_params = SamplingParams(**args) + self.llm = LLM(model=self.model) + + def _infer( + self, + dataset: Union[List[Dict[str, Any]], DatasetDict], + return_meta_data: bool = False, + ) -> Union[List[str], List[TextGenerationInferenceOutput]]: + inputs = [] + for instance in dataset: + inputs.append(instance["source"]) + + if isinstance(inputs[0], list): + outputs = self.llm.chat(inputs, self.sampling_params) + else: + outputs = self.llm.generate(inputs, self.sampling_params) + + predictions = [] + for output in outputs: + predictions.append(output.outputs[0].text) + + return predictions + + class AsyncTokenBucket: def __init__(self, rate, capacity): self.rate = rate # Tokens added per second From 81150912c319371acb88549ec7c7f32718830f02 Mon Sep 17 00:00:00 2001 From: elronbandel Date: Tue, 19 Nov 2024 11:00:55 +0200 Subject: [PATCH 19/20] Fix blue bench to use cross provider engine Signed-off-by: elronbandel --- .../arena_hard_generation_english_gpt_4_0314_reference.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unitxt/catalog/recipes/bluebench/chatbot_abilities/arena_hard_generation_english_gpt_4_0314_reference.json b/src/unitxt/catalog/recipes/bluebench/chatbot_abilities/arena_hard_generation_english_gpt_4_0314_reference.json index 64871f548..577d51e38 100644 --- a/src/unitxt/catalog/recipes/bluebench/chatbot_abilities/arena_hard_generation_english_gpt_4_0314_reference.json +++ b/src/unitxt/catalog/recipes/bluebench/chatbot_abilities/arena_hard_generation_english_gpt_4_0314_reference.json @@ -9,7 +9,7 @@ "card": "cards.arena_hard.generation.english_gpt_4_0314_reference", "template": "templates.empty", "metrics": [ - "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.watsonx.template_arena_hard" + "metrics.llm_as_judge.pairwise_comparative_rating.llama_3_70b_instruct.template_arena_hard" ], "format": "formats.chat_api" } From 728fcc301d315b649abe30ac52a1aef98b460d48 Mon Sep 17 00:00:00 2001 From: Yoav Katz Date: Tue, 19 Nov 2024 11:42:42 +0200 Subject: [PATCH 20/20] Added watsonx-sdk to MultiProviderInferenceEngine Add example to evaluate same datasets and models with multiple providers and formats Signed-off-by: Yoav Katz --- ...sets_and_models_with_multiple_providers.py | 101 ++++++++++++++++++ src/unitxt/inference.py | 13 ++- 2 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 examples/evaluate_same_datasets_and_models_with_multiple_providers.py diff --git a/examples/evaluate_same_datasets_and_models_with_multiple_providers.py 
b/examples/evaluate_same_datasets_and_models_with_multiple_providers.py new file mode 100644 index 000000000..6f25b6e9a --- /dev/null +++ b/examples/evaluate_same_datasets_and_models_with_multiple_providers.py @@ -0,0 +1,101 @@ +import pandas as pd +from unitxt import get_logger +from unitxt.api import evaluate, load_dataset +from unitxt.artifact import fetch_artifact +from unitxt.formats import SystemFormat +from unitxt.text_utils import print_dict + +logger = get_logger() + +df = pd.DataFrame( + columns=[ + "provider", + "model", + "format_as_chat_api", + "num_instances", + "score_name", + "score", + "ci_low", + "ci_high", + ] +) + +for provider in [ + "watsonx-sdk", + "watsonx", +]: + for model_name in [ + "granite-3-8b-instruct", + "llama-3-8b-instruct", + ]: + for format_as_chat_api in [True, False]: + if format_as_chat_api and provider == "watsonx-sdk": + continue + if format_as_chat_api: + format = "formats.chat_api" + else: + if model_name.startswith("llama"): + format = "formats.llama3_instruct" + if model_name.startswith("granite"): + format = SystemFormat( + demo_format=( + "{instruction}\\N{source}\\N<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>{target}\\N<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>" + ), + model_input_format=( + "<|start_of_role|>system<|end_of_role|>{system_prompt}<|end_of_text|>\n" + "<|start_of_role|>user<|end_of_role|>{demos}{instruction}\\N{source}\\N<|end_of_text|>\n" + "<|start_of_role|>assistant<|end_of_role|>" + ), + ) + card, _ = fetch_artifact("cards.sst2") + + dataset = load_dataset( + card=card, + template_card_index=0, + format=format, + num_demos=1, + demos_pool_size=100, + loader_limit=1000, + max_test_instances=500, + disable_cache=False, + ) + + test_dataset = dataset["test"] + from unitxt.inference import CrossProviderInferenceEngine + + inference_model = CrossProviderInferenceEngine( + model=model_name, max_tokens=1024, provider=provider + ) + predictions = inference_model.infer(test_dataset) + + evaluated_dataset = evaluate(predictions=predictions, data=test_dataset) + # import pandas as pd + # result_df = pd.json_normalize(evaluated_dataset) + # result_df.to_csv(f"output.csv") + # Print results + print_dict( + evaluated_dataset[0], + keys_to_print=[ + "source", + "prediction", + "processed_prediction", + "processed_references", + ], + ) + + global_scores = evaluated_dataset[0]["score"]["global"] + df.loc[len(df)] = [ + provider, + model_name, + format_as_chat_api, + global_scores["num_of_instances"], + global_scores["score_name"], + global_scores["score"], + global_scores["score_ci_low"], + global_scores["score_ci_high"], + ] + + df = df.round(decimals=2) + logger.info(df.to_markdown()) diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 1748f71f3..f1f0b7c73 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -1708,7 +1708,9 @@ def _infer( return [response.prediction for response in responses] -_supported_apis = Literal["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"] +_supported_apis = Literal[ + "watsonx", "together-ai", "open-ai", "aws", "ollama", "bam", "watsonx-sdk" +] class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): @@ -1739,6 +1741,11 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "flan-t5-xxl": "watsonx/google/flan-t5-xxl", "llama-3-2-1b-instruct": "watsonx/meta-llama/llama-3-2-1b-instruct", }, + "watsonx-sdk": { + "llama-3-8b-instruct": "meta-llama/llama-3-8b-instruct", + 
"llama-3-70b-instruct": "meta-llama/llama-3-70b-instruct", + "granite-3-8b-instruct": "ibm/granite-3-8b-instruct", + }, "together-ai": { "llama-3-8b-instruct": "together_ai/togethercomputer/llama-3-8b-instruct", "llama-3-70b-instruct": "together_ai/togethercomputer/llama-3-70b-instruct", @@ -1767,10 +1774,12 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "aws": LiteLLMInferenceEngine, "ollama": OllamaInferenceEngine, "bam": IbmGenAiInferenceEngine, + "watsonx-sdk": WMLInferenceEngine, } _provider_param_renaming = { - "bam": {"max_tokens": "max_new_tokens", "model": "model_name"} + "bam": {"max_tokens": "max_new_tokens", "model": "model_name"}, + "watsonx-sdk": {"max_tokens": "max_new_tokens", "model": "model_name"}, } def get_provider_name(self):