Add option selecting inference engine and organize engines API #1190

Open · wants to merge 8 commits into base: main
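This PR renames the unitxt.inference module to unitxt.inference_engines and updates all docs and examples accordingly; it also adds a new example, examples/evaluate_with_log_probs.py, that selects among fixed answer choices by log-probability scores. A minimal migration sketch for user code, assuming the rename is a pure module move with class names and constructor arguments unchanged (which is what the diffs below show):

# Old import path (before this PR):
# from unitxt.inference import HFPipelineBasedInferenceEngine
# New import path (after this PR):
from unitxt import evaluate, load_dataset
from unitxt.inference_engines import HFPipelineBasedInferenceEngine

dataset = load_dataset(
    "card=cards.wnli,template=templates.classification.multi_class.relation.default,max_test_instances=20"
)
test_dataset = dataset["test"]

# Engine construction is unchanged; only the import path moves.
inference_model = HFPipelineBasedInferenceEngine(
    model_name="google/flan-t5-small", max_new_tokens=32
)
predictions = inference_model.infer(test_dataset)
results = evaluate(predictions=predictions, data=test_dataset)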
4 changes: 2 additions & 2 deletions .secrets.baseline
@@ -3,7 +3,7 @@
"files": "^.secrets.baseline$",
"lines": null
},
- "generated_at": "2024-08-21T15:51:06Z",
+ "generated_at": "2024-09-02T13:01:17Z",
"plugins_used": [
{
"name": "AWSKeyDetector"
@@ -82,7 +82,7 @@
"hashed_secret": "fa172616e9af3d2a24b5597f264eab963fe76889",
"is_secret": false,
"is_verified": false,
- "line_number": 1946,
+ "line_number": 1947,
"type": "Hex High Entropy String",
"verified_result": null
}
10 changes: 5 additions & 5 deletions docs/docs/installation.rst
@@ -29,8 +29,8 @@ The core of Unitxt has minimal dependencies (none beyond HuggingFace evaluate).
Note that specific metrics or other operators may require specific dependencies, which are checked before the first time they are used.
An error message is printed if any required dependencies are not installed.

- The benefit of using the HuggingFace API approach is that you can load a Unitxt dataset, just like every other HuggingFace dataset,
- so it can be used in preexisting code without modifications.
+ The benefit of using the HuggingFace API approach is that you can load a Unitxt dataset, just like every other HuggingFace dataset,
+ so it can be used in preexisting code without modifications.
However, this incurs extra overhead when HuggingFace downloads the unitxt package and does not expose all unitxt capabilities
(e.g., defining new datasets, metrics, templates, and more).

@@ -46,7 +46,7 @@ You can then use the API:
.. code-block:: python

from unitxt import load_dataset,evaluate
- from unitxt.inference import HFPipelineBasedInferenceEngine
+ from unitxt.inference_engines import HFPipelineBasedInferenceEngine

dataset = load_dataset('card=cards.wnli,template=templates.classification.multi_class.relation.default,max_test_instances=20')
test_dataset = dataset["test"]
@@ -56,7 +56,7 @@ You can then use the API:
predictions = inference_model.infer(test_dataset)

dataset_with_scores = evaluate(predictions=predictions, data=test_dataset)
- [print(item) for item in dataset_with_scores[0]['score']['global'].items()]
+ [print(item) for item in dataset_with_scores[0]['score']['global'].items()]


.. warning::
@@ -75,6 +75,6 @@ installed, and the versions are not compatible. To fix this issue, you should ch
* Update the locally installed Unitxt
to the Unitxt HuggingFace dataset version
* Uninstall the local Unitxt package (in case you don't require access to Unitxt
- direct APIs), or
+ direct APIs), or
* Change the code to load the datasets using the direct Unitxt APIs without using the HuggingFace API.
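An aside on the HuggingFace API approach this file describes: loading a Unitxt dataset through plain HuggingFace datasets looks roughly like the sketch below. The "unitxt/data" entry point and the trust_remote_code flag follow the usual HuggingFace loading pattern and are assumptions here, not something this diff establishes:

from datasets import load_dataset

# Sketch: HuggingFace fetches the unitxt package behind the scenes,
# which is the extra download overhead mentioned above.
dataset = load_dataset(
    "unitxt/data",
    "card=cards.wnli,template=templates.classification.multi_class.relation.default,max_test_instances=20",
    trust_remote_code=True,  # assumed: needed for script-based datasets
)
test_dataset = dataset["test"]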

8 changes: 4 additions & 4 deletions docs/docs/llm_as_judge.rst
@@ -97,7 +97,7 @@ The following code performs the desired evaluation:
.. code-block:: python

from datasets import load_dataset
- from unitxt.inference import HFPipelineBasedInferenceEngine
+ from unitxt.inference_engines import HFPipelineBasedInferenceEngine
from unitxt import evaluate

# 1. Create the dataset
@@ -142,7 +142,7 @@ Let's walk through an example of creating a new LLM as a Judge metric, specifica

.. code-block:: python

- from unitxt.inference import HFPipelineBasedInferenceEngine
+ from unitxt.inference_engines import HFPipelineBasedInferenceEngine
from unitxt.llm_as_judge import LLMAsJudge

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
@@ -219,7 +219,7 @@ With these components defined, creating a new LLM as a Judge metric is straightf
.. code-block:: python

from unitxt import add_to_catalog
- from unitxt.inference import HFPipelineBasedInferenceEngine
+ from unitxt.inference_engines import HFPipelineBasedInferenceEngine
from unitxt.llm_as_judge import LLMAsJudge

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
@@ -341,7 +341,7 @@ and run it.
.. code-block:: python

from datasets import load_dataset
- from unitxt.inference import HFPipelineBasedInferenceEngine
+ from unitxt.inference_engines import HFPipelineBasedInferenceEngine
from unitxt import evaluate

# 1. Create the dataset
2 changes: 1 addition & 1 deletion docs/docs/multimodality.rst
@@ -92,7 +92,7 @@ Set up the inference and evaluation pipeline:
.. code-block:: python

from unitxt.api import evaluate, load_dataset
- from unitxt.inference import HFLlavaInferenceEngine
+ from unitxt.inference_engines import HFLlavaInferenceEngine
from unitxt.text_utils import print_dict

# Initialize the inference model
2 changes: 1 addition & 1 deletion docs/docs/production.rst
@@ -116,7 +116,7 @@ You can also implement an end-to-end inference pipeline using your preferred dat
.. code-block:: python

from unitxt import infer
- from unitxt.inference import HFPipelineBasedInferenceEngine
+ from unitxt.inference_engines import HFPipelineBasedInferenceEngine

engine = HFPipelineBasedInferenceEngine(
model_name="google/flan-t5-small", max_new_tokens=32
6 changes: 3 additions & 3 deletions examples/evaluate_a_judge_model_capabilities_on_arena_hard.py
@@ -1,5 +1,5 @@
from unitxt import evaluate, load_dataset
- from unitxt.inference import MockInferenceEngine
+ from unitxt.inference_engines import MockInferenceEngine
from unitxt.text_utils import print_dict

model_id = "meta-llama/llama-3-70b-instruct"
@@ -19,12 +19,12 @@
"""
We are using a mock inference engine (and model) so that the example finishes quickly.
In real scenarios you can use models from HuggingFace, OpenAI, or IBM, using the following:
- from unitxt.inference import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine)
+ from unitxt.inference_engines import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine)
and swap them in for the MockInferenceEngine class in the example.
For the arguments these inference engines can receive, please refer to the classes' documentation.

Example of using an IBM model:
- from unitxt.inference import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin)
+ from unitxt.inference_engines import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin)
params = IbmGenAiInferenceEngineParamsMixin(max_new_tokens=1024, random_seed=42)
inference_model = IbmGenAiInferenceEngine(model_name=model_id, parameters=params)
"""
6 changes: 3 additions & 3 deletions examples/evaluate_a_model_using_arena_hard.py
@@ -1,5 +1,5 @@
from unitxt import evaluate, load_dataset
- from unitxt.inference import MockInferenceEngine
+ from unitxt.inference_engines import MockInferenceEngine
from unitxt.text_utils import print_dict

model_id = "meta-llama/llama-3-70b-instruct"
@@ -22,12 +22,12 @@
"""
We are using a mock inference engine (and model) so that the example finishes quickly.
In real scenarios you can use models from HuggingFace, OpenAI, or IBM, using the following:
- from unitxt.inference import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine)
+ from unitxt.inference_engines import (HFPipelineBasedInferenceEngine, IbmGenAiInferenceEngine, OpenAiInferenceEngine)
and swap them in for the MockInferenceEngine class in the example.
For the arguments these inference engines can receive, please refer to the classes' documentation.

Example of using an IBM model:
- from unitxt.inference import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin)
+ from unitxt.inference_engines import (IbmGenAiInferenceEngine, IbmGenAiInferenceEngineParamsMixin)
params = IbmGenAiInferenceEngineParamsMixin(max_new_tokens=1024, random_seed=42)
inference_model = IbmGenAiInferenceEngine(model_name=model_id, parameters=params)
"""
2 changes: 1 addition & 1 deletion examples/evaluate_benchmark.py
@@ -1,6 +1,6 @@
from unitxt.api import evaluate
from unitxt.benchmark import Benchmark
- from unitxt.inference import (
+ from unitxt.inference_engines import (
HFPipelineBasedInferenceEngine,
)
from unitxt.standard import StandardRecipe
2 changes: 1 addition & 1 deletion examples/evaluate_different_demo_selections.py
@@ -1,7 +1,7 @@
import pandas as pd
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
- from unitxt.inference import IbmGenAiInferenceEngine
+ from unitxt.inference_engines import IbmGenAiInferenceEngine
from unitxt.splitters import CloseTextSampler, FixedIndicesSampler, RandomSampler
from unitxt.text_utils import print_dict

2 changes: 1 addition & 1 deletion examples/evaluate_different_formats.py
@@ -1,7 +1,7 @@
import pandas as pd
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
- from unitxt.inference import IbmGenAiInferenceEngine
+ from unitxt.inference_engines import IbmGenAiInferenceEngine
from unitxt.text_utils import print_dict

logger = get_logger()
2 changes: 1 addition & 1 deletion examples/evaluate_different_templates.py
@@ -4,7 +4,7 @@
import pandas as pd
from unitxt import add_to_catalog, get_logger, register_local_catalog
from unitxt.api import evaluate, load_dataset
- from unitxt.inference import IbmGenAiInferenceEngine
+ from unitxt.inference_engines import IbmGenAiInferenceEngine
from unitxt.templates import InputOutputTemplate
from unitxt.text_utils import print_dict

2 changes: 1 addition & 1 deletion examples/evaluate_existing_dataset_by_llm_as_judge.py
@@ -1,6 +1,6 @@
from unitxt import get_logger, get_settings, load_dataset
from unitxt.api import evaluate
- from unitxt.inference import (
+ from unitxt.inference_engines import (
HFPipelineBasedInferenceEngine,
)
from unitxt.text_utils import print_dict
8 changes: 4 additions & 4 deletions examples/evaluate_existing_dataset_with_install.py
@@ -1,5 +1,5 @@
from unitxt.api import evaluate, load_dataset
- from unitxt.inference import HFPipelineBasedInferenceEngine
+ from unitxt.inference_engines import HFPipelineBasedInferenceEngine
from unitxt.text_utils import print_dict

# Use the Unitxt APIs to load the wnli entailment dataset using the standard template in the catalog for relation task with 2-shot in-context learning.
@@ -19,17 +19,17 @@
#
# change to this to infer with IbmGenAI APIs:
#
- # from unitxt.inference import IbmGenAiInferenceEngine
+ # from unitxt.inference_engines import IbmGenAiInferenceEngine
# inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32)
#
# or this to infer using WML APIs:
#
- # from unitxt.inference import WMLInferenceEngine
+ # from unitxt.inference_engines import WMLInferenceEngine
# inference_model = WMLInferenceEngine(model_name=model_name, max_new_tokens=32)
#
# or this to infer using OpenAI APIs:
#
- # from unitxt.inference import OpenAiInferenceEngine
+ # from unitxt.inference_engines import OpenAiInferenceEngine
# inference_model = OpenAiInferenceEngine(model_name=model_name, max_new_tokens=32)
#
# Note that to run with OpenAI APIs you need to change the loader specification, to
2 changes: 1 addition & 1 deletion examples/evaluate_image_text_to_text.py
@@ -1,5 +1,5 @@
from unitxt.api import evaluate, load_dataset
- from unitxt.inference import HFLlavaInferenceEngine
+ from unitxt.inference_engines import HFLlavaInferenceEngine
from unitxt.text_utils import print_dict

inference_model = HFLlavaInferenceEngine(
2 changes: 1 addition & 1 deletion examples/evaluate_rag_response_generation.py
@@ -3,7 +3,7 @@
TaskCard,
)
from unitxt.collections_operators import Wrap
- from unitxt.inference import (
+ from unitxt.inference_engines import (
HFPipelineBasedInferenceEngine,
)
from unitxt.loaders import LoadFromDictionary
2 changes: 1 addition & 1 deletion examples/evaluate_summarization_dataset_llm_as_judge.py
@@ -1,6 +1,6 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
- from unitxt.inference import (
+ from unitxt.inference_engines import (
HFPipelineBasedInferenceEngine,
)
from unitxt.llm_as_judge import LLMAsJudge
2 changes: 1 addition & 1 deletion examples/evaluate_using_metrics_ensemble.py
@@ -1,6 +1,6 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
- from unitxt.inference import (
+ from unitxt.inference_engines import (
HFPipelineBasedInferenceEngine,
)
from unitxt.metrics import MetricsEnsemble
25 changes: 25 additions & 0 deletions examples/evaluate_with_log_probs.py
@@ -0,0 +1,25 @@
+ from unitxt import infer
+ from unitxt.inference_engines import (
+     HFLogProbScoringEngine,
+     SelectingByScoreInferenceEngine,
+ )
+ from unitxt.text_utils import print_dict
+
+ dataset = infer(
+     [
+         {
+             "question": "What is the capital of Texas?",
+             "choices": ["Austin", "Tel Aviv"],
+         },
+         {"question": "What is the color of the sky?", "choices": ["Blue", "Red"]},
+     ],
+     engine=SelectingByScoreInferenceEngine(
+         scorer_engine=HFLogProbScoringEngine(model_name="gpt2", batch_size=1)
+     ),
+     task="tasks.qa.multiple_choice.open",
+     template="templates.qa.multiple_choice.title",
+     return_data=True,
+ )
+
+ for instance in dataset:
+     print_dict(instance, keys_to_print=["source", "prediction"])
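The new example above hinges on SelectingByScoreInferenceEngine choosing, per instance, the candidate its scorer rates highest. A conceptual sketch of log-prob scoring with gpt2, written against plain transformers rather than the unitxt API (the prompt format here is a simplification, not the one templates.qa.multiple_choice.title produces):

import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

def sequence_logprob(prompt: str, continuation: str) -> float:
    # Sum of log-probabilities the model assigns to `continuation` given `prompt`.
    prompt_len = tokenizer(prompt, return_tensors="pt").input_ids.shape[1]
    full_ids = tokenizer(prompt + continuation, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = model(full_ids).logits
    # Position i of the shifted logits predicts token i+1 of the input.
    log_probs = torch.log_softmax(logits[:, :-1, :], dim=-1)
    targets = full_ids[:, 1:]
    token_log_probs = log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)
    # Keep only the continuation's tokens (approximate: BPE can merge
    # tokens across the prompt/continuation boundary).
    return token_log_probs[0, prompt_len - 1 :].sum().item()

prompt = "Question: What is the capital of Texas?\nAnswer: "
choices = ["Austin", "Tel Aviv"]
print(max(choices, key=lambda c: sequence_logprob(prompt, c)))  # expected: Austin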
2 changes: 1 addition & 1 deletion examples/inference_using_ibm_watsonx_ai.py
@@ -1,7 +1,7 @@
import os

from unitxt.api import load_dataset
- from unitxt.inference import WMLInferenceEngine
+ from unitxt.inference_engines import WMLInferenceEngine
from unitxt.text_utils import print_dict

if __name__ == "__main__":
2 changes: 1 addition & 1 deletion examples/qa_evaluation.py
@@ -2,7 +2,7 @@
from unitxt.api import evaluate, load_dataset
from unitxt.blocks import TaskCard
from unitxt.collections_operators import Wrap
- from unitxt.inference import (
+ from unitxt.inference_engines import (
HFPipelineBasedInferenceEngine,
)
from unitxt.loaders import LoadFromDictionary
2 changes: 1 addition & 1 deletion examples/standalone_evaluation_llm_as_judge.py
@@ -1,7 +1,7 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
- from unitxt.inference import (
+ from unitxt.inference_engines import (
HFPipelineBasedInferenceEngine,
)
from unitxt.llm_as_judge import LLMAsJudge
8 changes: 4 additions & 4 deletions examples/standalone_qa_evaluation.py
@@ -1,7 +1,7 @@
from unitxt import get_logger
from unitxt.api import evaluate, load_dataset
from unitxt.blocks import Task, TaskCard
- from unitxt.inference import HFPipelineBasedInferenceEngine
+ from unitxt.inference_engines import HFPipelineBasedInferenceEngine
from unitxt.loaders import LoadFromDictionary
from unitxt.templates import InputOutputTemplate, TemplatesDict
from unitxt.text_utils import print_dict
@@ -54,17 +54,17 @@

# change to this to infer with IbmGenAI APIs:
#
- # from unitxt.inference import IbmGenAiInferenceEngine
+ # from unitxt.inference_engines import IbmGenAiInferenceEngine
# inference_model = IbmGenAiInferenceEngine(model_name=model_name, max_new_tokens=32)
#
# or this to infer using WML APIs:
#
- # from unitxt.inference import WMLInferenceEngine
+ # from unitxt.inference_engines import WMLInferenceEngine
# inference_model = WMLInferenceEngine(model_name=model_name, max_new_tokens=32)
#
# or this to infer using OpenAI APIs:
#
- # from unitxt.inference import OpenAiInferenceEngine
+ # from unitxt.inference_engines import OpenAiInferenceEngine
# inference_model = OpenAiInferenceEngine(model_name=model_name, max_new_tokens=32)
#
# Note that to run with OpenAI APIs you need to change the loader specification, to
10 changes: 2 additions & 8 deletions prepare/cards/numeric_nlg.py
@@ -7,7 +7,6 @@
TaskCard,
)
from unitxt.catalog import add_to_catalog
- from unitxt.templates import TemplatesList
from unitxt.test_utils.card import test_card

card = TaskCard(
@@ -26,13 +25,8 @@
Rename(field="caption", to_field="input_b"),
],
task="tasks.generation.from_pair",
- templates=TemplatesList(
-     [
-         "templates.generation.from_pair.default[postprocessors=[processors.lower_case]]"
-     ]
- ),
- __description__="NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a "
- "paragraph of a table description with richer inference from scientific papers.",
+ templates="templates.generation.from_pair.all",
+ __description__="NumericNLG is a dataset for numerical table-to-text generation using pairs of a table and a paragraph of a table description with richer inference from scientific papers.",
__tags__={
"modality": "table",
"urls": {"arxiv": "https://aclanthology.org/2021.acl-long.115/"},