
Commit 7223bf8

neubig and saum7800 authored
Make API-based model configurable (#344)
* Make models configurable
* Additional description on colab
* Revert unrelated change
* Remove unused argument
* Remove output from notebook
* Remove noqa
* Fix test
* Update prompt2model/utils/api_tools.py
  Co-authored-by: Saumya Gandhi <gandhisaumya8@gmail.com>
* Update prompt2model/utils/api_tools.py
  Co-authored-by: Saumya Gandhi <gandhisaumya8@gmail.com>
* Change test comments
* Default to maximum tokens for model

---------

Co-authored-by: Saumya Gandhi <gandhisaumya8@gmail.com>
1 parent 9714923 commit 7223bf8

File tree

10 files changed: +171 −14 lines changed

prompt2model/dataset_generator/prompt_based.py
prompt2model/model_retriever/generate_hypothetical_document.py
prompt2model/prompt_parser/instr_parser.py
prompt2model/utils/api_tools.py
prompt2model_demo.ipynb
test_helpers/mock_api.py
test_helpers/test_utils.py
tests/dataset_generator_test.py
tests/model_retriever_test.py
tests/prompt_parser_test.py
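Taken together, these changes replace hard-coded `APIAgent(...)` constructions with a module-level `api_tools.default_api_agent` that callers can overwrite before running any pipeline component. A minimal sketch of the new usage, mirroring the notebook cell added below (the model name here is only illustrative):

from prompt2model.utils import api_tools

# Overwrite the global default agent; every component that previously
# constructed its own APIAgent now reads this module-level attribute.
api_tools.default_api_agent = api_tools.APIAgent(model_name="gpt-4")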

prompt2model/dataset_generator/prompt_based.py

Lines changed: 2 additions & 1 deletion

@@ -20,6 +20,7 @@
 from prompt2model.utils import (
     API_ERRORS,
     APIAgent,
+    api_tools,
     count_tokens_from_string,
     get_formatted_logger,
     handle_api_error,
@@ -415,7 +416,7 @@ def generate_dataset_split(
         generated_examples: list[Example] = []

         pbar = tqdm(total=num_examples, desc="Generating examples")
-        chat_api = APIAgent()
+        chat_api = api_tools.default_api_agent

         while len(generated_examples) < num_examples:
             if self.max_api_calls and self.api_call_counter >= self.max_api_calls:

prompt2model/model_retriever/generate_hypothetical_document.py

Lines changed: 2 additions & 2 deletions

@@ -5,7 +5,7 @@
 import logging

 from prompt2model.prompt_parser import PromptSpec
-from prompt2model.utils import API_ERRORS, APIAgent, handle_api_error
+from prompt2model.utils import API_ERRORS, api_tools, handle_api_error

 PROMPT_PREFIX = '''HuggingFace contains models, which are each given a user-generated description. The first section of the description, delimited with two "---" lines, consists of a YAML description of the model. This may contain fields like "language" (supported by model), "datasets" (used to train the model), "tags" (e.g. tasks relevant to the model), and "metrics" (used to evaluate the model). Create a hypothetical HuggingFace model description that would satisfy a given user instruction. Here are some examples:

@@ -427,7 +427,7 @@ def generate_hypothetical_model_description(
     api_call_counter = 0

     instruction = prompt.instruction
-    api_agent = APIAgent("gpt-3.5-turbo-16k")
+    api_agent = api_tools.default_api_agent
     chatgpt_prompt = (
         PROMPT_PREFIX
         + "\n"

prompt2model/prompt_parser/instr_parser.py

Lines changed: 3 additions & 3 deletions

@@ -13,7 +13,7 @@
     construct_prompt_for_instruction_parsing,
 )

-from prompt2model.utils import APIAgent, get_formatted_logger
+from prompt2model.utils import api_tools, get_formatted_logger
 from prompt2model.utils.api_tools import API_ERRORS, handle_api_error

 logger = get_formatted_logger("PromptParser")
@@ -61,7 +61,7 @@ def extract_response(self, response: openai.Completion) -> tuple[str, str] | None:
         try:
             response_json = json.loads(response_text, strict=False)
         except json.decoder.JSONDecodeError:
-            logger.warning("API response was not a valid JSON")
+            logger.warning(f"API response was not a valid JSON: {response_text}")
             return None

         required_keys = ["Instruction", "Demonstrations"]
@@ -85,7 +85,7 @@ def parse_from_prompt(self, prompt: str) -> None:
         """
         parsing_prompt_for_chatgpt = construct_prompt_for_instruction_parsing(prompt)

-        chat_api = APIAgent()
+        chat_api = api_tools.default_api_agent
         last_error = None
         while True:
             self.api_call_counter += 1

prompt2model/utils/api_tools.py

Lines changed: 27 additions & 4 deletions

@@ -8,6 +8,7 @@
 import time

 import aiolimiter
+import litellm.utils
 import openai
 import openai.error
 import tiktoken
@@ -40,13 +41,25 @@
 class APIAgent:
     """A class for accessing API-based models."""

-    def __init__(self, model_name: str = "gpt-3.5-turbo"):
-        """Initialize APIAgent with an API key.
+    def __init__(
+        self,
+        model_name: str = "gpt-3.5-turbo",
+        max_tokens: int | None = None,
+    ):
+        """Initialize APIAgent with model_name and max_tokens.

         Args:
             model_name: Name fo the model to use (by default, gpt-3.5-turbo).
+            max_tokens: The maximum number of tokens to generate. Defaults to the max
+                value for the model if available through litellm.
         """
         self.model_name = model_name
+        self.max_tokens = max_tokens
+        if max_tokens is None:
+            try:
+                self.max_tokens = litellm.utils.get_max_tokens(model_name)
+            except Exception:
+                pass

     def generate_one_completion(
         self,
@@ -73,6 +86,7 @@ def generate_one_completion(
             An OpenAI-like response object if there were no errors in generation.
             In case of API-specific error, Exception object is captured and returned.
         """
+        max_tokens = self.max_tokens or 4 * count_tokens_from_string(prompt)
         response = completion(  # completion gets the key from os.getenv
             model=self.model_name,
             messages=[
@@ -81,6 +95,7 @@ def generate_one_completion(
             temperature=temperature,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
+            max_tokens=max_tokens,
         )
         return response

@@ -154,14 +169,17 @@ async def _throttled_completion_acreate(
                 await asyncio.sleep(10)
                 return {"choices": [{"message": {"content": ""}}]}

+        max_tokens = self.max_tokens or 4 * max(
+            count_tokens_from_string(prompt) for prompt in prompts
+        )
         async_responses = [
             _throttled_completion_acreate(
-                model="gpt-3.5-turbo",
+                model=self.model_name,
                 messages=[
                     {"role": "user", "content": f"{prompt}"},
                 ],
                 temperature=temperature,
-                max_tokens=500,
+                max_tokens=max_tokens,
                 n=responses_per_request,
                 top_p=1,
                 limiter=limiter,
@@ -205,3 +223,8 @@ def count_tokens_from_string(string: str, encoding_name: str = "cl100k_base") ->
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
     return num_tokens
+
+
+# This is the default API agent that is used everywhere if a different agent is not
+# specified
+default_api_agent = APIAgent()
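To illustrate the new max_tokens handling, here is a small sketch of the two constructor paths (the 512 cap is an arbitrary example value):

from prompt2model.utils.api_tools import APIAgent

# Explicit cap: every completion is limited to 512 generated tokens.
capped_agent = APIAgent(model_name="gpt-3.5-turbo", max_tokens=512)

# No cap given: __init__ asks litellm for the model's maximum. If litellm
# does not recognize the model, max_tokens stays None and each request
# falls back to 4x the prompt's token count at call time.
auto_agent = APIAgent(model_name="gpt-3.5-turbo")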

prompt2model_demo.ipynb

Lines changed: 19 additions & 1 deletion

@@ -42,7 +42,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Set your OpenAI API key as an environment variable. A good way to do this is to create a `.env` file with a single line.\n",
+    "prompt2model requires that you use a base LLM to help out with various parts of the training process. The default is OpenAI's `gpt-3.5-turbo`, but you can use any model supported by [litellm](https://github.com/BerriAI/litellm). Set the appropriate API key as an environment variable. A good way to do this is to create a `.env` file with a single line, like below if you're using OpenAI.\n",
     "\n",
     "```text\n",
     "OPENAI_API_KEY=<your key here>\n",
@@ -81,6 +81,24 @@
     "os.environ['OPENAI_API_KEY'][:3]"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we specify the base model that we want to use here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from prompt2model.utils import api_tools\n",
+    "# CHANGE THIS if you want to try a different model\n",
+    "api_tools.default_api_agent = api_tools.APIAgent(model_name=\"gpt-3.5-turbo\")"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},

test_helpers/mock_api.py

Lines changed: 37 additions & 1 deletion

@@ -1,6 +1,10 @@
 """Tools for mocking API responses (for testing purposes)."""

-from __future__ import annotations  # noqa FI58
+from __future__ import annotations
+
+import openai
+
+from prompt2model.utils.api_tools import APIAgent


 class MockCompletion:
@@ -177,6 +181,38 @@ def mock_batch_api_response_identical_completions(
     return mock_completions


+class MockAPIAgent(APIAgent):
+    """A mock API agent that always returns the same content."""
+
+    def __init__(self, default_content):
+        """Initialize the API agent."""
+        self.generate_one_call_counter = 0
+        self.generate_batch_call_counter = 0
+        self.default_content = default_content
+
+    def generate_one_completion(
+        self,
+        prompt: str,
+        temperature: float = 0,
+        presence_penalty: float = 0,
+        frequency_penalty: float = 0,
+    ) -> openai.Completion:
+        """Return a mocked object and increment the counter."""
+        self.generate_one_call_counter += 1
+        return MockCompletion(content=self.default_content)
+
+    async def generate_batch_completion(
+        self,
+        prompts: list[str],
+        temperature: float = 1,
+        responses_per_request: int = 5,
+        requests_per_minute: int = 80,
+    ) -> list[openai.Completion]:
+        """Return a mocked object and increment the counter."""
+        self.generate_batch_call_counter += 1
+        return [MockCompletion(content=self.default_content) for _ in prompts]
+
+
 class UnknownGpt3Exception(Exception):
     """This is a newly-defined exception for testing purposes."""

test_helpers/test_utils.py

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+"""Utility functions for testing."""
+from contextlib import contextmanager
+
+
+@contextmanager
+def temp_setattr(obj, attr, value):
+    """Temporarily set an attribute on an object."""
+    original = getattr(obj, attr, None)
+    setattr(obj, attr, value)
+    try:
+        yield
+    finally:
+        if original is not None:
+            setattr(obj, attr, original)
+        else:
+            delattr(obj, attr)
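The tests below lean on this helper to install a mock agent for the duration of a block and restore the real one afterwards. A minimal, self-contained sketch of the pattern:

from prompt2model.utils import api_tools
from test_helpers.mock_api import MockAPIAgent
from test_helpers.test_utils import temp_setattr

mock_agent = MockAPIAgent(default_content="canned response")
with temp_setattr(api_tools, "default_api_agent", mock_agent):
    # Inside the block, every component that reads
    # api_tools.default_api_agent sees the mock.
    assert api_tools.default_api_agent is mock_agent
# On exit, the original agent is restored (or the attribute is deleted
# if it did not exist beforehand).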

tests/dataset_generator_test.py

Lines changed: 25 additions & 1 deletion

@@ -16,12 +16,14 @@
     PromptBasedDatasetGenerator,
 )
 from prompt2model.prompt_parser import MockPromptSpec, TaskType
+from prompt2model.utils import api_tools
 from test_helpers import (
     MockCompletion,
     UnknownGpt3Exception,
     mock_batch_api_response_identical_completions,
 )
-from test_helpers.mock_api import MockBatchDifferentCompletions
+from test_helpers.mock_api import MockAPIAgent, MockBatchDifferentCompletions
+from test_helpers.test_utils import temp_setattr

 logger = logging.getLogger("DatasetGenerator")
@@ -946,3 +948,25 @@ def test_dataset_generator_terminates(mocked_generate_example):
     generated_df = generated_dataset.to_pandas()
     assert len(generated_dataset) == 100
     assert list(generated_df.columns) == ["input_col", "output_col"]
+
+
+def test_generate_dataset_agent_switch():
+    """Test if dataset generation can use a user-set API agent."""
+    my_agent = MockAPIAgent(
+        default_content='{"input": "This is input.", "output": "This is an output."}'
+    )
+    with temp_setattr(api_tools, "default_api_agent", my_agent):
+        prompt_spec = MockPromptSpec(TaskType.CLASSIFICATION)
+        dataset_generator = PromptBasedDatasetGenerator(
+            initial_temperature=0.3,
+            max_temperature=1.4,
+            responses_per_request=1,
+            max_api_calls=100,
+            requests_per_minute=80,
+            filter_duplicated_examples=False,
+        )
+        dataset_generator.generate_dataset_split(
+            prompt_spec, 100, split=DatasetSplit.TRAIN
+        )
+    # 100 outputs, and each batch has 5 outputs so 20 api calls
+    assert my_agent.generate_batch_call_counter == 20

tests/model_retriever_test.py

Lines changed: 15 additions & 0 deletions

@@ -11,8 +11,14 @@
 import torch

 from prompt2model.model_retriever import DescriptionModelRetriever
+from prompt2model.model_retriever.generate_hypothetical_document import (
+    generate_hypothetical_model_description,
+)
 from prompt2model.prompt_parser import MockPromptSpec, TaskType
+from prompt2model.utils import api_tools
 from test_helpers import create_test_search_index
+from test_helpers.mock_api import MockAPIAgent
+from test_helpers.test_utils import temp_setattr

 TINY_MODEL_NAME = "google/bert_uncased_L-2_H-128_A-2"

@@ -238,3 +244,12 @@ def test_retrieve_bm25_when_no_index_exists():
     assert top_model_names[0] == "t5-base"
     # Clear search index from disk.
     shutil.rmtree(retriever.search_index_path)
+
+
+def test_generate_hypothetical_document_agent_switch():
+    """Test if generate_hypothetical_document can use a user-set API agent."""
+    my_agent = MockAPIAgent(default_content="test response")
+    with temp_setattr(api_tools, "default_api_agent", my_agent):
+        prompt_spec = MockPromptSpec(TaskType.CLASSIFICATION)
+        generate_hypothetical_model_description(prompt_spec, max_api_calls=3)
+    assert my_agent.generate_one_call_counter == 1

tests/prompt_parser_test.py

Lines changed: 25 additions & 1 deletion

@@ -8,7 +8,11 @@
 import pytest

 from prompt2model.prompt_parser import PromptBasedInstructionParser, TaskType
+from prompt2model.prompt_parser.mock import MockPromptSpec
+from prompt2model.utils import api_tools
 from test_helpers import MockCompletion, UnknownGpt3Exception
+from test_helpers.mock_api import MockAPIAgent
+from test_helpers.test_utils import temp_setattr

 logger = logging.getLogger("PromptParser")
 GPT3_RESPONSE_WITH_DEMONSTRATIONS = MockCompletion(
@@ -116,7 +120,13 @@ def test_instruction_parser_with_invalid_json(mocked_parsing_method):
         prompt_spec.parse_from_prompt(prompt)
     mock_info.assert_not_called()
     warning_list = [each.args[0] for each in mock_warning.call_args_list]
-    assert warning_list == ["API response was not a valid JSON"] * 3
+    assert (
+        warning_list
+        == [
+            'API response was not a valid JSON: {"Instruction": "A", "Demonstrations": "B}'  # noqa: E501
+        ]
+        * 3
+    )  # noqa: E501
     assert mocked_parsing_method.call_count == 3
     assert prompt_spec._instruction is None
     assert prompt_spec._examples is None
@@ -179,3 +189,17 @@ def test_instruction_parser_with_unexpected_error(mocked_parsing_method):
     # Check that we only tried calling the API once.
     assert mocked_parsing_method.call_count == 1
     gc.collect()
+
+
+def test_prompt_parser_agent_switch():
+    """Test if prompt parser can use a user-set API agent."""
+    my_agent = MockAPIAgent(
+        default_content='{"Instruction": "test response", "Demonstrations": "test response"}'  # noqa: E501
+    )
+    with temp_setattr(api_tools, "default_api_agent", my_agent):
+        prompt_parser = PromptBasedInstructionParser(
+            TaskType.CLASSIFICATION, max_api_calls=3
+        )
+        prompt_spec = MockPromptSpec(TaskType.CLASSIFICATION)
+        prompt_parser.parse_from_prompt(prompt_spec)
+    assert my_agent.generate_one_call_counter == 1
