Refactor api keys and naming (#331)
* Refactor usage of API keys

* More refactoring

* Add missing files

* Fix wording

* Redo prompt template

* Add "if using OpenAI"

* Update file

* Remove noqa

* Added comment
neubig authored Sep 7, 2023
1 parent e368960 commit a21ef8d
Showing 23 changed files with 341 additions and 558 deletions.
11 changes: 3 additions & 8 deletions README.md
@@ -32,14 +32,9 @@ pip install prompt2model

`Prompt2Model` supports various platforms such as OpenAI, Anthropic, Huggingface, etc. using [LiteLLM](https://github.com/BerriAI/litellm).

-To use OpenAI, please follow these
-
-- Sign up on the OpenAI website and obtain an
-OpenAI API key.
-
-- Set
-the environment variable
-`OPENAI_API_KEY` to your API key by running
+If you are using OpenAI models (such as the default `gpt-3.5-turbo`), please obtain an
+OpenAI API key on their [website](https://platform.openai.com/) then set
+the environment variable `OPENAI_API_KEY` to your API key by running
the following command in your terminal:

```bash
export OPENAI_API_KEY=<your key>
```
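`Prompt2Model` routes other providers through LiteLLM in the same way; set the environment variable that matches your provider instead. The names below are taken from the `validate_environment` helper that this commit removes (see the diff further down), so they reflect this codebase rather than LiteLLM's full list:

```bash
# Examples of provider keys recognized elsewhere in this codebase.
export ANTHROPIC_API_KEY="<your key>"
export COHERE_API_KEY="<your key>"
export TOGETHERAI_API_KEY="<your key>"
```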
4 changes: 2 additions & 2 deletions prompt2model/dataset_generator/__init__.py
@@ -1,10 +1,10 @@
"""Import DatasetGenerator classes."""
from prompt2model.dataset_generator.base import DatasetGenerator, DatasetSplit
from prompt2model.dataset_generator.mock import MockDatasetGenerator
-from prompt2model.dataset_generator.openai_gpt import OpenAIDatasetGenerator
+from prompt2model.dataset_generator.prompt_based import PromptBasedDatasetGenerator

__all__ = (
-    "OpenAIDatasetGenerator",
+    "PromptBasedDatasetGenerator",
    "MockDatasetGenerator",
    "DatasetGenerator",
    "DatasetSplit",
prompt2model/dataset_generator/{openai_gpt.py → prompt_based.py}
@@ -1,11 +1,10 @@
"""A simple dataset generator that uses OpenAI's GPT-3.5 API."""
"""A simple dataset generator that uses APIs."""

-from __future__ import annotations  # noqa FI58
+from __future__ import annotations

import asyncio
import json
import math
-import os
import random
from collections import Counter, defaultdict
from dataclasses import dataclass
@@ -17,14 +16,14 @@
from tqdm import tqdm

from prompt2model.dataset_generator.base import DatasetGenerator, DatasetSplit
-from prompt2model.dataset_generator.openai_gpt_template import construct_meta_prompt
+from prompt2model.dataset_generator.prompt_template import construct_meta_prompt
from prompt2model.prompt_parser import PromptSpec
from prompt2model.utils import (
-    OPENAI_ERRORS,
-    ChatGPTAgent,
+    API_ERRORS,
+    APIAgent,
    count_tokens_from_string,
    get_formatted_logger,
-    handle_openai_error,
+    handle_api_error,
)

nest_asyncio.apply()
@@ -39,12 +38,11 @@ class Example:
output_col: str


-class OpenAIDatasetGenerator(DatasetGenerator):
-    """A abstract class for NLP dataset generation using OpenAI's GPT-3.5 API."""
+class PromptBasedDatasetGenerator(DatasetGenerator):
+    """A abstract class for NLP dataset generation using a prompted API."""

def __init__(
self,
-api_key: str | None = None,
max_api_calls: int = None,
initial_temperature: float = 0.5,
max_temperature: float = 1.7,
@@ -56,11 +54,9 @@ def __init__(
filter_duplicated_examples: bool = True,
cache_root: str = "cached_generated_dataset",
):
"""Initializes an instance of the OpenAI DatasetGenerator.
"""Initializes an instance of the PromptBasedDatasetGenerator.
Args:
-api_key: A valid OpenAI API key. If not provided, the environment
-    variable OPENAI_API_KEY is used.
max_api_calls: The maximum number of API calls allowed,
or None for unlimited.
initial_temperature: The sampling temperature to use when initializing
@@ -81,29 +77,22 @@ def __init__(
cache_root: The root directory for caching generated examples.
Raises:
-ValueError: If an API key is not provided and set as an environment
-    variable, or if the 'max_api_calls' value is not greater than 0.
+ValueError: If the 'max_api_calls' value is not greater than 0.
Note:
-For the OpenAI GPT-3.5 API, Temperature ranges from 0 to 2. Higher
+Temperature ranges from 0 to 2. Higher
values yield more random/diverse outputs with lower quality, while
lower values produce more deterministic outputs with higher quality.
We use a strategy to dynamically adjust the temperature from
initial_temperature to max_temperature during generation.
-We incorporate random few-shot generated examples into the prompt
-to the OpenAI GPT-3.5 API. The initial temperature is set lower to obtain
+We incorporate random few-shot generated examples into the prompt.
+The initial temperature is set lower to obtain
high-quality, low-diversity examples. As the number of generated examples
increases, we gradually have more high-quality examples for in-context
learning during generation. This allows us to achieve high-quality,
high-diversity examples later on by using a higher temperature.
"""
-self.api_key: str | None = api_key if api_key else self.validate_environment()
-if self.api_key is None or self.api_key == "":
-    raise ValueError(
-        "API key must be provided or set the environment variable "
-        "e.g. `export OPENAI_API_KEY=<your key>`."
-    )
if max_api_calls and max_api_calls <= 0:
raise ValueError("max_api_calls must be > 0")
self.max_api_calls = max_api_calls
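The temperature schedule described in the docstring is not shown in these hunks; a minimal sketch consistent with that description (a linear ramp from `initial_temperature` to `max_temperature` as generation progresses, clipped to the [0, 2] range used in `generate_responses` below) is:

```python
def sketch_dynamic_temperature(
    initial_temperature: float,
    max_temperature: float,
    generated_num: int,
    expected_num_examples: int,
) -> float:
    """Hypothetical schedule: ramp temperature up as generation progresses."""
    progress = min(1.0, generated_num / max(1, expected_num_examples))
    temperature = initial_temperature + progress * (max_temperature - initial_temperature)
    # Clip to the valid sampling range, as the generate_responses hunk below does.
    return max(0.0, min(2.0, temperature))
```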
@@ -130,35 +119,6 @@ def __init__(
self.filter_duplicated_examples = filter_duplicated_examples
self.cache_root = Path(cache_root)

-def validate_environment(self):
-    """Check if any of the required API keys are present in the environment.
-    Returns:
-        str or None: The API key value if found in the environment, else None.
-    """
-    api_key = None
-    if "OPENAI_API_KEY" in os.environ:
-        api_key = os.getenv("OPENAI_API_KEY")
-    elif "ANTHROPIC_API_KEY" in os.environ:
-        api_key = os.getenv("ANTHROPIC_API_KEY")
-    elif "REPLICATE_API_KEY" in os.environ:
-        api_key = os.getenv("REPLICATE_API_KEY")
-    elif "AZURE_API_KEY" in os.environ:
-        api_key = os.getenv("AZURE_API_KEY")
-    elif "COHERE_API_KEY" in os.getenv("COHERE_API_KEY"):
-        api_key = os.getenv("COHERE_API_KEY")
-    elif "TOGETHERAI_API_KEY" in os.environ:
-        api_key = os.getenv("TOGETHERAI_API_KEY")
-    elif "BASETEN_API_KEY" in os.environ:
-        api_key = os.getenv("BASETEN_API_KEY")
-    elif "AI21_API_KEY" in os.environ:
-        api_key = os.getenv("AI21_API_KEY")
-    elif "OPENROUTER_API_KEY" in os.environ:
-        api_key = os.getenv("OPENROUTER_API_KEY")
-    elif "ALEPHALPHA_API_KEY" in os.environ:
-        api_key = os.getenv("ALEPHALPHA_API_KEY")
-    return api_key

def construct_prompt(
self,
instruction: str,
@@ -273,7 +233,7 @@ def construct_input_output_map(
symbol of gold?”, the outputs might be “Au”, “Au”, and “AU”, where the
last one is wrong due to capital letters.
-To address this, OpenAIDataSetGenerator uses a two-step multi-vote
+To address this, PromptBasedDatasetGenerator uses a two-step multi-vote
filtering mechanism. This function represents the first step, creating a
dictionary to map inputs to a `Counter` of their outputs.
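A minimal sketch of this first step, assuming only the `input_col`/`output_col` fields of `Example` and the `Counter`/`defaultdict` imports already present in this module:

```python
from collections import Counter, defaultdict

def sketch_input_output_map(examples: list) -> dict:
    # Map each generated input to a Counter of the outputs seen for it,
    # e.g. {"What's the symbol of gold?": Counter({"Au": 2, "AU": 1})}.
    input_output_map: dict[str, Counter] = defaultdict(Counter)
    for example in examples:  # each example has input_col / output_col fields
        input_output_map[example.input_col][example.output_col] += 1
    return input_output_map
```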
@@ -442,7 +402,7 @@ def create_all_examples_dataset_and_generated_dataset(
def compute_batch_size(
self, expected_num_examples: int, generated_dataset: Dataset
) -> int:
"""Computes the batch size for OpenAI API calls in a batch.
"""Computes the batch size for API calls in a batch.
The batch size is determined based on the remaining number of examples to be
generated and the number of responses per request. The function also respects
@@ -456,7 +416,7 @@ def compute_batch_size(
generated_dataset: Currently generated dataset.
Returns:
-The batch size for the next batch of OpenAI API calls with zeno-build.
+The batch size for the next batch of API calls with zeno-build.
"""
# If max_api_calls is not set, make it equivalent to the batch size
max_api_calls = (
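The body of the computation is collapsed at this point; one plausible reading of the documented rule (cover the remaining examples at `responses_per_request` responses per call, capped by the maximum batch size and any remaining API-call budget) is:

```python
from __future__ import annotations

import math

def sketch_compute_batch_size(
    expected_num_examples: int,
    generated_num: int,
    responses_per_request: int,
    max_batch_size: int,
    remaining_api_calls: int | None,
) -> int:
    """Hypothetical helper re-stating the documented rule; not the committed code."""
    # Requests needed to cover the examples that are still missing.
    remaining_examples = max(0, expected_num_examples - generated_num)
    needed_requests = math.ceil(remaining_examples / responses_per_request)
    # Never exceed the configured cap or the remaining API-call budget.
    batch_size = min(max_batch_size, needed_requests)
    if remaining_api_calls is not None:  # None means unlimited calls
        batch_size = min(batch_size, remaining_api_calls)
    return batch_size
```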
@@ -482,10 +442,10 @@ def compute_batch_size(
def extract_responses(
self, completions: list[openai.Completion], generated_examples: list[Example]
) -> list[Example]:
"""Extracts the generated sample and annotation from an OpenAI API response.
"""Extracts the generated sample and annotation from an API response.
Args:
-completions: A list of Completion objects returned by the OpenAI API.
+completions: A list of Completion objects returned by the API.
Each API call returns a number of completion objects equivalent to
`responses_per_request`. The default `responses_per_request` = 5.
generated_examples: Currently generated examples of DatasetGenerator.
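Per the prompt template in the next file, each completion choice should be a JSON dictionary with `input` and `output` keys; a hedged sketch of the per-choice parsing (the committed method also handles logging and duplicate filtering) is:

```python
from __future__ import annotations

import json

def sketch_parse_choice(choice_text: str) -> tuple[str, str] | None:
    """Hypothetical parser: return (input, output) for a well-formed choice."""
    try:
        response = json.loads(choice_text)
        return str(response["input"]), str(response["output"])
    except (json.JSONDecodeError, KeyError, TypeError):
        # Malformed completions are skipped by the caller.
        return None
```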
@@ -576,15 +536,15 @@ def extract_responses(

async def generate_responses(
self,
-chat_api: ChatGPTAgent,
+chat_api: APIAgent,
generated_dataset: Dataset,
expected_num_examples: int,
prompts: list[str],
) -> list[openai.Completion]:
"""Asynchronously generates responses using the GPT-3.5 API.
Args:
-chat_api: ChatGPTAgent to generate responses.
+chat_api: APIAgent to generate responses.
generated_dataset: Currently generated dataset.
expected_num_examples: The number of examples expected
to be generated.
@@ -609,7 +569,7 @@ async def generate_responses(

# Ensure the dynamic temperature is within the range [0, 2.0]
clipped_temperature = max(0.0, min(2.0, dynamic_temperature))
-responses = await chat_api.generate_batch_openai_chat_completion(
+responses = await chat_api.generate_batch_completion(
prompts,
temperature=clipped_temperature,
responses_per_request=self.responses_per_request,
@@ -664,7 +624,7 @@ def generate_dataset_split(
generated_examples = []

pbar = tqdm(total=expected_num_examples, desc="Generating examples")
-chat_api = ChatGPTAgent(self.api_key)
+chat_api = APIAgent()

while True:
# Each API call will return `responses_per_request` completion
@@ -724,7 +684,7 @@ def generate_dataset_split(
generated_examples = self.extract_responses(
responses, generated_examples
)
-except OPENAI_ERRORS as e:
-    # Handle OpenAI API errors and adjust the API call counter.
-    self.api_call_counter = handle_openai_error(e, self.api_call_counter)
+except API_ERRORS as e:
+    # Handle API errors and adjust the API call counter.
+    self.api_call_counter = handle_api_error(e, self.api_call_counter)
return generated_dataset
prompt2model/dataset_generator/{openai_gpt_template.py → prompt_template.py}
@@ -114,7 +114,7 @@
# Prompt_template must contains `instruction` and `examples` fields.
# The COMPLEX_PROMPT_TEMPLATE is used when random_example_num < 5.
# The SIMPLE_PROMPT_TEMPLATE is used when random_example_num >= 5.
-# To save the price of calling OPENAI's API.
+# To save the price of making API calls.

META_PROMPT = """
As a DatasetGenerator, your task is to generate a new example (`input` and `output`) based on the [new instruction] and [few-shot examples]. Please provide a JSON dictionary response that includes the new `input` and its corresponding `output`. Use the `input` and `output` keys in the dictionary. The 'input' field should be marked as 'N/A' if the instruction doesn't require additional input.
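For instance, a well-formed completion for this meta-prompt (reusing the gold-symbol example from the generator docstring above) would be:

```json
{"input": "What is the chemical symbol of gold?", "output": "Au"}
```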
17 changes: 6 additions & 11 deletions prompt2model/dataset_generator/readme.md
@@ -5,26 +5,21 @@
- `DatasetGenerator`: An abstract class to generate datasets.
- `DatasetSplit`: An enumeration class defining dataset types (`TRAIN`,
`VALIDATION`, `TEST`).
-- `OpenAIDatasetGenerator`: A concrete class
+- `PromptBasedDatasetGenerator`: A concrete class
for dataset generation using GPT-3.5 API.

## Getting Started

- **Import the Modules**:

```python
-from prompt2model.dataset_generator import OpenAIDatasetGenerator, DatasetSplit
-from prompt2model.prompt_parser import OpenAIInstructionParser, TaskType
+from prompt2model.dataset_generator import PromptBasedDatasetGenerator, DatasetSplit
+from prompt2model.prompt_parser import PromptBasedInstructionParser, TaskType
```

-- **Setup OpenAI API Key**:
+- **Setup API Key**:

-```python
-api_key = "<your-api-key>"
-dataset_generator = OpenAIDatasetGenerator(api_key)
-```
-
-Or, set as an environment variable:
+Set an API key as an environment variable. For instance, if using OpenAI:

```bash
export OPENAI_API_KEY="<your-api-key>"
@@ -33,7 +28,7 @@ export OPENAI_API_KEY="<your-api-key>"
- **Parse the Prompt**:

```python
-prompt_spec = OpenAIInstructionParser(task_type=TaskType.<task_type>)
+prompt_spec = PromptBasedInstructionParser(task_type=TaskType.<task_type>)
# Refer the document string of DatasetSplit for more details.
prompt = "<your-prompt>"
prompt_spec.parse_from_prompt(prompt)
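- **Generate a Dataset Split**: the remainder of this walkthrough is collapsed in the diff; the sketch below reuses names from this commit (`PromptBasedDatasetGenerator`, `DatasetSplit.TRAIN`, `generate_dataset_split`), but the exact call signature is an assumption, not confirmed by the visible hunks:

```python
dataset_generator = PromptBasedDatasetGenerator()
# Hypothetical call; argument names are illustrative and unverified.
dataset = dataset_generator.generate_dataset_split(
    prompt_spec, expected_num_examples=100, split=DatasetSplit.TRAIN
)
```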
6 changes: 3 additions & 3 deletions prompt2model/demo_creator/create.py
@@ -5,17 +5,17 @@

from prompt2model.dataset_processor import TextualizeProcessor
from prompt2model.model_executor import GenerationModelExecutor
-from prompt2model.prompt_parser import OpenAIInstructionParser
+from prompt2model.prompt_parser import PromptBasedInstructionParser


def create_gradio(
-model_executor: GenerationModelExecutor, prompt_parser: OpenAIInstructionParser
+model_executor: GenerationModelExecutor, prompt_parser: PromptBasedInstructionParser
) -> gr.Blocks:
"""Create a Gradio interface automatically.
Args:
model_executor: A GenerationModelExecutor to expose via a Gradio interface.
-prompt_parser: An instance of OpenAIInstructionParser to parse the prompt.
+prompt_parser: An instance of PromptBasedInstructionParser to parse the prompt.
Returns:
A Gradio interface for interacting with the model.
4 changes: 2 additions & 2 deletions prompt2model/demo_creator/readme.md
@@ -11,15 +11,15 @@ interface for model interactions.

```python
from prompt2model.model_executor import GenerationModelExecutor
-from prompt2model.prompt_parser import OpenAIInstructionParser
+from prompt2model.prompt_parser import PromptBasedInstructionParser
from prompt2model.gradio_interface import create_gradio
```

- **Initialize Components**:

```python
model_executor = GenerationModelExecutor(...)
-prompt_parser = OpenAIInstructionParser(...)
+prompt_parser = PromptBasedInstructionParser(...)
# Refer to the documentation of ModelExecutor and PromptParser for details.
```

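- **Create the Interface**: the closing step is collapsed in the diff; `create_gradio(model_executor, prompt_parser)` matches the signature shown in the `create.py` hunk above, and `.launch()` is the standard way to serve a Gradio `Blocks` app:

```python
interface = create_gradio(model_executor, prompt_parser)
interface.launch()  # serve the demo locally
```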
8 changes: 1 addition & 7 deletions prompt2model/model_retriever/description_based_retriever.py
@@ -64,7 +64,6 @@ def __init__(
use_bm25: bool = True,
bm25_index_name: str = "model-index",
use_HyDE: bool = False,
-openai_api_key: str | None = None,
):
"""Initialize a dual-encoder retriever against a search index.
@@ -83,8 +82,6 @@ def __init__(
bm25_index_name: The name used to save the search index for BM25.
use_HyDE: Whether to use HyDE to replace the query with a hypothetical
model description generated by an LLM.
-openai_api_key: OpenAI API key. If None, default to using the
-    OPENAI_API_KEY environment variable.
"""
self.search_depth = search_depth
self.first_stage_depth = first_stage_depth
Expand All @@ -96,7 +93,6 @@ def __init__(
# Otherwise, we use a dual-encoder retriever.
self.use_bm25 = use_bm25
self.use_HyDE = use_HyDE
-self.openai_api_key = openai_api_key

# Blocklist certain models' organizations to exclude from model retrieval
# search results; certain organizations programmatically create models which
@@ -247,9 +243,7 @@ def retrieve(
self.encode_model_descriptions(self._search_index_path)

if self.use_HyDE:
-query_text = generate_hypothetical_model_description(
-    prompt, self.openai_api_key
-)
+query_text = generate_hypothetical_model_description(prompt)
else:
query_text = prompt.instruction
