@@ -72,6 +72,7 @@ def generate_one_completion(
         temperature: float = 0,
         presence_penalty: float = 0,
         frequency_penalty: float = 0,
+        token_buffer: int = 300,
     ) -> openai.Completion:
         """Generate a chat completion using an API-based model.

@@ -86,16 +87,21 @@ def generate_one_completion(
             frequency_penalty: Float between -2.0 and 2.0. Positive values penalize new
                 tokens based on their existing frequency in the text so far, decreasing
                 the model's likelihood of repeating the same line verbatim.
+            token_buffer: Number of tokens below the LLM's limit to generate. In case
+                our tokenizer does not exactly match the LLM API service's perceived
+                number of tokens, this prevents service errors. On the other hand, this
+                may lead to generating fewer tokens in the completion than is actually
+                possible.

         Returns:
             An OpenAI-like response object if there were no errors in generation.
             In case of API-specific error, Exception object is captured and returned.
         """
         num_prompt_tokens = count_tokens_from_string(prompt)
         if self.max_tokens:
-            max_tokens = self.max_tokens - num_prompt_tokens
+            max_tokens = self.max_tokens - num_prompt_tokens - token_buffer
         else:
-            max_tokens = 4 * num_prompt_tokens
+            max_tokens = 3 * num_prompt_tokens

         response = completion(  # completion gets the key from os.getenv
             model=self.model_name,
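
In effect, this hunk reserves a safety margin when sizing the completion and falls back to a smaller multiple of the prompt length when no hard limit is known. Below is a minimal standalone sketch of that budgeting logic, assuming a tiktoken-based token counter; `budget_completion_tokens` and `count_tokens_from_string`'s signature here are illustrative stand-ins, and only `token_buffer` and the 3x fallback come from the diff itself:

```python
import tiktoken


def count_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count tokens locally; this may differ slightly from the API's own count."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))


def budget_completion_tokens(
    prompt: str, model_max_tokens: int | None, token_buffer: int = 300
) -> int:
    """Compute a safe max_tokens value for a single completion request.

    Subtracting token_buffer guards against the local count undercounting
    relative to the service's tokenizer, which would otherwise push
    prompt + completion past the context window and trigger an API error.
    """
    num_prompt_tokens = count_tokens_from_string(prompt)
    if model_max_tokens:
        return model_max_tokens - num_prompt_tokens - token_buffer
    # No known context limit: cap the completion at 3x the prompt length.
    return 3 * num_prompt_tokens
```

For example, with a 4096-token context window and a 700-token prompt, the request asks for at most 4096 - 700 - 300 = 3096 completion tokens; without the buffer, a small undercount by the local tokenizer could overflow the window and fail the call.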
@@ -116,6 +122,7 @@ async def generate_batch_completion(
         temperature: float = 1,
         responses_per_request: int = 5,
         requests_per_minute: int = 80,
+        token_buffer: int = 300,
     ) -> list[openai.Completion]:
         """Generate a batch responses from OpenAI Chat Completion API.

@@ -126,6 +133,11 @@ async def generate_batch_completion(
             responses_per_request: Number of responses for each request.
                 i.e. the parameter n of API call.
             requests_per_minute: Number of requests per minute to allow.
+            token_buffer: Number of tokens below the LLM's limit to generate. In case
+                our tokenizer does not exactly match the LLM API service's perceived
+                number of tokens, this prevents service errors. On the other hand, this
+                may lead to generating fewer tokens in the completion than is actually
+                possible.

         Returns:
             List of generated responses.
@@ -183,9 +195,9 @@ async def _throttled_completion_acreate(

         num_prompt_tokens = max(count_tokens_from_string(prompt) for prompt in prompts)
         if self.max_tokens:
-            max_tokens = self.max_tokens - num_prompt_tokens
+            max_tokens = self.max_tokens - num_prompt_tokens - token_buffer
         else:
-            max_tokens = 4 * num_prompt_tokens
+            max_tokens = 3 * num_prompt_tokens

         async_responses = [
             _throttled_completion_acreate(
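
The batch path applies the same rule, except the budget is sized by the longest prompt in the batch so that a single max_tokens value is safe for every request. A rough sketch, reusing the hypothetical helpers from the earlier example:

```python
def budget_batch_completion_tokens(
    prompts: list[str], model_max_tokens: int | None, token_buffer: int = 300
) -> int:
    """Pick one max_tokens value that fits even the longest prompt in the batch."""
    num_prompt_tokens = max(count_tokens_from_string(p) for p in prompts)
    if model_max_tokens:
        return model_max_tokens - num_prompt_tokens - token_buffer
    return 3 * num_prompt_tokens
```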