
Commit 8971fdd

⚠️[FIX] Fix /chat/completion with max_completion_tokens and stop parameters
1 parent d1a64d1 commit 8971fdd

4 files changed: +17 −7 lines

src/fastmindapi/model/llama_cpp/LLM.py

Lines changed: 3 additions & 2 deletions
@@ -69,9 +69,10 @@ def generate(self,

         return generation_output

-    def chat(self, messages: list[ChatMessage], max_completion_tokens: int = None, logprobs: bool = False, top_logprobs: int = 10):
+    def chat(self, messages: list[ChatMessage], max_completion_tokens: int = None, logprobs: bool = False, top_logprobs: int = 10, stop: list[str] = None):
         response = self.model.create_chat_completion(messages,
                                                      max_tokens=max_completion_tokens,
                                                      logprobs=logprobs,
-                                                     top_logprobs=top_logprobs if logprobs else None)
+                                                     top_logprobs=top_logprobs if logprobs else None,
+                                                     stop=stop)
         return response
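
For reference, a minimal sketch of how the forwarded stop list behaves in llama-cpp-python; the model path and stop strings below are placeholders, not part of this commit. create_chat_completion cuts the completion off as soon as any stop string would be produced.

```python
from llama_cpp import Llama

# Placeholder GGUF path; use any local model you actually have.
llm = Llama(model_path="./models/example.gguf")

response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Count from 1 to 10, one number per line."}],
    max_tokens=64,   # corresponds to max_completion_tokens in the wrapper
    stop=["5"],      # generation halts once "5" would be emitted
)
print(response["choices"][0]["message"]["content"])
```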

src/fastmindapi/model/openai/ChatModel.py

Lines changed: 3 additions & 2 deletions
@@ -61,14 +61,15 @@ def generate(self,
                              "logits": logits_list}
         return generation_output

-    def chat(self, messages: list[ChatMessage], max_completion_tokens: int = None, logprobs: bool = False, top_logprobs: int =10):
+    def chat(self, messages: list[ChatMessage], max_completion_tokens: int = None, logprobs: bool = False, top_logprobs: int =10, stop: list[str] = None):
         try:
             completion = self.client.chat.completions.create(
                 model= self.model_name,
                 messages=messages,
-                max_completion_tokens=max_completion_tokens,
+                max_tokens=max_completion_tokens,
                 logprobs=logprobs,
                 top_logprobs=top_logprobs if logprobs else None,
+                stop=stop
             )
             return completion.model_dump()
         except Exception as e:
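
As context for the change from max_completion_tokens to max_tokens (the older name is accepted more widely by OpenAI-compatible backends), here is a hedged sketch of the equivalent direct call with the openai client, using the same argument set the wrapper now forwards. The model name and stop strings are placeholders.

```python
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY (or a compatible base_url) is configured

completion = client.chat.completions.create(
    model="gpt-4o-mini",  # placeholder model name
    messages=[{"role": "user", "content": "List three colors, comma separated."}],
    max_tokens=32,
    logprobs=True,
    top_logprobs=5,
    stop=[","],           # the first comma ends the completion early
)
print(completion.model_dump()["choices"][0]["message"]["content"])
```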

src/fastmindapi/model/transformers/CausalLM.py

Lines changed: 3 additions & 2 deletions
@@ -105,7 +105,7 @@ def generate(self,

         return generation_output

-    def chat(self, messages: list[ChatMessage], max_completion_tokens: int = None, logprobs: bool = False, top_logprobs: int = 10):
+    def chat(self, messages: list[ChatMessage], max_completion_tokens: int = None, logprobs: bool = False, top_logprobs: int = 10, stop: list[str] = None):
         import torch
         import time

@@ -118,9 +118,10 @@ def chat(self, messages: list[ChatMessage], max_completion_tokens: int = None, l
         input_text += "assistant: "

         inputs = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)
-
+
         generate_kwargs = {
             "max_new_tokens": max_completion_tokens,
+            "stop_strings": stop
         }

         with torch.no_grad():
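
One caveat for the transformers path: in recent transformers releases, stop_strings is only honored by generate() when a tokenizer is also passed to it, otherwise the call raises a ValueError. The diff does not show the generate() call itself, so whether the tokenizer is already forwarded there is not visible here. A minimal sketch under that assumption, with a placeholder model name:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "gpt2"  # placeholder; any causal LM works the same way
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)

inputs = tokenizer("user: tell me a story\nassistant: ", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=64,
        stop_strings=["user:"],  # stop before the model opens a new turn
        tokenizer=tokenizer,     # required whenever stop_strings is set
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```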

src/fastmindapi/server/router/openai.py

Lines changed: 8 additions & 1 deletion
@@ -7,10 +7,14 @@ class ChatMessage(BaseModel):
     role: str
     content: str

+
 class ChatRequest(BaseModel):
     model: str
     messages: list[ChatMessage]
     max_completion_tokens: int = None
+    logprobs: bool = False
+    top_logprobs: int = 10
+    stop: list[str] = None

     model_config=ConfigDict(protected_namespaces=())

@@ -24,7 +28,10 @@ def chat_completions(request: Request, item: ChatRequest):

     outputs = server.module["model"].loaded_models[item.model].chat(
         messages=item.messages,
-        max_completion_tokens=item.max_completion_tokens
+        max_completion_tokens=item.max_completion_tokens,
+        logprobs=item.logprobs,
+        top_logprobs=item.top_logprobs,
+        stop=item.stop
     )
     return outputs
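
End to end, the new request fields can be exercised against the route roughly as follows; the host, port, route path, model name, and any auth header the server may require are assumptions, not taken from this commit.

```python
import requests

payload = {
    "model": "my_model",  # placeholder for a loaded model name
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_completion_tokens": 64,
    "logprobs": False,
    "top_logprobs": 10,
    "stop": ["\n\n"],     # new field, forwarded to the selected backend
}
resp = requests.post("http://127.0.0.1:8000/chat/completions", json=payload)
print(resp.json())
```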
