Commit
🚀[NEW] Support vLLM.LLM ; small fixes
fairyshine committed Oct 2, 2024
1 parent 4c816e5 commit 2338c6f
Showing 8 changed files with 99 additions and 110 deletions.
4 changes: 2 additions & 2 deletions DEVELOPMENT.md
@@ -18,11 +18,11 @@ https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py#L1836

# OpenAI



https://platform.openai.com/docs/api-reference/chat/create

# vLLM

https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py#L312

# Model

4 changes: 2 additions & 2 deletions README.md
@@ -75,7 +75,7 @@ curl http://127.0.0.1:8000/model/add_info \
-H "Authorization: Bearer sk-19992001" \
-d '{
"model_name": "gemma2",
"model_type": "TransformersCausalLM",
"model_type": "Transformers_CausalLM",
"model_path": ".../PTM/gemma-2-2b"
}'

@@ -129,7 +129,7 @@ client = FM.Client(IP="x.x.x.x", PORT=xxx, API_KEY="sk-19992001")
model_info_list = [
{
"model_name": "gemma2",
"model_type": "TransformersCausalLM",
"model_type": "Transformers_CausalLM",
"model_path": ".../PTM/gemma-2-2b"
},
]
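
Since this commit adds vLLM.LLM support, the same registration pattern presumably extends to vLLM-backed models. A minimal sketch, assuming a hypothetical "vLLM_LLM" model_type string (the actual identifier is not shown in this diff):

model_info_list = [
    {
        "model_name": "gemma2-vllm",
        "model_type": "vLLM_LLM",  # hypothetical type string, not confirmed by this diff
        "model_path": ".../PTM/gemma-2-2b"
    },
]
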
Binary file modified asset/DevelopmentStatistics.xlsx
Binary file not shown.
1 change: 1 addition & 0 deletions src/fastmindapi/model/llama_cpp/LLM.py
@@ -7,6 +7,7 @@ class LlamacppLLM:
def __init__(self,
model):
self.model = model
self.model_name = None

@classmethod
def from_path(cls,
1 change: 0 additions & 1 deletion src/fastmindapi/model/openai/ChatModel.py
@@ -11,7 +11,6 @@ def __init__(self,
self.client = client
self.system_prompt = system_prompt
self.model_name = model_name
pass

@classmethod
def from_client(cls,
4 changes: 3 additions & 1 deletion src/fastmindapi/model/transformers/CausalLM.py
@@ -8,8 +8,10 @@ def __init__(self,
model):
self.tokenizer = tokenizer
self.model = model
self.model_name = None

self.model.eval()
pass


@classmethod
def from_path(cls,
2 changes: 1 addition & 1 deletion src/fastmindapi/model/transformers/PeftModel.py
@@ -6,7 +6,7 @@ def __init__(self, base_model: TransformersCausalLM,
self.raw_model = base_model.model
self.tokenizer = base_model.tokenizer
self.model = peft_model
pass
self.model_name = None

@classmethod
def from_path(cls, base_model: TransformersCausalLM,
193 changes: 90 additions & 103 deletions src/fastmindapi/model/vllm/LLM.py
@@ -4,104 +4,87 @@

class vLLMLLM:
def __init__(self,
tokenizer,
model):
self.tokenizer = tokenizer
self.model = model
# self.model.eval()
pass
self.tokenizer = self.model.get_tokenizer()
self.model_name = None

@classmethod
def from_path(cls,
model_path: str):
from vllm import LLM
return cls(AutoTokenizer.from_pretrained(model_path, trust_remote_code=True),
AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map="auto"))
return cls(LLM(model=model_path, trust_remote_code=True))

def __call__(self,
input_text: str,
max_new_tokens: int = None):
import torch
inputs = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)
with torch.no_grad():
outputs = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
full_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
# output_text = full_text[len(input_text):]
re_inputs = self.tokenizer.batch_decode(inputs.input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
output_text = full_text[len(re_inputs):]
from vllm import SamplingParams
outputs = self.model.generate([input_text], SamplingParams(**({ "max_tokens": max_new_tokens } if max_new_tokens else {})))
output_text = outputs[0].outputs[0].text
return output_text

def generate(self,
input_text: str,
max_new_tokens: Optional[int] = None,
return_logits: Optional[bool] = None,
logits_top_k: Optional[int] = None,
stop_strings: Optional[list[str]] = None,
config: Optional[dict] = None):
import torch
import torch.nn.functional as F

inputs = self.tokenizer(input_text, return_tensors='pt').to(self.model.device) # shape: (1, sequence_length)
input_id_list = inputs.input_ids[0].tolist()
input_token_list = [self.tokenizer.decode([token_id]) for token_id in input_id_list]

with torch.no_grad():
generate_kwargs = {"generation_config": clean_dict_null_value(config) if config else None,
"max_new_tokens": max_new_tokens,
"stop_strings": stop_strings}
outputs = self.model.generate(inputs.input_ids,
**clean_dict_null_value(generate_kwargs),
tokenizer=self.tokenizer)
full_id_list = outputs[0].tolist()
full_token_list = [self.tokenizer.decode([token_id]) for token_id in full_id_list]
full_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
from vllm import SamplingParams
sampling_kwargs = {
"max_tokens": max_new_tokens,
"logprobs": logits_top_k if return_logits else None,
"prompt_logprobs": logits_top_k if return_logits else None,
"stop": stop_strings,
"repetition_penalty": (config["repetition_penalty"] if "repetition_penalty" in config else None) if config else None,
"temperature": (config["temperature"] if "temperature" in config else None) if config else None,
"top_p": (config["top_p"] if "top_p" in config else None) if config else None,
"top_k": (config["top_k"] if "top_k" in config else None) if config else None,
}
outputs = self.model.generate([input_text], SamplingParams(**clean_dict_null_value(sampling_kwargs)))

# output_text = full_text[len(input_text):]
re_inputs = self.tokenizer.batch_decode(inputs.input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
output_text = full_text[len(re_inputs):]
output_text = outputs[0].outputs[0].text
full_text = input_text + output_text

input_id_list = outputs[0].prompt_token_ids
output_id_list = list(outputs[0].outputs[0].token_ids)
full_id_list = input_id_list + output_id_list

full_token_list = [self.tokenizer.decode([token_id]) for token_id in full_id_list]
input_token_list = full_token_list[:len(input_id_list)]

logits_list = None
if return_logits:
# Get the model's output logits
fulls = self.tokenizer(full_text, return_tensors='pt')
with torch.no_grad():
logits = self.model(**fulls).logits # shape: (batch_size, sequence_length, vocab_size)
probabilities = F.softmax(logits, dim=-1) # shape: (1, sequence_length, vocab_size)

# Use torch.topk over the vocab_size dimension to get the top_k logits and token IDs
topk_logits, topk_tokens = torch.topk(logits, k=logits_top_k, dim=-1) # shape: (batch_size, sequence_length, top_k)
topk_probs, topk_tokens2 = torch.topk(probabilities, k=logits_top_k, dim=-1) # shape: (batch_size, sequence_length, top_k)
assert(torch.equal(topk_tokens,topk_tokens2))

# Extract batch_size and sequence_length
_, sequence_length, _ = topk_tokens.shape
assert sequence_length == len(full_id_list)

# Iterate over each position and print the top_k tokens and logits
import math
raw_input_logits_list = outputs[0].prompt_logprobs
raw_output_logits_list = outputs[0].outputs[0].logprobs
raw_logits_list = raw_input_logits_list + raw_output_logits_list

logits_list = [{"id": full_id_list[0], "token": full_token_list[0]}]
for i in range(sequence_length-1):
token_id = full_id_list[i+1]
token = full_token_list[i+1]
# print(f"Position {i} (Token: {repr(token)}):")
for i in range(1, len(full_id_list)):
token_id = full_id_list[i]
token = full_token_list[i]
raw_info_dict = raw_logits_list[i]
logits = {
"id": token_id,
"token": token,
"pred_id": [],
"pred_token": [],
"logits": [],
"probs": [],
# "logprobs": []
"pred_id": [None]*logits_top_k,
"pred_token": [None]*logits_top_k,
# "logits": [],
"probs": [None]*logits_top_k,
"logprobs": [None]*logits_top_k
}
for j in range(logits_top_k):
pred_token_id = topk_tokens[0, i, j].item()
pred_token = self.tokenizer.decode([pred_token_id])
logit = topk_logits[0, i, j].item()
prob = topk_probs[0, i, j].item()
# print(f" Top {j+1}: Token ID={pred_token_id}, Token={repr(pred_token)}, Logit={logit:.4f}, Prob={prob:.4%}")
logits["pred_id"].append(pred_token_id)
logits["pred_token"].append(pred_token)
logits["logits"].append(round(logit,4))
logits["probs"].append(round(prob,4))
for chosen_token_id in raw_info_dict:
raw_info = raw_info_dict[chosen_token_id]
rank = raw_info.rank
if rank <= logits_top_k:
logprob = raw_info.logprob
decoded_token = raw_info.decoded_token

logits["pred_id"][rank-1] = chosen_token_id
logits["pred_token"][rank-1] = decoded_token
logits["probs"][rank-1] = round(math.exp(logprob),4)
logits["logprobs"][rank-1] = round(logprob,4)
logits_list.append(logits)

generation_output = {"output_text": output_text,
@@ -111,7 +94,8 @@ def generate(self,
"full_id_list": full_id_list,
"full_token_list": full_token_list,
"full_text": full_text,
"logits": logits_list}
"logits": logits_list
}

return generation_output
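
For context, a minimal standalone sketch of the vLLM logprob structures the loop above consumes (the model path is a placeholder): each entry of prompt_logprobs and of outputs[0].logprobs maps a token id to a Logprob object carrying logprob, rank, and decoded_token.

from vllm import LLM, SamplingParams

llm = LLM(model="/path/to/model")  # placeholder path
params = SamplingParams(max_tokens=8, logprobs=3, prompt_logprobs=3)
out = llm.generate(["Hello, world"], params)[0]

# out.prompt_logprobs[0] is None (no logprob for the first prompt token);
# every other entry, like each entry of out.outputs[0].logprobs, is a dict
# mapping token_id -> Logprob(logprob=..., rank=..., decoded_token=...).
for token_id, info in out.outputs[0].logprobs[0].items():
    print(token_id, info.decoded_token, info.rank, info.logprob)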

@@ -121,55 +105,58 @@ def chat(self,
logprobs: Optional[bool] = None,
top_logprobs: Optional[int] = None,
stop: Optional[list[str]] = None):
import torch
import time

# Convert the message list into input text
input_text = ""
for message in messages:
role = message.role
content = message.content
input_text += f"{role}: {content}\n"
input_text += "assistant: "

inputs = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)

generate_kwargs = {
"max_new_tokens": max_completion_tokens,
"stop_strings": stop
from vllm import SamplingParams
sampling_kwargs = {
"max_tokens": max_completion_tokens,
"logprobs": top_logprobs if logprobs else None,
"stop": stop,
}

with torch.no_grad():
outputs = self.model.generate(**inputs,
**clean_dict_null_value(generate_kwargs))

full_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
re_inputs = self.tokenizer.batch_decode(inputs.input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
output_text = full_text[len(re_inputs):]
# Post-process output_text
if output_text.lower().startswith("assistant:"):
output_text = output_text[len("assistant:"):].strip()
outputs = self.model.chat(messages, SamplingParams(**clean_dict_null_value(sampling_kwargs)))

openai_logprobs = None
if logprobs:
openai_logprobs = []
for token_prob in outputs[0].outputs[0].logprobs:
probs = {
"token": token_prob[next(iter(token_prob))].decoded_token,
"logprob": token_prob[next(iter(token_prob))].logprob,
"top_logprobs": [None]*top_logprobs
}
for chosen_token_id in token_prob:
rank = token_prob[chosen_token_id].rank
if rank <= top_logprobs:
top_prob = {
"token": token_prob[chosen_token_id].decoded_token,
"logprob": token_prob[chosen_token_id].logprob
}
probs["top_logprobs"][rank-1] = top_prob
openai_logprobs.append(probs)

choices = []
choices.append({
"index": 0,
"message": {
"role": "assistant",
"content": output_text
"content": outputs[0].outputs[0].text
},
"finish_reason": "stop"
"logprobs": openai_logprobs,
"finish_reason": outputs[0].outputs[0].finish_reason
})

prompt_token_length = len(outputs[0].prompt_token_ids)
completion_token_length = len(list(outputs[0].outputs[0].token_ids))

response = {
"id": f"chatcmpl-{int(time.time())}",
"object": "chat.completion",
"created": int(time.time()),
"model": self.model.config.name_or_path,
"model": self.model_name,
"choices": choices,
"usage": {
"prompt_tokens": inputs.input_ids.shape[1],
"completion_tokens": sum(len(self.tokenizer.encode(text)) for text in output_text),
"total_tokens": inputs.input_ids.shape[1] + sum(len(self.tokenizer.encode(text)) for text in output_text)
"prompt_tokens": prompt_token_length,
"completion_tokens": completion_token_length,
"total_tokens": prompt_token_length + completion_token_length
}
}
return response
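
A minimal usage sketch of the refactored wrapper, assuming the module path inferred from the file location and placeholder model path/name:

from fastmindapi.model.vllm.LLM import vLLMLLM  # path inferred from src/fastmindapi/model/vllm/LLM.py

model = vLLMLLM.from_path("/path/to/gemma-2-2b")  # wraps vllm.LLM(model=..., trust_remote_code=True)
model.model_name = "gemma2"  # __init__ leaves model_name as None, so the caller sets it

print(model("The capital of France is", max_new_tokens=16))

result = model.generate("The capital of France is",
                        max_new_tokens=16,
                        return_logits=True,
                        logits_top_k=3)
print(result["output_text"])
print(result["logits"][1])  # per-token top-k predicted ids/tokens, probs, and logprobs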
