Add openai client to deepspeedometer #913

Merged
5 commits merged on Aug 21, 2024
@@ -4,16 +4,19 @@
from .dummy_client import DummyClientConfig, DummyClient
from .fastgen_client import FastGenClientConfig, FastGenClient
from .vllm_client import vLLMClientConfig, vLLMClient
from .openai_client import openaiClientConfig, openaiClient

client_config_classes = {
    "dummy": DummyClientConfig,
    "azure_ml": AzureMLClientConfig,
    "fastgen": FastGenClientConfig,
    "vllm": vLLMClientConfig,
    "openai": openaiClientConfig
}
client_classes = {
    "dummy": DummyClient,
    "azure_ml": AzureMLClient,
    "fastgen": FastGenClient,
    "vllm": vLLMClient,
    "openai": openaiClient,
}
@@ -0,0 +1,57 @@
import os
import json
import requests
import subprocess
import time
from typing import Any, Dict

from loguru import logger
from pydantic import Field

from .base import BaseClient
from ..config import BaseConfigModel
from ..prompt import Prompt


# Client for benchmarking any OpenAI-compatible completions API
class openaiClientConfig(BaseConfigModel):
    model: str = Field(..., description="HuggingFace.co model name")
    url: str = "http://127.0.0.1:26500/v1/completions"


class openaiClient(BaseClient):
    def __init__(self, config: openaiClientConfig):
        super().__init__(config)

    def start_service(self) -> None:
        # The target server is expected to already be running; nothing to start.
        pass

    def stop_service(self) -> None:
        # The benchmark does not manage the server lifecycle; nothing to stop.
        pass

    def prepare_request(self, prompt: Prompt) -> Dict[str, Any]:
        api_url = self.config.url
        headers = {
            "User-Agent": "Benchmark Client",
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }
        pload = {
            "prompt": prompt.text,
            "model": self.config.model,
            "n": 1,
            "use_beam_search": False,
            "temperature": 1.0,
            "top_p": 0.9,
            "max_tokens": prompt.max_new_tokens,
            "ignore_eos": False,
        }
        return {"url": api_url, "headers": headers, "json": pload, "timeout": 180}

    def send_request(self, request_kwargs: Dict[str, Any]) -> Any:
        response = requests.post(**request_kwargs)
        output = json.loads(response.content)
        return output

    def process_response(self, raw_response: Any) -> str:
        return raw_response["choices"][0]["text"]
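
For orientation, a minimal sketch of the request lifecycle defined above when driven by hand. The Prompt construction is an assumed illustration (only the text and max_new_tokens fields read by prepare_request are shown), OPENAI_API_KEY is assumed to be set in the environment, and in practice the deepspeedometer runner invokes these methods for you:

# Assumes openaiClientConfig, openaiClient, and Prompt are imported from the deepspeedometer package.
config = openaiClientConfig(model="meta-llama/Llama-2-7b-hf")  # url defaults to http://127.0.0.1:26500/v1/completions
client = openaiClient(config)

prompt = Prompt(text="Hello, world", max_new_tokens=32)   # assumed constructor fields
request_kwargs = client.prepare_request(prompt)
raw_response = client.send_request(request_kwargs)        # POSTs to the completions endpoint
print(client.process_response(raw_response))               # prints choices[0]["text"]
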
76 changes: 75 additions & 1 deletion benchmarks/inference/mii/src/client.py
@@ -131,6 +131,80 @@ def get_response(response: requests.Response) -> List[str]:
)


# Client that talks to an OpenAI-compatible API
def call_openai(
    input_tokens: str, max_new_tokens: int, args: argparse.Namespace
) -> ResponseDetails:

    api_url = args.openai_api_url
    headers = {
        "User-Agent": "Benchmark Client",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {args.openai_api_key}",
    }

    pload = {
        "prompt": input_tokens,
        "model": args.model,
        "n": 1,
        "use_beam_search": False,
        "temperature": 1.0,
        "top_p": 0.9,
        "max_tokens": max_new_tokens,
        "ignore_eos": False,
        "stream": args.stream,
    }

    def clear_line(n: int = 1) -> None:
        LINE_UP = "\033[1A"
        LINE_CLEAR = "\x1b[2K"
        for _ in range(n):
            print(LINE_UP, end=LINE_CLEAR, flush=True)

    def get_streaming_response(
        response: requests.Response, time_last_token
    ) -> Iterable[List[str]]:
        for chunk in response.iter_lines(
            chunk_size=8192, decode_unicode=False, delimiter=b"data:"
        ):
            if chunk:
                plain = chunk.decode("utf-8")
                if plain.strip() == "[DONE]":
                    continue
                data = json.loads(plain)
                output = data["choices"][0]["text"]
                time_now = time.time()
                yield output, time_now - time_last_token
                time_last_token = time_now

    # Non-streaming path; non-streaming support is not yet fully implemented.
    def get_response(response: requests.Response) -> List[str]:
        data = json.loads(response.content)
        output = data["choices"][0]["text"]
        return output

    token_gen_time = []
    start_time = time.time()
    response = requests.post(api_url, headers=headers, json=pload, stream=args.stream)
    if args.stream:
        output = ""
        for h, t in get_streaming_response(response, start_time):
            output += h
            token_gen_time.append(t)
    else:
        output = get_response(response)

    return ResponseDetails(
        generated_tokens=output,
        prompt=input_tokens,
        start_time=start_time,
        end_time=time.time(),
        model_time=0,
        token_gen_time=token_gen_time,
    )
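
For illustration, a hedged sketch of calling call_openai directly rather than through _run_parallel; the argparse.Namespace fields are the ones the function reads above, while the URL, key, and model values are placeholders:

import argparse

example_args = argparse.Namespace(
    model="meta-llama/Llama-2-7b-hf",
    openai_api_url="http://127.0.0.1:26500/v1/completions",
    openai_api_key="sk-...",  # placeholder; supply a real key for a hosted endpoint
    stream=True,              # exercise the streaming path so token_gen_time is populated
)
details = call_openai("Hello, world", max_new_tokens=32, args=example_args)
print(details.generated_tokens)
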


def call_aml(
    input_tokens: str,
    max_new_tokens: int,
@@ -205,7 +279,7 @@ def _run_parallel(
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)

    backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml, "openai": call_openai}
    call_fn = backend_call_fns[args.backend]

    barrier.wait()
9 changes: 6 additions & 3 deletions benchmarks/inference/mii/src/plot_effective_throughput.py
@@ -15,9 +15,10 @@

def get_args():
    parser = argparse.ArgumentParser()
-   parser.add_argument("--backend", type=str, choices=["fastgen", "vllm"], default=["fastgen", "vllm"], \
+   parser.add_argument("--backend", type=str, choices=["fastgen", "vllm", "openai"], default=["fastgen", "vllm"], \
                        nargs="+", help="Specify the backends to generate plots for")
    parser.add_argument("--log_dir", type=Path, default="./results")
+   parser.add_argument("--model", type=str)
    parser.add_argument("--out_dir", type=Path, default="./plots/goodtput")
    parser.add_argument("--sla_prompt_tokens_per_sec", type=int, default=512, help="SLA prompt tokens per second")
    parser.add_argument("--sla_gen_tokens_per_sec", type=int, default=[1, 2, 3, 4, 6, 8], nargs="+", help="SLA generation tokens/s targets")
@@ -76,7 +77,7 @@ def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span):


def validate_prompt_latency_SLA(response_detail, sla_token_gen, f, sla_prompt_tokens_per_sec):
-   tokenizer = get_tokenizer()
+   tokenizer = get_tokenizer(args.model)
    prompt_length = len(tokenizer.tokenize(response_detail.prompt))
    prompt_latency_SLA = prompt_length / sla_prompt_tokens_per_sec
    if prompt_latency_SLA < response_detail.token_gen_time[0]:
@@ -137,7 +138,9 @@ def output_charts(args, model, tp_size, bs, replicas, sla_token_gen, prompt, gen
    ]

    plt_cfg = {'vllm': {'label': 'vLLM', 'marker': 'x', 'color': 'orange'},\
-              'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}}
+              'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}, \
+              'openai': {'label': 'openai-API', 'marker': '+', 'color': 'red'}
+              }

    for f in validate_funcs:
        plt.figure()
11 changes: 7 additions & 4 deletions benchmarks/inference/mii/src/postprocess_results.py
@@ -49,10 +49,13 @@ def parse_args():
    return args


-def get_tokenizer():
+def get_tokenizer(model=None):
    global tokenizer
    if tokenizer is None:
-       tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+       if model is None:
+           tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+       else:
+           tokenizer = AutoTokenizer.from_pretrained(model)
    return tokenizer
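
One behavioral note worth illustrating: because the tokenizer is cached in a module-level global, the model passed on the first call decides which tokenizer every later call returns, regardless of its argument. A small hypothetical sketch:

tok_a = get_tokenizer("meta-llama/Llama-2-7b-hf")  # loads and caches this tokenizer
tok_b = get_tokenizer("some-other/model")          # cache hit; still the Llama-2 tokenizer
assert tok_a is tok_b
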


@@ -78,8 +81,8 @@ def get_summary(args, response_details):

    tokens_per_sec = mean(
        [
-           (len(get_tokenizer().tokenize(r.prompt)) +
-            len(get_tokenizer().tokenize(r.generated_tokens)) if type(r.generated_tokens) == str
+           (len(get_tokenizer(args["model"]).tokenize(r.prompt)) +
+            len(get_tokenizer(args["model"]).tokenize(r.generated_tokens)) if type(r.generated_tokens) == str
            else len(r.generated_tokens))
            / (r.end_time - r.start_time)
            for r in response_details
8 changes: 8 additions & 0 deletions benchmarks/inference/mii/src/server.py
@@ -19,6 +19,7 @@ def start_server(args: argparse.Namespace) -> None:
        "fastgen": start_fastgen_server,
        "vllm": start_vllm_server,
        "aml": start_aml_server,
        "openai": start_openai_server,
    }
    start_fn = start_server_fns[args.backend]
    start_fn(args)
@@ -90,12 +91,16 @@ def start_aml_server(args: argparse.Namespace) -> None:
"AML server start not implemented. Please use Azure Portal to start the server."
)

def start_openai_server(args: argparse.Namespace) -> None:
# openai api has no command to stop server
pass

def stop_server(args: argparse.Namespace) -> None:
stop_server_fns = {
"fastgen": stop_fastgen_server,
"vllm": stop_vllm_server,
"aml": stop_aml_server,
"openai": stop_openai_server,
}
stop_fn = stop_server_fns[args.backend]
stop_fn(args)
@@ -118,6 +123,9 @@ def stop_aml_server(args: argparse.Namespace) -> None:
"AML server stop not implemented. Please use Azure Portal to stop the server."
)

def stop_openai_server(args: argparse.Namespace) -> None:
# openai api has no command to stop server
pass

if __name__ == "__main__":
args = parse_args(server_args=True)
14 changes: 13 additions & 1 deletion benchmarks/inference/mii/src/utils.py
@@ -122,6 +122,18 @@ def parse_args(
        default="./results/",
        help="Directory to save result JSON files",
    )
    client_parser.add_argument(
        "--openai_api_url",
        type=str,
        default=None,
        help="When using the openai API backend, the API URL that points to an OpenAI-compatible completions server",
    )
    client_parser.add_argument(
        "--openai_api_key",
        type=str,
        default=None,
        help="When using the openai API backend, the API key for the given openai_api_url",
    )
    client_parser.add_argument(
        "--aml_api_url",
        type=str,
@@ -156,7 +168,7 @@
    parser.add_argument(
        "--backend",
        type=str,
-       choices=["aml", "fastgen", "vllm"],
+       choices=["aml", "fastgen", "vllm", "openai"],
        default="fastgen",
        help="Which backend to benchmark",
    )