From ef8a3603696928a07991f9bb1d7dedc2f2b2a9c7 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 12 Jan 2024 17:56:53 -0800 Subject: [PATCH 01/14] make mii benchmark support multiple models, some refactoring --- benchmarks/inference/mii/__init__.py | 0 .../{run_benchmark_client.py => client.py} | 199 +++++++++--------- .../inference/mii/postprocess_results.py | 68 ++++-- benchmarks/inference/mii/run_all.sh | 27 +-- benchmarks/inference/mii/run_all_vllm.sh | 26 --- benchmarks/inference/mii/run_benchmark.py | 96 +++++++++ .../inference/mii/run_benchmark_client.sh | 23 -- benchmarks/inference/mii/server.py | 156 ++++++++------ benchmarks/inference/mii/utils.py | 131 ++++++++++++ 9 files changed, 472 insertions(+), 254 deletions(-) create mode 100644 benchmarks/inference/mii/__init__.py rename benchmarks/inference/mii/{run_benchmark_client.py => client.py} (53%) delete mode 100644 benchmarks/inference/mii/run_all_vllm.sh create mode 100644 benchmarks/inference/mii/run_benchmark.py delete mode 100644 benchmarks/inference/mii/run_benchmark_client.sh create mode 100644 benchmarks/inference/mii/utils.py diff --git a/benchmarks/inference/mii/__init__.py b/benchmarks/inference/mii/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/inference/mii/run_benchmark_client.py b/benchmarks/inference/mii/client.py similarity index 53% rename from benchmarks/inference/mii/run_benchmark_client.py rename to benchmarks/inference/mii/client.py index caf20351e..c3072e1ad 100644 --- a/benchmarks/inference/mii/run_benchmark_client.py +++ b/benchmarks/inference/mii/client.py @@ -1,15 +1,10 @@ import os import time import random -import argparse import queue import multiprocessing import threading -from statistics import mean -from dataclasses import dataclass, asdict from typing import List, Iterable -from pathlib import Path -from datetime import datetime import numpy as np from transformers import AutoTokenizer @@ -20,51 +15,9 @@ import asyncio import requests -from postprocess_results import get_summary, ResponseDetails - -MAX_PROMPT_LENGTH = 4000 -PROMPT_LENGTH_VAR = 0.3 -MAX_NEW_TOKENS_VAR = 0.3 - -def parse_args(): - parser = argparse.ArgumentParser(description="Benchmark MII services") - parser.add_argument("-k", - "--max_new_tokens", - type=int, - default=60, - help="min and max num tokens argument for huggingface") - parser.add_argument("-d", - "--deployment_name", - type=str, - default="benchmark_deployment") - parser.add_argument("-n", - "--num_queries", - type=int, - help="number of queries to run", - default=10) - parser.add_argument("-w", - "--warmup", - type=int, - help="number of queries for warming up", - default=1) - parser.add_argument("-c", - "--client_num", - type=int, - help="number of parallel client processes", - default=2) - parser.add_argument("-l", - "--prompt_length", - type=int, - default=2600) - parser.add_argument('--use_thread', action='store_true', - help='use thread to run parallel clients, otherwise use multiprocessing', - default=False) - parser.add_argument('--stream', action='store_true', default=True) - parser.add_argument('--vllm', action='store_true', default=False) - parser.add_argument('-o', '--out_json_path', type=Path, default=None) - - args = parser.parse_args() - return args +from postprocess_results import ResponseDetails + +from utils import parse_args, output_summary def call_mii(client, input_tokens, max_new_tokens, stream): @@ -85,11 +38,10 @@ def callback(response): if stream: output_tokens = [] client.generate( - input_tokens, 
max_new_tokens=max_new_tokens, - streaming_fn=callback) + input_tokens, max_new_tokens=max_new_tokens, streaming_fn=callback + ) else: - result = client.generate( - input_tokens, max_new_tokens=max_new_tokens) + result = client.generate(input_tokens, max_new_tokens=max_new_tokens) output_tokens = result[0].generated_text return ResponseDetails( @@ -98,7 +50,8 @@ def callback(response): start_time=start_time, end_time=time.time(), model_time=0, - token_gen_time=token_gen_time) + token_gen_time=token_gen_time, + ) def call_vllm(input_tokens, max_new_tokens, stream=True): @@ -114,15 +67,19 @@ def call_vllm(input_tokens, max_new_tokens, stream=True): "ignore_eos": False, "stream": stream, } + def clear_line(n: int = 1) -> None: - LINE_UP = '\033[1A' - LINE_CLEAR = '\x1b[2K' + LINE_UP = "\033[1A" + LINE_CLEAR = "\x1b[2K" for _ in range(n): print(LINE_UP, end=LINE_CLEAR, flush=True) - def get_streaming_response(response: requests.Response, time_last_token) -> Iterable[List[str]]: - for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, - delimiter=b"\0"): + def get_streaming_response( + response: requests.Response, time_last_token + ) -> Iterable[List[str]]: + for chunk in response.iter_lines( + chunk_size=8192, decode_unicode=False, delimiter=b"\0" + ): if chunk: data = json.loads(chunk.decode("utf-8")) output = data["text"][0] @@ -149,13 +106,23 @@ def get_response(response: requests.Response) -> List[str]: start_time=start_time, end_time=time.time(), model_time=0, - token_gen_time=token_gen_time) + token_gen_time=token_gen_time, + ) else: output = get_response(response) raise NotImplementedError("Not implemented for non-streaming") -def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, client_num, stream, vllm): +def _run_parallel( + deployment_name, + warmup, + barrier, + query_queue, + result_queue, + num_clients, + stream, + vllm, +): pid = os.getpid() session_id = f"test_session_p{pid}_t{threading.get_ident()}" @@ -163,6 +130,7 @@ def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, c asyncio.set_event_loop(event_loop) if not vllm: import mii + client = mii.client(deployment_name) barrier.wait() @@ -178,7 +146,7 @@ def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, c barrier.wait() - time.sleep(random.uniform(0, client_num) * 0.01) + time.sleep(random.uniform(0, num_clients) * 0.01) try: while not query_queue.empty(): print(f"queue size: {query_queue.qsize()} ({pid})", flush=True) @@ -197,16 +165,30 @@ def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, c print(f"Worker ({pid}) finished. session_id: {session_id}") -def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_queries, warmup, stream, vllm, use_thread=False): +def run_client( + num_clients, + model, + deployment_name, + mean_prompt_length, + mean_max_new_tokens, + num_requests, + warmup, + max_prompt_length, + prompt_length_var, + max_new_tokens_var, + stream, + vllm, + use_thread, +): """ Run MII client for benchmarking. The scenario is a bit complicated: - 1. The main process puts `num_queries` queries into the input queue + 1. The main process puts `num_requests` queries into the input queue 2. Each client runs `warmup` iterations () taking the queries from the input queue 3. --- barrier --- 4. The main process marks the start time - 5a. All clients send `num_queries' query in total and put the results into the result queue + 5a. 
All clients send `num_requests' query in total and put the results into the result queue 5b. The main process takes the results from the result queue (in parallel with 5a) - 6. The main process marks the end time after receiving `num_queries' results + 6. The main process marks the end time after receiving `num_requests' results """ if use_thread: @@ -218,23 +200,44 @@ def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_q barrier_cls = multiprocessing.Barrier queue_cls = multiprocessing.Queue - barrier = barrier_cls(client_num + 1) + barrier = barrier_cls(num_clients + 1) query_queue = queue_cls() result_queue = queue_cls() - processes = [runnable_cls(target=_run_parallel, - args=(deployment_name, warmup, barrier, query_queue, result_queue, client_num, stream, vllm)) - for i in range(client_num)] + processes = [ + runnable_cls( + target=_run_parallel, + args=( + deployment_name, + warmup, + barrier, + query_queue, + result_queue, + num_clients, + stream, + vllm, + ), + ) + for i in range(num_clients) + ] for p in processes: p.start() - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + tokenizer = AutoTokenizer.from_pretrained(model) query_generator = RandomQueryGenerator(all_text, tokenizer, seed=42) - MAX_PROMPT_LENGTH = 4000 - request_text = query_generator.get_random_request_text(prompt_length, prompt_length*PROMPT_LENGTH_VAR, MAX_PROMPT_LENGTH, num_queries + warmup*client_num) + request_text = query_generator.get_random_request_text( + mean_prompt_length, + mean_prompt_length * prompt_length_var, + max_prompt_length, + num_requests + warmup * num_clients, + ) for t in request_text: - req_max_new_tokens = int(np.random.normal(max_new_tokens, MAX_NEW_TOKENS_VAR*max_new_tokens)) + req_max_new_tokens = int( + np.random.normal( + mean_max_new_tokens, max_new_tokens_var * mean_max_new_tokens + ) + ) query_queue.put((t, req_max_new_tokens)) # Tokenizers must be initialized after fork. 
@@ -245,41 +248,37 @@ def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_q barrier.wait() response_details = [] - while len(response_details) < num_queries: + while len(response_details) < num_requests: res = result_queue.get() # vLLM returns concatinated tokens if vllm: all_tokens = tokenizer.tokenize(res.generated_tokens) - res.generated_tokens = all_tokens[len(tokenizer.tokenize(res.prompt)):] + res.generated_tokens = all_tokens[len(tokenizer.tokenize(res.prompt)) :] response_details.append(res) return response_details + if __name__ == "__main__": - args = parse_args() - print(args) + args = parse_args(client_args=True) if args.out_json_path is not None and not args.out_json_path.parent.exists(): raise ValueError(f"Parent directory of {args.out_json_path}") - response_details = run_client(args.client_num, args.deployment_name, - args.prompt_length, - args.max_new_tokens, args.num_queries, args.warmup, - args.stream, args.vllm, args.use_thread) - - args_dict = vars(args) - ps = get_summary(args_dict, response_details) - print(f"Deployment: {args.deployment_name} Clients: {args.client_num}, " - + f"Prompt (mean): {args.prompt_length} tokens, " - + f"Generation (mean): {args.max_new_tokens} tokens, " - + f"Query throughput: {ps.throughput:.3f} queries/s, " - + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " - + f"Query latency: {ps.latency:.3f} s, " - + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " - + f"First token received: {ps.first_token_latency:.3f} s") - - if args.out_json_path is not None: - with open(args.out_json_path, "w") as f: - args_dict["out_json_path"] = str(args.out_json_path) # Path is not JSON serializable - data = {"args": args_dict, "time": str(datetime.now()), "response_details": [asdict(r) for r in response_details]} - json.dump(data, f, indent=2) + response_details = run_client( + num_clients=args.num_clients, + model=args.model, + deployment_name=args.deployment_name, + mean_prompt_length=args.mean_prompt_length, + mean_max_new_tokens=args.mean_max_new_tokens, + num_requests=args.num_requests, + warmup=args.warmup, + max_prompt_length=args.max_prompt_length, + prompt_length_var=args.prompt_length_var, + max_new_tokens_var=args.max_new_tokens_var, + stream=args.stream, + vllm=args.vllm, + use_thread=args.use_thread, + ) + + output_summary(args, response_details) diff --git a/benchmarks/inference/mii/postprocess_results.py b/benchmarks/inference/mii/postprocess_results.py index cb2000d5f..b898f7c8b 100644 --- a/benchmarks/inference/mii/postprocess_results.py +++ b/benchmarks/inference/mii/postprocess_results.py @@ -31,10 +31,10 @@ class ProfilingSummary: first_token_latency: float tokens_per_sec: float - + def parse_args(): parser = argparse.ArgumentParser(description="Postprocess results") - parser.add_argument('-i', '--input_path', type=Path, default="results.json") + parser.add_argument("-i", "--input_path", type=Path, default="results.json") args = parser.parse_args() return args @@ -44,13 +44,13 @@ def get_tokenizer(): global tokenizer if tokenizer is None: tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - return tokenizer + return tokenizer def read_json(file_path): - with open(file_path, 'r') as f: + with open(file_path, "r") as f: data = json.load(f) - + args = data["args"] response_details = [] @@ -61,34 +61,56 @@ def read_json(file_path): def get_summary(args, response_details): - client_num = args["client_num"] + num_clients = args["num_clients"] # Calculate latency and 
throughput using P95 latency latency = mean([r.end_time - r.start_time for r in response_details]) - throughput = client_num / latency - - tokens_per_sec = mean([(len(get_tokenizer().tokenize(r.prompt)) + len(r.generated_tokens)) / (r.end_time - r.start_time) for r in response_details]) + throughput = num_clients / latency + + tokens_per_sec = mean( + [ + (len(get_tokenizer().tokenize(r.prompt)) + len(r.generated_tokens)) + / (r.end_time - r.start_time) + for r in response_details + ] + ) first_token_latency = mean([r.token_gen_time[0] for r in response_details]) - token_gen_latency_flat = reduce(list.__add__, [r.token_gen_time[1:-1] for r in response_details if len(r.token_gen_time) > 2]) + token_gen_latency_flat = reduce( + list.__add__, + [r.token_gen_time[1:-1] for r in response_details if len(r.token_gen_time) > 2], + ) token_gen_latency = mean([t for t in token_gen_latency_flat]) - return ProfilingSummary(throughput, latency, token_gen_latency, first_token_latency, tokens_per_sec) + return ProfilingSummary( + throughput, latency, token_gen_latency, first_token_latency, tokens_per_sec + ) -def get_token_latency(response_details, percentile=None, variance=False, cumulative=False): +def get_token_latency( + response_details, percentile=None, variance=False, cumulative=False +): req_latencies = [r.token_gen_time for r in response_details] if cumulative: - req_latencies = [np.cumsum(np.array(r.token_gen_time)).tolist() for r in response_details] + req_latencies = [ + np.cumsum(np.array(r.token_gen_time)).tolist() for r in response_details + ] max_gen_length = max([len(r.generated_tokens) for r in response_details]) latency = [] for i in range(max_gen_length): if variance: - token_latency_step = np.var([latency[i] for latency in req_latencies if len(latency) > i]) + token_latency_step = np.var( + [latency[i] for latency in req_latencies if len(latency) > i] + ) if percentile is None: - token_latency_step = [latency[i] for latency in req_latencies if len(latency) > i] + token_latency_step = [ + latency[i] for latency in req_latencies if len(latency) > i + ] else: - token_latency_step = np.percentile([latency[i] for latency in req_latencies if len(latency) > i], percentile) + token_latency_step = np.percentile( + [latency[i] for latency in req_latencies if len(latency) > i], + percentile, + ) latency.append(token_latency_step) @@ -104,9 +126,11 @@ def get_token_acc_latency(response_details, percentile=99): prof_args, response_details = read_json(args.input_path) ps = get_summary(prof_args, response_details) - print(f"Deployment: {prof_args['deployment_name']} Clients: {prof_args['client_num']}, " - + f"Query throughput: {ps.throughput:.3f} queries/s, " - + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " - + f"Query latency: {ps.latency:.3f} s, " - + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " - + f"First token received: {ps.first_token_latency:.3f} s") + print( + f"Deployment: {prof_args['deployment_name']} Clients: {prof_args['num_clients']}, " + + f"Query throughput: {ps.throughput:.3f} queries/s, " + + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " + + f"Query latency: {ps.latency:.3f} s, " + + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " + + f"First token received: {ps.first_token_latency:.3f} s" + ) diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh index ca504a6c9..cbec569d9 100644 --- a/benchmarks/inference/mii/run_all.sh +++ b/benchmarks/inference/mii/run_all.sh @@ -1,25 
+1,6 @@ -RAGGED_BATCH_SIZE=768 -PARAM_SIZES=(7b 13b 70b) +MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-180B) -declare -A TP_SIZES -TP_SIZES["7b"]="1" -TP_SIZES["13b"]="1:2:4" -TP_SIZES["70b"]="4:8" - -for PARAM_SIZE in ${PARAM_SIZES[@]}; do - - IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} - for TP in ${TP_VALUES[@]}; do - DEPLOYMENT_NAME=llama2-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE} - python server.py --model_name meta-llama/Llama-2-${PARAM_SIZE}-hf -d ${DEPLOYMENT_NAME} -m ${TP} -b ${RAGGED_BATCH_SIZE} start - - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh - - echo "Stopping server" - python server.py -d ${DEPLOYMENT_NAME} stop - sleep 120 - done +for MODEL in ${MODELS[@]}; do + python run_benchmark.py --model ${MODEL} --use_defaults --stream + python run_benchmark.py --model ${MODEL} --use_defaults --stream --vllm done diff --git a/benchmarks/inference/mii/run_all_vllm.sh b/benchmarks/inference/mii/run_all_vllm.sh deleted file mode 100644 index 572377f13..000000000 --- a/benchmarks/inference/mii/run_all_vllm.sh +++ /dev/null @@ -1,26 +0,0 @@ -RAGGED_BATCH_SIZE=768 -PARAM_SIZES=(7b 13b 70b) - -declare -A TP_SIZES -TP_SIZES["7b"]="1" -TP_SIZES["13b"]="1:2:4" -TP_SIZES["70b"]="4:8" - -for PARAM_SIZE in ${PARAM_SIZES[@]}; do - - IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} - for TP in ${TP_VALUES[@]}; do - DEPLOYMENT_NAME=vllm-llama2-${PARAM_SIZE}-tp${TP} - python -m vllm.entrypoints.api_server --host 127.0.0.1 --port 26500 --tensor-parallel-size ${TP} --model meta-llama/Llama-2-${PARAM_SIZE}-hf & - sleep 60 - - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh - - echo "Stopping server" - pkill -u ${USER} -f vllm.entrypoints.api_server - sleep 30 - done -done diff --git a/benchmarks/inference/mii/run_benchmark.py b/benchmarks/inference/mii/run_benchmark.py new file mode 100644 index 000000000..90ac9b986 --- /dev/null +++ b/benchmarks/inference/mii/run_benchmark.py @@ -0,0 +1,96 @@ +import argparse +import itertools +from typing import Dict, List + +from utils import parse_args, get_deployment_name, output_summary +from server import start_server, stop_server +from client import run_client + +MODEL_DEFAULTS = { + "meta-llama/Llama-2-7b-hf": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": (1,), + "num_clients": (1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32), + }, + "meta-llama/Llama-13b-hf": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": (1, 2, 4), + "num_clients": (1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32), + }, + "meta-llama/Llama-2-70b-hf": { + "max_prompt_length": 4000, + 
"mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": (4, 8), + "num_clients": (1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32), + }, + "tiiuae/falcon-180B": { + "max_prompt_length": 2000, + "mean_prompt_length": (1200, 1900), + "mean_max_new_tokens": (60, 128), + "tp_size": (8,), + "num_clients": (1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32), + }, +} + + +def get_args_product(args: argparse.Namespace, which: List[str] = None) -> List[Dict]: + if which is None: + which = list(vars(args).keys()) + arg_values_product = itertools.product(*[getattr(args, k) for k in which]) + return [ + {k: v for k, v in zip(which, arg_values)} for arg_values in arg_values_product + ] + + +def run_benchmark() -> None: + args = parse_args(server_args=True, client_args=True) + + if args.use_defaults: + for k, v in MODEL_DEFAULTS[args.model].items(): + setattr(args, k, v) + + # Args to enumerate over for benchmarks + server_arg_names = ["tp_size", "max_ragged_batch_size", "num_replicas"] + client_arg_names = [ + "mean_prompt_length", + "mean_max_new_tokens", + "num_clients", + "num_requests", + ] + + # Run MII benchmarks + for server_args in get_args_product(args, which=server_arg_names): + if args.deployment_name is None: + args.deployment_name = get_deployment_name(model=args.model, **server_args) + start_server( + model=args.model, + deployment_name=args.deployment_name, + vllm=args.vllm, + **server_args, + ) + + for client_args in get_args_product(args, which=client_arg_names): + response_details = run_client( + model=args.model, + deployment_name=args.deployment_name, + max_prompt_length=args.max_prompt_length, + prompt_length_var=args.prompt_length_var, + max_new_tokens_var=args.max_new_tokens_var, + warmup=args.warmup, + use_thread=args.use_thread, + stream=args.stream, + vllm=args.vllm, + **client_args, + ) + output_summary(args, response_details) + + stop_server(deployment_name=args.deployment_name, vllm=args.vllm) + + +if __name__ == "__main__": + run_benchmark() diff --git a/benchmarks/inference/mii/run_benchmark_client.sh b/benchmarks/inference/mii/run_benchmark_client.sh deleted file mode 100644 index 318e9092e..000000000 --- a/benchmarks/inference/mii/run_benchmark_client.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -DEPLOYMENT_NAME=${DEPLOYMENT_NAME:-llama2-7b} -VLLM=${VLLM:-""} - -CLIENT_NUMS=${CLIENT_NUMS:-1 2 4 6 8 12 16 20 24 28 32} -MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-60} -PROMPT_LENGTH=${PROMPT_LENGTH:-3072} -REQUEST_NUM=${REQUEST_NUM:-512} - -LOG_DIR=logs.${DEPLOYMENT_NAME} -mkdir -p ${LOG_DIR} - -for client_num in ${CLIENT_NUMS[@]}; do - RESULT_FILE=${DEPLOYMENT_NAME}_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.json - - python run_benchmark_client.py -w 1 \ - -d ${DEPLOYMENT_NAME} -n ${REQUEST_NUM} -c ${client_num} \ - -k ${MAX_NEW_TOKENS} -l ${PROMPT_LENGTH} \ - -o ${LOG_DIR}/${RESULT_FILE} \ - ${VLLM} --stream \ - 2>&1 | tee ${LOG_DIR}/bench_client_num_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.log -done diff --git a/benchmarks/inference/mii/server.py b/benchmarks/inference/mii/server.py index 2e6164187..c10c106b3 100644 --- a/benchmarks/inference/mii/server.py +++ b/benchmarks/inference/mii/server.py @@ -2,82 +2,118 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team -import mii -import argparse - -from mii.constants import DeploymentType +import subprocess +import time +import mii from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig from deepspeed.inference.v2.ragged import DSStateManagerConfig -def 
start_server(model_name, - deployment_name, - task, - tensor_parallel, - replica_num, - max_ragged_batch_size): - tp_config = DeepSpeedTPConfig(tp_size=tensor_parallel) - mgr_config = DSStateManagerConfig(max_ragged_batch_size=max_ragged_batch_size, max_ragged_sequence_count=max_ragged_batch_size) - inference_config = RaggedInferenceEngineConfig(tensor_parallel=tp_config, - state_manager=mgr_config) +from utils import parse_args + + +def start_server( + model, deployment_name, tp_size, num_replicas, max_ragged_batch_size, vllm +): + if vllm: + start_vllm_server(model=model, tp_size=tp_size) + else: + start_mii_server( + model=model, + deployment_name=deployment_name, + tp_size=tp_size, + num_replicas=num_replicas, + max_ragged_batch_size=max_ragged_batch_size, + ) + + +def start_vllm_server(model: str, tp_size: int) -> None: + vllm_cmd = ( + "python", + "-m", + "vllm.entrypoints.api_server", + "--host", + "127.0.0.1", + "--port", + "26500", + "--tensor-parallel-size", + str(tp_size), + "--model", + model, + ) + p = subprocess.Popen(vllm_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + start_time = time.time() + timeout_after = 60 * 2 # 2 minutes + while True: + line = p.stderr.readline().decode("utf-8") + if "Application startup complete" in line: + break + time.sleep(1) + if time.time() - start_time > timeout_after: + p.terminate() + stop_vllm_server() + raise TimeoutError("Timed out waiting for VLLM server to start") + + +def start_mii_server( + model, deployment_name, tp_size, num_replicas, max_ragged_batch_size +): + tp_config = DeepSpeedTPConfig(tp_size=tp_size) + mgr_config = DSStateManagerConfig( + max_ragged_batch_size=max_ragged_batch_size, + max_ragged_sequence_count=max_ragged_batch_size, + ) + inference_config = RaggedInferenceEngineConfig( + tensor_parallel=tp_config, state_manager=mgr_config + ) mii.serve( - model_name, + model, deployment_name=deployment_name, - tensor_parallel=tensor_parallel, - task=task, + tensor_parallel=tp_size, inference_engine_config=inference_config, - replica_num=replica_num + replica_num=num_replicas, ) -def stop_server(deployment_name): - mii.client(deployment_name).terminate_server() +def stop_server(deployment_name, vllm): + if vllm: + stop_vllm_server() + else: + stop_mii_server(deployment_name) -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", - type=str, - default="meta-llama/Llama-2-7b-hf", - help="Name of the model in the model_files to benchmark") - parser.add_argument("-d", - "--deployment_name", - type=str, - default="benchmark_deployment") - parser.add_argument("-t", "--task", type=str, - help="Task type. 
Currently only text-generation is supported", - default="text-generation") - parser.add_argument("-m", - "--tensor_parallel", - type=int, - help="Degree of tensor (model) parallelism", - default=1) - parser.add_argument("-b", - "--ragged_batch_size", - type=int, - help="Max batch size for ragged batching", - default=768) - parser.add_argument("-r", - "--replica_num", - type=int, - help="Number of replicas for load balancing", - default=1) - parser.add_argument("cmd", help="start, stop, or restart") - return parser.parse_args() + +def stop_vllm_server(): + vllm_cmd = ("pkill", "-f", "vllm.entrypoints.api_server") + p = subprocess.Popen(vllm_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.wait() + + +def stop_mii_server(deployment_name): + mii.client(deployment_name).terminate_server() if __name__ == "__main__": - args = parse_args() + args = parse_args(server_args=True) if args.cmd == "start": - start_server(args.model_name, - args.deployment_name, - args.task, - args.tensor_parallel, - args.replica_num, - args.ragged_batch_size) + start_server( + model=args.model, + deployment_name=args.deployment_name, + tp_size=args.tp_size, + num_replicas=args.num_replicas, + max_ragged_batch_size=args.max_ragged_batch_size, + vllm=args.vllm, + ) elif args.cmd == "stop": - print("running stop") - stop_server(args.deployment_name) + stop_server(deployment_name=args.deployment_name, vllm=args.vllm) else: - raise ValueError(f"Unknown command: {args.cmd}") + stop_server(deployment_name=args.deployment_name, vllm=args.vllm) + start_server( + model=args.model, + deployment_name=args.deployment_name, + tp_size=args.tp_size, + num_replicas=args.num_replicas, + max_ragged_batch_size=args.max_ragged_batch_size, + vllm=args.vllm, + ) diff --git a/benchmarks/inference/mii/utils.py b/benchmarks/inference/mii/utils.py new file mode 100644 index 000000000..94a3df9f0 --- /dev/null +++ b/benchmarks/inference/mii/utils.py @@ -0,0 +1,131 @@ +import argparse +from pathlib import Path +from postprocess_results import get_summary, ResponseDetails +import json +from datetime import datetime +from dataclasses import asdict + +# For these arguments, users can provide multiple values when running the +# benchmark. The benchmark will iterate over all possible combinations. 
+SERVER_PARAMS = ["tp_size", "max_ragged_batch_size", "replica_num"] +CLIENT_PARAMS = [ + "mean_prompt_length", + "mean_max_new_tokens", + "num_clients", + "num_requests", +] + + +def parse_args( + server_args: bool = False, client_args: bool = False +) -> argparse.Namespace: + if not (server_args or client_args): + raise ValueError("Must specify server_args or client_args or both") + + # Server args + server_parser = argparse.ArgumentParser(add_help=False) + server_parser.add_argument("--tp_size", type=int, nargs="+", default=[1]) + server_parser.add_argument( + "--max_ragged_batch_size", type=int, nargs="+", default=[768] + ) + server_parser.add_argument("--num_replicas", type=int, nargs="+", default=[1]) + server_parser.add_argument( + "--cmd", type=str, choices=["start", "stop", "restart"], default="start" + ) + + # Client args + client_parser = argparse.ArgumentParser(add_help=False) + client_parser.add_argument( + "--mean_prompt_length", type=int, nargs="+", default=[2600] + ) + client_parser.add_argument( + "--mean_max_new_tokens", type=int, nargs="+", default=[60] + ) + client_parser.add_argument("--num_clients", type=int, nargs="+", default=[2]) + client_parser.add_argument("--num_requests", type=int, nargs="+", default=[512]) + client_parser.add_argument("--max_prompt_length", type=int, default=4000) + client_parser.add_argument("--prompt_length_var", type=float, default=0.3) + client_parser.add_argument("--max_new_tokens_var", type=float, default=0.3) + client_parser.add_argument("--warmup", type=int, default=1) + client_parser.add_argument("--use_thread", action="store_true") + client_parser.add_argument("--stream", action="store_true") + client_parser.add_argument("--out_json_path", type=Path, default=None) + + # Create the parser, inheriting from the server and/or client parsers + parents = [] + if server_args: + parents.append(server_parser) + if client_args: + parents.append(client_parser) + + # Common args + parser = argparse.ArgumentParser(parents=parents) + parser.add_argument("--model", type=str, default="meta-llama/Llama-2-7b-hf") + parser.add_argument("--deployment_name", type=str, default=None) + parser.add_argument("--vllm", action="store_true") + parser.add_argument("--use_defaults", action="store_true") + + # Parse arguments + args = parser.parse_args() + + if server_args and not client_args: + # If running server, make sure only single values were passed for parameters + for param in SERVER_PARAMS: + if len(getattr(args, param)) > 1: + raise ValueError( + f"Cannot specify multiple values for {param} when running server" + ) + setattr(args, param, getattr(args, param)[0]) + + if client_args and not server_args: + # If running client, make sure only single values were passed for parameters + for param in CLIENT_PARAMS: + if len(getattr(args, param)) > 1: + raise ValueError( + f"Cannot specify multiple values for {param} when running client" + ) + setattr(args, param, getattr(args, param)[0]) + + if not (client_args and server_args): + # Generate deployment name if not provided + if args.deployment_name is None: + args.deployment_name = get_deployment_name( + model=args.model, + tp_size=args.tp_size, + max_ragged_batch_size=args.max_ragged_batch_size, + ) + + return args + + +def get_deployment_name( + model: str, tp_size: int, max_ragged_batch_size: int, num_replicas: int +) -> str: + return f"{model}-tp{tp_size}-b{max_ragged_batch_size}-r{num_replicas}" + + +def output_summary(args, response_details): + args_dict = vars(args) + ps = get_summary(args_dict, 
response_details) + print( + f"Deployment: {args.deployment_name} Clients: {args.num_clients}, " + + f"Prompt (mean): {args.mean_prompt_length} tokens, " + + f"Generation (mean): {args.mean_max_new_tokens} tokens, " + + f"Query throughput: {ps.throughput:.3f} queries/s, " + + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " + + f"Query latency: {ps.latency:.3f} s, " + + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " + + f"First token received: {ps.first_token_latency:.3f} s" + ) + + if args.out_json_path is not None: + with open(args.out_json_path, "w") as f: + args_dict["out_json_path"] = str( + args.out_json_path + ) # Path is not JSON serializable + data = { + "args": args_dict, + "time": str(datetime.now()), + "response_details": [asdict(r) for r in response_details], + } + json.dump(data, f, indent=2) From 5b42685b82a4d9253c929170f4e2a9f7400a0892 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Jan 2024 12:40:43 -0800 Subject: [PATCH 02/14] fix some bugs, add model defaults --- benchmarks/inference/mii/client.py | 52 ++++------ benchmarks/inference/mii/model_defaults.py | 44 +++++++++ benchmarks/inference/mii/run_all.sh | 10 +- benchmarks/inference/mii/run_benchmark.py | 100 ++++--------------- benchmarks/inference/mii/run_example.sh | 15 ++- benchmarks/inference/mii/server.py | 51 +++++----- benchmarks/inference/mii/utils.py | 110 +++++++++++---------- 7 files changed, 185 insertions(+), 197 deletions(-) create mode 100644 benchmarks/inference/mii/model_defaults.py diff --git a/benchmarks/inference/mii/client.py b/benchmarks/inference/mii/client.py index c3072e1ad..cbfb2b582 100644 --- a/benchmarks/inference/mii/client.py +++ b/benchmarks/inference/mii/client.py @@ -17,7 +17,7 @@ from postprocess_results import ResponseDetails -from utils import parse_args, output_summary +from utils import parse_args, output_summary, get_args_product, CLIENT_PARAMS def call_mii(client, input_tokens, max_new_tokens, stream): @@ -165,21 +165,7 @@ def _run_parallel( print(f"Worker ({pid}) finished. session_id: {session_id}") -def run_client( - num_clients, - model, - deployment_name, - mean_prompt_length, - mean_max_new_tokens, - num_requests, - warmup, - max_prompt_length, - prompt_length_var, - max_new_tokens_var, - stream, - vllm, - use_thread, -): +def run_client(args): """ Run MII client for benchmarking. The scenario is a bit complicated: 1. The main process puts `num_requests` queries into the input queue @@ -191,6 +177,21 @@ def run_client( 6. 
The main process marks the end time after receiving `num_requests' results """ + # Unpack arguments + model = args.model + deployment_name = args.deployment_name + mean_prompt_length = args.mean_prompt_length + mean_max_new_tokens = args.mean_max_new_tokens + num_clients = args.num_clients + num_requests = args.num_requests + warmup = args.warmup + max_prompt_length = args.max_prompt_length + prompt_length_var = args.prompt_length_var + max_new_tokens_var = args.max_new_tokens_var + stream = args.stream + vllm = args.vllm + use_thread = args.use_thread + if use_thread: runnable_cls = threading.Thread barrier_cls = threading.Barrier @@ -265,20 +266,7 @@ def run_client( if args.out_json_path is not None and not args.out_json_path.parent.exists(): raise ValueError(f"Parent directory of {args.out_json_path}") - response_details = run_client( - num_clients=args.num_clients, - model=args.model, - deployment_name=args.deployment_name, - mean_prompt_length=args.mean_prompt_length, - mean_max_new_tokens=args.mean_max_new_tokens, - num_requests=args.num_requests, - warmup=args.warmup, - max_prompt_length=args.max_prompt_length, - prompt_length_var=args.prompt_length_var, - max_new_tokens_var=args.max_new_tokens_var, - stream=args.stream, - vllm=args.vllm, - use_thread=args.use_thread, - ) + for client_args in get_args_product(args, which=CLIENT_PARAMS): + response_details = run_client(client_args) - output_summary(args, response_details) + output_summary(client_args, response_details) diff --git a/benchmarks/inference/mii/model_defaults.py b/benchmarks/inference/mii/model_defaults.py new file mode 100644 index 000000000..d201d3f2a --- /dev/null +++ b/benchmarks/inference/mii/model_defaults.py @@ -0,0 +1,44 @@ +MODEL_DEFAULTS = { + "meta-llama/Llama-2-7b-hf": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": (1,), + }, + "meta-llama/Llama-13b-hf": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": (1, 2, 4), + }, + "meta-llama/Llama-2-70b-hf": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": (4, 8), + }, + "tiiuae/falcon-40B": { + "max_prompt_length": 2000, + "mean_prompt_length": (1200, 1900), + "mean_max_new_tokens": (60, 128), + "tp_size": (2, 4), + }, + "tiiuae/falcon-180B": { + "max_prompt_length": 2000, + "mean_prompt_length": (1200, 1900), + "mean_max_new_tokens": (60, 128), + "tp_size": (8,), + }, + "microsoft/phi-2": { + "max_prompt_length": 2000, + "mean_prompt_length": (1200, 1900), + "mean_max_new_tokens": (60, 128), + "tp_size": (1,), + }, + "mistralai/Mixtral-8x7B-v0.1": { + "max_prompt_length": 4000, + "mean_prompt_length": (1200, 2600), + "mean_max_new_tokens": (60, 128), + "tp_size": (4,), + }, +} diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh index cbec569d9..9048c50a9 100644 --- a/benchmarks/inference/mii/run_all.sh +++ b/benchmarks/inference/mii/run_all.sh @@ -1,6 +1,10 @@ -MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-180B) +MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) for MODEL in ${MODELS[@]}; do - python run_benchmark.py --model ${MODEL} --use_defaults --stream - python run_benchmark.py --model ${MODEL} --use_defaults --stream --vllm + python run_benchmark.py --model ${MODEL} 
--stream + python run_benchmark.py --model ${MODEL} --stream --vllm done + +# Extra runs for Mixtral with non-default settings +python run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 +python run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --vllm diff --git a/benchmarks/inference/mii/run_benchmark.py b/benchmarks/inference/mii/run_benchmark.py index 90ac9b986..66814969e 100644 --- a/benchmarks/inference/mii/run_benchmark.py +++ b/benchmarks/inference/mii/run_benchmark.py @@ -1,95 +1,35 @@ -import argparse -import itertools -from typing import Dict, List - -from utils import parse_args, get_deployment_name, output_summary +from utils import ( + parse_args, + output_summary, + get_args_product, + SERVER_PARAMS, + CLIENT_PARAMS, +) from server import start_server, stop_server from client import run_client - -MODEL_DEFAULTS = { - "meta-llama/Llama-2-7b-hf": { - "max_prompt_length": 4000, - "mean_prompt_length": (1200, 2600), - "mean_max_new_tokens": (60, 128), - "tp_size": (1,), - "num_clients": (1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32), - }, - "meta-llama/Llama-13b-hf": { - "max_prompt_length": 4000, - "mean_prompt_length": (1200, 2600), - "mean_max_new_tokens": (60, 128), - "tp_size": (1, 2, 4), - "num_clients": (1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32), - }, - "meta-llama/Llama-2-70b-hf": { - "max_prompt_length": 4000, - "mean_prompt_length": (1200, 2600), - "mean_max_new_tokens": (60, 128), - "tp_size": (4, 8), - "num_clients": (1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32), - }, - "tiiuae/falcon-180B": { - "max_prompt_length": 2000, - "mean_prompt_length": (1200, 1900), - "mean_max_new_tokens": (60, 128), - "tp_size": (8,), - "num_clients": (1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32), - }, -} - - -def get_args_product(args: argparse.Namespace, which: List[str] = None) -> List[Dict]: - if which is None: - which = list(vars(args).keys()) - arg_values_product = itertools.product(*[getattr(args, k) for k in which]) - return [ - {k: v for k, v in zip(which, arg_values)} for arg_values in arg_values_product - ] +from model_defaults import MODEL_DEFAULTS def run_benchmark() -> None: args = parse_args(server_args=True, client_args=True) - if args.use_defaults: + if not args.no_model_defaults: + if args.model not in MODEL_DEFAULTS: + raise ValueError( + f"Model {args.model} not in MODEL_DEFAULTS. " + f"Please specify arguments manually and use the --no_model_defaults flag." 
+ ) for k, v in MODEL_DEFAULTS[args.model].items(): setattr(args, k, v) - # Args to enumerate over for benchmarks - server_arg_names = ["tp_size", "max_ragged_batch_size", "num_replicas"] - client_arg_names = [ - "mean_prompt_length", - "mean_max_new_tokens", - "num_clients", - "num_requests", - ] - - # Run MII benchmarks - for server_args in get_args_product(args, which=server_arg_names): - if args.deployment_name is None: - args.deployment_name = get_deployment_name(model=args.model, **server_args) - start_server( - model=args.model, - deployment_name=args.deployment_name, - vllm=args.vllm, - **server_args, - ) + for server_args in get_args_product(args, which=SERVER_PARAMS): + start_server(server_args) - for client_args in get_args_product(args, which=client_arg_names): - response_details = run_client( - model=args.model, - deployment_name=args.deployment_name, - max_prompt_length=args.max_prompt_length, - prompt_length_var=args.prompt_length_var, - max_new_tokens_var=args.max_new_tokens_var, - warmup=args.warmup, - use_thread=args.use_thread, - stream=args.stream, - vllm=args.vllm, - **client_args, - ) - output_summary(args, response_details) + for client_args in get_args_product(server_args, which=CLIENT_PARAMS): + response_details = run_client(client_args) + output_summary(client_args, response_details) - stop_server(deployment_name=args.deployment_name, vllm=args.vllm) + stop_server(server_args) if __name__ == "__main__": diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh index ece8393ed..d4eff5494 100644 --- a/benchmarks/inference/mii/run_example.sh +++ b/benchmarks/inference/mii/run_example.sh @@ -1,11 +1,16 @@ ### Run the server -RAGGED_BATCH_SIZE=768 -PARAM_SIZES=(7b) -DEPLOYMENT_NAME=llama2-7b-tp1-b768 -python server.py --model_name meta-llama/Llama-2-7b-hf -d llama2-7b-tp1-b768 -m 1 -b 768 start +python server.py \ + --model meta-llama/Llama-2-7b-hf \ + --deployment_name llama2-7b-tp1-b768 \ + --tp_size 1 \ + --max_ragged_batch_size 768 \ + start ### This command will run the client with 60 generation steps and input prompt length of 2600 -DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh +python client.py \ + --deployment_name llama2-7b-tp1-b768 \ + --mean_prompt_length 2600 \ + --mean_max_new_tokens 60 ### Stop the server echo "Stopping server" diff --git a/benchmarks/inference/mii/server.py b/benchmarks/inference/mii/server.py index c10c106b3..7eb6b7fe2 100644 --- a/benchmarks/inference/mii/server.py +++ b/benchmarks/inference/mii/server.py @@ -9,12 +9,17 @@ from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig from deepspeed.inference.v2.ragged import DSStateManagerConfig -from utils import parse_args +from utils import parse_args, SERVER_PARAMS -def start_server( - model, deployment_name, tp_size, num_replicas, max_ragged_batch_size, vllm -): +def start_server(args): + vllm = args.vllm + model = args.model + deployment_name = args.deployment_name + tp_size = args.tp_size + num_replicas = args.num_replicas + max_ragged_batch_size = args.max_ragged_batch_size + if vllm: start_vllm_server(model=model, tp_size=tp_size) else: @@ -76,7 +81,10 @@ def start_mii_server( ) -def stop_server(deployment_name, vllm): +def stop_server(args): + vllm = args.vllm + deployment_name = args.deployment_name + if vllm: stop_vllm_server() else: @@ -96,24 +104,21 @@ def stop_mii_server(deployment_name): if __name__ == "__main__": args = parse_args(server_args=True) + # Make 
sure only single values were passed for parameters, multiple values + # can be used with the run_benchmark.py script + for param in SERVER_PARAMS: + if len(getattr(args, param)) > 1: + raise ValueError( + f"Cannot specify multiple values for {param} when running server" + ) + setattr(args, param, getattr(args, param)[0]) + if args.cmd == "start": - start_server( - model=args.model, - deployment_name=args.deployment_name, - tp_size=args.tp_size, - num_replicas=args.num_replicas, - max_ragged_batch_size=args.max_ragged_batch_size, - vllm=args.vllm, - ) + start_server(args) elif args.cmd == "stop": - stop_server(deployment_name=args.deployment_name, vllm=args.vllm) + stop_server(args) + elif args.cmd == "restart": + stop_server(args) + start_server(args) else: - stop_server(deployment_name=args.deployment_name, vllm=args.vllm) - start_server( - model=args.model, - deployment_name=args.deployment_name, - tp_size=args.tp_size, - num_replicas=args.num_replicas, - max_ragged_batch_size=args.max_ragged_batch_size, - vllm=args.vllm, - ) + raise ValueError(f"Invalid command {args.cmd}") diff --git a/benchmarks/inference/mii/utils.py b/benchmarks/inference/mii/utils.py index 94a3df9f0..d1c638435 100644 --- a/benchmarks/inference/mii/utils.py +++ b/benchmarks/inference/mii/utils.py @@ -4,16 +4,15 @@ import json from datetime import datetime from dataclasses import asdict +import itertools +from typing import Iterator, List +import copy +import os # For these arguments, users can provide multiple values when running the # benchmark. The benchmark will iterate over all possible combinations. -SERVER_PARAMS = ["tp_size", "max_ragged_batch_size", "replica_num"] -CLIENT_PARAMS = [ - "mean_prompt_length", - "mean_max_new_tokens", - "num_clients", - "num_requests", -] +SERVER_PARAMS = ["tp_size", "max_ragged_batch_size", "num_replicas"] +CLIENT_PARAMS = ["mean_prompt_length", "mean_max_new_tokens", "num_clients"] def parse_args( @@ -30,7 +29,7 @@ def parse_args( ) server_parser.add_argument("--num_replicas", type=int, nargs="+", default=[1]) server_parser.add_argument( - "--cmd", type=str, choices=["start", "stop", "restart"], default="start" + "cmd", type=str, nargs="?", choices=["start", "stop", "restart"] ) # Client args @@ -41,8 +40,13 @@ def parse_args( client_parser.add_argument( "--mean_max_new_tokens", type=int, nargs="+", default=[60] ) - client_parser.add_argument("--num_clients", type=int, nargs="+", default=[2]) - client_parser.add_argument("--num_requests", type=int, nargs="+", default=[512]) + client_parser.add_argument( + "--num_clients", + type=int, + nargs="+", + default=[1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32], + ) + client_parser.add_argument("--num_requests", type=int, default=512) client_parser.add_argument("--max_prompt_length", type=int, default=4000) client_parser.add_argument("--prompt_length_var", type=float, default=0.3) client_parser.add_argument("--max_new_tokens_var", type=float, default=0.3) @@ -61,47 +65,42 @@ def parse_args( # Common args parser = argparse.ArgumentParser(parents=parents) parser.add_argument("--model", type=str, default="meta-llama/Llama-2-7b-hf") - parser.add_argument("--deployment_name", type=str, default=None) + parser.add_argument( + "--deployment_name", type=str, default="mii-benchmark-deployment" + ) parser.add_argument("--vllm", action="store_true") - parser.add_argument("--use_defaults", action="store_true") + parser.add_argument("--no_model_defaults", action="store_true") # Parse arguments args = parser.parse_args() - if server_args and not 
client_args: - # If running server, make sure only single values were passed for parameters - for param in SERVER_PARAMS: - if len(getattr(args, param)) > 1: - raise ValueError( - f"Cannot specify multiple values for {param} when running server" - ) - setattr(args, param, getattr(args, param)[0]) - - if client_args and not server_args: - # If running client, make sure only single values were passed for parameters - for param in CLIENT_PARAMS: - if len(getattr(args, param)) > 1: - raise ValueError( - f"Cannot specify multiple values for {param} when running client" - ) - setattr(args, param, getattr(args, param)[0]) - - if not (client_args and server_args): - # Generate deployment name if not provided - if args.deployment_name is None: - args.deployment_name = get_deployment_name( - model=args.model, - tp_size=args.tp_size, - max_ragged_batch_size=args.max_ragged_batch_size, - ) - return args -def get_deployment_name( - model: str, tp_size: int, max_ragged_batch_size: int, num_replicas: int -) -> str: - return f"{model}-tp{tp_size}-b{max_ragged_batch_size}-r{num_replicas}" +def get_args_product( + args: argparse.Namespace, which: List[str] = None +) -> Iterator[argparse.Namespace]: + if which is None: + return copy.deepcopy(args) + arg_values_product = itertools.product(*[getattr(args, k) for k in which]) + for arg_values in arg_values_product: + args_copy = copy.deepcopy(args) + for k, v in zip(which, arg_values): + setattr(args_copy, k, v) + yield args_copy + + +def get_results_path(args: argparse.Namespace) -> Path: + return Path( + f"results/{args.model}", + "-tp{args.tp_size}", + "-bs{args.max_ragged_batch_size}", + "-replicas{args.num_replicas}", + "-prompt{args.mean_prompt_length}", + "-gen{args.mean_max_new_tokens}", + "-clients{args.num_clients}", + ".json", + ) def output_summary(args, response_details): @@ -118,14 +117,17 @@ def output_summary(args, response_details): + f"First token received: {ps.first_token_latency:.3f} s" ) - if args.out_json_path is not None: - with open(args.out_json_path, "w") as f: - args_dict["out_json_path"] = str( - args.out_json_path - ) # Path is not JSON serializable - data = { - "args": args_dict, - "time": str(datetime.now()), - "response_details": [asdict(r) for r in response_details], - } - json.dump(data, f, indent=2) + out_json_path = args.out_json_path + if out_json_path is None: + out_json_path = get_results_path(args) + + os.makedirs(out_json_path.parent, exist_ok=True) + + with open(out_json_path, "w") as f: + args_dict["out_json_path"] = str(out_json_path) # Path is not JSON serializable + data = { + "args": args_dict, + "time": str(datetime.now()), + "response_details": [asdict(r) for r in response_details], + } + json.dump(data, f, indent=2) From d0a729341ecc02ac88b82cbd0f9767fb43c7ce9d Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Jan 2024 14:18:51 -0800 Subject: [PATCH 03/14] update plotting script --- benchmarks/inference/mii/plot_th_lat.py | 131 +++++++++++++----------- benchmarks/inference/mii/utils.py | 32 +++--- 2 files changed, 89 insertions(+), 74 deletions(-) diff --git a/benchmarks/inference/mii/plot_th_lat.py b/benchmarks/inference/mii/plot_th_lat.py index e99dc5a3e..d7c4f6ccb 100644 --- a/benchmarks/inference/mii/plot_th_lat.py +++ b/benchmarks/inference/mii/plot_th_lat.py @@ -3,37 +3,15 @@ import argparse from pathlib import Path import numpy as np -import pdb from postprocess_results import read_json, get_summary +import os +import re -bs = 768 - -tp_sizes_test = { - "7b": [1] -} - -tp_sizes_all = { - "7b": 
[1], - "70b": [4, 8], -} - -prompt_gen_pairs_test = [ - (2600, 60) -] - -prompt_gen_pairs_all = [ - (1200, 60), - (1200, 128), - (2600, 60), - (2600, 128), -] def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--test", action="store_true") - parser.add_argument("--no_vllm", action="store_true") - parser.add_argument("--log_dir", type=Path, default=".") - parser.add_argument("--out_dir", type=Path, default="charts/throughput_latency") + parser.add_argument("--log_dir", type=Path, default="./results") + parser.add_argument("--out_dir", type=Path, default="./plots/throughput_latency") args = parser.parse_args() return args @@ -42,7 +20,7 @@ def extract_values(file_pattern): files = glob.glob(file_pattern) print(f"Found {len(files)}") - print('\n'.join(files)) + print("\n".join(files)) clients = [] throughputs = [] @@ -50,67 +28,96 @@ def extract_values(file_pattern): for f in files: prof_args, response_details = read_json(f) summary = get_summary(prof_args, response_details) - clients.append(prof_args["client_num"]) + clients.append(prof_args["num_clients"]) throughputs.append(summary.throughput) latencies.append(summary.latency) return clients, throughputs, latencies -def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): - if not log_dir.exists(): - print(f"Log directory {log_dir} does not exist") - return - - if not out_dir.exists(): - out_dir.mkdir(parents=True, exist_ok=True) +def output_charts(model, tp_size, bs, replicas, prompt, gen, log_dir, out_dir): + out_dir.mkdir(parents=True, exist_ok=True) - mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" - if not args.no_vllm: - vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + result_file_pattern = f"{model}-tp{tp_size}-bs{bs}-replicas{replicas}-prompt{prompt}-gen{gen}-clients*.json" + mii_file_pattern = f"{log_dir}/fastgen/{result_file_pattern}" + vllm_file_pattern = f"{log_dir}/vllm/{result_file_pattern}" _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) - if not args.no_vllm: - _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern) + _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern) # Plotting the scatter plot plt.figure(figsize=(6, 4)) - if not args.no_vllm: - plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange") + if len(vllm_throughputs) > 0: + plt.scatter( + vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange" + ) fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) vllm_model_fn = np.poly1d(vllm_vllm_model) - plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--") - - plt.scatter(mii_throughputs, mii_latencies, label=f"DeepSpeed FastGen", marker="o", color="blue") + plt.plot( + fit_vllm_x_list, + vllm_model_fn(fit_vllm_x_list), + color="orange", + alpha=0.5, + linestyle="--", + ) + + plt.scatter( + mii_throughputs, + mii_latencies, + label=f"DeepSpeed FastGen", + marker="o", + color="blue", + ) fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01) mii_fit_model = np.polyfit(mii_throughputs, mii_latencies, 3) mii_model_fn = np.poly1d(mii_fit_model) - plt.plot(fit_mii_x_list, mii_model_fn(fit_mii_x_list), color="blue", alpha=0.5, linestyle="--") - - plt.title(f'Model 
Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}') - plt.xlabel('Throughput (queries/s)', fontsize=14) - plt.ylabel('Latency', fontsize=14) + plt.plot( + fit_mii_x_list, + mii_model_fn(fit_mii_x_list), + color="blue", + alpha=0.5, + linestyle="--", + ) + + plt.title(f"Model {model}, Prompt: {prompt}, Generation: {gen}, TP: {tp_size}") + plt.xlabel("Throughput (queries/s)", fontsize=14) + plt.ylabel("Latency", fontsize=14) plt.legend() plt.grid(True) plt.tight_layout() - out_file = out_dir / f"th_lat_curve_llama{model_size}_tp{tp}_p{prompt}g{gen}.png" + out_file = ( + out_dir + / f"{model}-tp{tp_size}-bs{bs}-replicas{replicas}-prompt{prompt}-gen{gen}.png" + ) print(f"Saving {out_file}") plt.savefig(out_file) if __name__ == "__main__": args = get_args() - if args.test: - tp_sizes = tp_sizes_test - prompt_gen_pairs = prompt_gen_pairs_test - else: - tp_sizes = tp_sizes_all - prompt_gen_pairs = prompt_gen_pairs_test_all - - for model_size, tps in tp_sizes.items(): - for tp in tps: - for prompt, gen in prompt_gen_pairs: - output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) + if not args.log_dir.exists(): + raise ValueError(f"Log dir {args.log_dir} does not exist") + + result_params = set() + result_re = re.compile( + r"(.+)-tp(\d+)-bs(\d+)-replicas(\d+)-prompt(\d+)-gen(\d+)-clients.*.json" + ) + for f in os.listdir(os.path.join(args.log_dir, "fastgen")): + match = result_re.match(f) + if match: + result_params.add(match.groups()) + + for model, tp_size, bs, replicas, prompt, gen in result_params: + output_charts( + model=model, + tp_size=tp_size, + bs=bs, + replicas=replicas, + prompt=prompt, + gen=gen, + log_dir=args.log_dir, + out_dir=args.out_dir, + ) diff --git a/benchmarks/inference/mii/utils.py b/benchmarks/inference/mii/utils.py index d1c638435..82af0ef97 100644 --- a/benchmarks/inference/mii/utils.py +++ b/benchmarks/inference/mii/utils.py @@ -53,7 +53,7 @@ def parse_args( client_parser.add_argument("--warmup", type=int, default=1) client_parser.add_argument("--use_thread", action="store_true") client_parser.add_argument("--stream", action="store_true") - client_parser.add_argument("--out_json_path", type=Path, default=None) + client_parser.add_argument("--out_json_dir", type=Path, default="./results/") # Create the parser, inheriting from the server and/or client parsers parents = [] @@ -91,15 +91,25 @@ def get_args_product( def get_results_path(args: argparse.Namespace) -> Path: + if args.vllm: + lib_path = "vllm" + else: + lib_path = "fastgen" return Path( - f"results/{args.model}", - "-tp{args.tp_size}", - "-bs{args.max_ragged_batch_size}", - "-replicas{args.num_replicas}", - "-prompt{args.mean_prompt_length}", - "-gen{args.mean_max_new_tokens}", - "-clients{args.num_clients}", - ".json", + args.out_json_dir, + f"{lib_path}/", + "-".join( + ( + args.model.replace("/", "_"), + f"tp{args.tp_size}", + f"bs{args.max_ragged_batch_size}", + f"replicas{args.num_replicas}", + f"prompt{args.mean_prompt_length}", + f"gen{args.mean_max_new_tokens}", + f"clients{args.num_clients}", + ) + ) + + ".json", ) @@ -117,9 +127,7 @@ def output_summary(args, response_details): + f"First token received: {ps.first_token_latency:.3f} s" ) - out_json_path = args.out_json_path - if out_json_path is None: - out_json_path = get_results_path(args) + out_json_path = get_results_path(args) os.makedirs(out_json_path.parent, exist_ok=True) From a1eb29d263a0615f1e7c05c0ae668b7014d68313 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Jan 2024 14:21:18 -0800 
Subject: [PATCH 04/14] update example script --- benchmarks/inference/mii/run_example.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh index d4eff5494..41bb23f3c 100644 --- a/benchmarks/inference/mii/run_example.sh +++ b/benchmarks/inference/mii/run_example.sh @@ -1,24 +1,23 @@ ### Run the server +echo "Starting the server" python server.py \ --model meta-llama/Llama-2-7b-hf \ - --deployment_name llama2-7b-tp1-b768 \ --tp_size 1 \ --max_ragged_batch_size 768 \ start ### This command will run the client with 60 generation steps and input prompt length of 2600 +echo "Running the client" python client.py \ - --deployment_name llama2-7b-tp1-b768 \ --mean_prompt_length 2600 \ --mean_max_new_tokens 60 ### Stop the server -echo "Stopping server" -python server.py -d ${DEPLOYMENT_NAME} stop +echo "Stopping the server" +python server.py stop sleep 120 ### Gernerate the plots -python plot_th_lat.py --log_dir . --test --no_vllm -python plot_effective_throughput.py --log_dir . --test --no_vllm +python plot_th_lat.py -echo "Find the plots in the charts directory and the logs inside logs.llama2-7b-tp1-b768" +echo "Find figures in ./plots/ and log outputs in ./results/" From ee41b2f06f8451846079f617b3c276058b605b8c Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Jan 2024 14:29:16 -0800 Subject: [PATCH 05/14] move python files to src/ dir --- benchmarks/inference/mii/run_all.sh | 8 +- benchmarks/inference/mii/run_all_replica.sh | 25 --- benchmarks/inference/mii/run_example.sh | 8 +- .../inference/mii/{ => src}/__init__.py | 0 benchmarks/inference/mii/{ => src}/client.py | 0 .../inference/mii/{ => src}/model_defaults.py | 1 + .../{ => src}/plot_effective_throughput.py | 162 +++++++++++++----- .../mii/{ => src}/plot_latency_percentile.py | 38 ++-- .../mii/{ => src}/plot_repl_scale.py | 22 ++- .../inference/mii/{ => src}/plot_th_lat.py | 0 .../inference/mii/{ => src}/plot_tp_sizes.py | 37 ++-- .../mii/{ => src}/postprocess_results.py | 0 .../mii/{ => src}/random_query_generator.py | 9 +- .../inference/mii/{ => src}/run_benchmark.py | 0 .../inference/mii/{ => src}/sample_input.py | 5 +- benchmarks/inference/mii/{ => src}/server.py | 0 benchmarks/inference/mii/{ => src}/utils.py | 2 +- 17 files changed, 199 insertions(+), 118 deletions(-) delete mode 100644 benchmarks/inference/mii/run_all_replica.sh rename benchmarks/inference/mii/{ => src}/__init__.py (100%) rename benchmarks/inference/mii/{ => src}/client.py (100%) rename benchmarks/inference/mii/{ => src}/model_defaults.py (97%) rename benchmarks/inference/mii/{ => src}/plot_effective_throughput.py (55%) rename benchmarks/inference/mii/{ => src}/plot_latency_percentile.py (77%) rename benchmarks/inference/mii/{ => src}/plot_repl_scale.py (87%) rename benchmarks/inference/mii/{ => src}/plot_th_lat.py (100%) rename benchmarks/inference/mii/{ => src}/plot_tp_sizes.py (78%) rename benchmarks/inference/mii/{ => src}/postprocess_results.py (100%) rename benchmarks/inference/mii/{ => src}/random_query_generator.py (73%) rename benchmarks/inference/mii/{ => src}/run_benchmark.py (100%) rename benchmarks/inference/mii/{ => src}/sample_input.py (99%) rename benchmarks/inference/mii/{ => src}/server.py (100%) rename benchmarks/inference/mii/{ => src}/utils.py (98%) diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh index 9048c50a9..9243b6b5a 100644 --- a/benchmarks/inference/mii/run_all.sh +++ 
b/benchmarks/inference/mii/run_all.sh @@ -1,10 +1,10 @@ MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) for MODEL in ${MODELS[@]}; do - python run_benchmark.py --model ${MODEL} --stream - python run_benchmark.py --model ${MODEL} --stream --vllm + python ./src/run_benchmark.py --model ${MODEL} --stream + python ./src/run_benchmark.py --model ${MODEL} --stream --vllm done # Extra runs for Mixtral with non-default settings -python run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 -python run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --vllm +python ./src/run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 +python ./src/run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --vllm diff --git a/benchmarks/inference/mii/run_all_replica.sh b/benchmarks/inference/mii/run_all_replica.sh deleted file mode 100644 index b3fba0408..000000000 --- a/benchmarks/inference/mii/run_all_replica.sh +++ /dev/null @@ -1,25 +0,0 @@ -RAGGED_BATCH_SIZE=768 -PARAM_SIZES=(7b) -REPLICA_NUMS=(1) - -declare -A TP_SIZES -TP_SIZES["7b"]="4" -TP_SIZES["13b"]="1" -TP_SIZES["70b"]="4" - -for PARAM_SIZE in ${PARAM_SIZES[@]}; do - IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} - for TP in ${TP_VALUES[@]}; do - for REPL in ${REPLICA_NUMS[@]}; do - DEPLOYMENT_NAME=llama2-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE}_repl${REPL} - python server.py --model_name meta-llama/Llama-2-${PARAM_SIZE}-hf -d ${DEPLOYMENT_NAME} -m ${TP} -r ${REPL} -b ${RAGGED_BATCH_SIZE} start - - REQUEST_NUM=$((256 * ${REPL})) - DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 CLIENT_NUMS=$((16 * ${REPL})) REQUEST_NUM=$((256 * ${REPL})) bash ./run_bench_client_num.sh - - echo "Stopping server" - python server.py -d ${DEPLOYMENT_NAME} stop - sleep 120 - done - done -done diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh index 41bb23f3c..42a63efd2 100644 --- a/benchmarks/inference/mii/run_example.sh +++ b/benchmarks/inference/mii/run_example.sh @@ -1,6 +1,6 @@ ### Run the server echo "Starting the server" -python server.py \ +python ./src/server.py \ --model meta-llama/Llama-2-7b-hf \ --tp_size 1 \ --max_ragged_batch_size 768 \ @@ -8,16 +8,16 @@ python server.py \ ### This command will run the client with 60 generation steps and input prompt length of 2600 echo "Running the client" -python client.py \ +python ./src/client.py \ --mean_prompt_length 2600 \ --mean_max_new_tokens 60 ### Stop the server echo "Stopping the server" -python server.py stop +python ./src/server.py stop sleep 120 ### Gernerate the plots -python plot_th_lat.py +python ./src/plot_th_lat.py echo "Find figures in ./plots/ and log outputs in ./results/" diff --git a/benchmarks/inference/mii/__init__.py b/benchmarks/inference/mii/src/__init__.py similarity index 100% rename from benchmarks/inference/mii/__init__.py rename to benchmarks/inference/mii/src/__init__.py diff --git a/benchmarks/inference/mii/client.py b/benchmarks/inference/mii/src/client.py similarity index 100% rename from benchmarks/inference/mii/client.py rename to benchmarks/inference/mii/src/client.py diff --git 
a/benchmarks/inference/mii/model_defaults.py b/benchmarks/inference/mii/src/model_defaults.py similarity index 97% rename from benchmarks/inference/mii/model_defaults.py rename to benchmarks/inference/mii/src/model_defaults.py index d201d3f2a..c22169fc0 100644 --- a/benchmarks/inference/mii/model_defaults.py +++ b/benchmarks/inference/mii/src/model_defaults.py @@ -4,6 +4,7 @@ "mean_prompt_length": (1200, 2600), "mean_max_new_tokens": (60, 128), "tp_size": (1,), + "num_replicas": (1, 8), }, "meta-llama/Llama-13b-hf": { "max_prompt_length": 4000, diff --git a/benchmarks/inference/mii/plot_effective_throughput.py b/benchmarks/inference/mii/src/plot_effective_throughput.py similarity index 55% rename from benchmarks/inference/mii/plot_effective_throughput.py rename to benchmarks/inference/mii/src/plot_effective_throughput.py index 350c269c3..5a6da8422 100644 --- a/benchmarks/inference/mii/plot_effective_throughput.py +++ b/benchmarks/inference/mii/src/plot_effective_throughput.py @@ -12,14 +12,9 @@ SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8] EMA_SPAN = 16 -tp_sizes_all = { - "7b": [1], - "70b": [4, 8] -} +tp_sizes_all = {"7b": [1], "70b": [4, 8]} -tp_sizes_test = { - "7b": [1] -} +tp_sizes_test = {"7b": [1]} prompt_gen_pairs_all = [ (1200, 60), @@ -28,9 +23,8 @@ (2600, 128), ] -prompt_gen_pairs_test = [ - (2600, 60) -] +prompt_gen_pairs_test = [(2600, 60)] + def get_args(): parser = argparse.ArgumentParser() @@ -43,23 +37,54 @@ def get_args(): def check_token_latency_step(response_details, token_index): - P50_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 50) - P90_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 90) - P99_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 99) + P50_token_latency = np.percentile( + [ + r.token_gen_time[token_index] + for r in response_details + if len(r.token_gen_time) > token_index + ], + 50, + ) + P90_token_latency = np.percentile( + [ + r.token_gen_time[token_index] + for r in response_details + if len(r.token_gen_time) > token_index + ], + 90, + ) + P99_token_latency = np.percentile( + [ + r.token_gen_time[token_index] + for r in response_details + if len(r.token_gen_time) > token_index + ], + 99, + ) return P50_token_latency, P90_token_latency, P99_token_latency def validate_token_cum_latency_SLA(response_detail, sla_token_gen): cumsum_latencies = np.cumsum(np.array(response_detail.token_gen_time[1:])) - return all([cumsum_latencies[i] <= (1 / sla_token_gen) * (i + 1) for i in range(len(cumsum_latencies))]) + return all( + [ + cumsum_latencies[i] <= (1 / sla_token_gen) * (i + 1) + for i in range(len(cumsum_latencies)) + ] + ) def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span): - ema_latency = pd.Series(response_detail.token_gen_time[1:]).ewm(span=ema_span).mean().values.tolist() - return all([t < 1. 
/ sla_token_gen for t in ema_latency]) + ema_latency = ( + pd.Series(response_detail.token_gen_time[1:]) + .ewm(span=ema_span) + .mean() + .values.tolist() + ) + return all([t < 1.0 / sla_token_gen for t in ema_latency]) + - def validate_prompt_latency_SLA(response_detail, sla_token_gen, f): tokenizer = get_tokenizer() prompt_length = len(tokenizer.tokenize(response_detail.prompt)) @@ -71,14 +96,14 @@ def validate_prompt_latency_SLA(response_detail, sla_token_gen, f): return True return f[0](response_detail, sla_token_gen, *f[1]) - + def calc_throughput(response_details): start_time = min([r.start_time for r in response_details]) end_time = max([r.end_time for r in response_details]) return len(response_details) / (end_time - start_time) - + def extract_values(file_pattern, sla_token_gen, validate_func): files = glob.glob(file_pattern) print(f"Found {len(files)} files") @@ -87,8 +112,16 @@ def extract_values(file_pattern, sla_token_gen, validate_func): for f in files: prof_args, response_details = read_json(f) client_num = prof_args["client_num"] - num_req_ok = len([r for r in response_details if validate_prompt_latency_SLA(r, sla_token_gen, validate_func)]) - goodputs[client_num] = calc_throughput(response_details) * (num_req_ok / len(response_details)) + num_req_ok = len( + [ + r + for r in response_details + if validate_prompt_latency_SLA(r, sla_token_gen, validate_func) + ] + ) + goodputs[client_num] = calc_throughput(response_details) * ( + num_req_ok / len(response_details) + ) good_ratios[client_num] = num_req_ok / len(response_details) return goodputs, good_ratios @@ -98,11 +131,13 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return - + if not out_dir.exists(): out_dir.mkdir(parents=True, exist_ok=True) - - print(f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}") + + print( + f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}" + ) mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" if not args.no_vllm: @@ -110,54 +145,89 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out validate_funcs = [ (validate_token_cum_latency_SLA, (), "cum"), - (validate_token_ema_latency_SLA, (EMA_SPAN, ), f"ema{EMA_SPAN}"), + (validate_token_ema_latency_SLA, (EMA_SPAN,), f"ema{EMA_SPAN}"), ] for f in validate_funcs: - - mii_goodputs, mii_good_ratios = extract_values(mii_file_pattern, sla_token_gen, f) + + mii_goodputs, mii_good_ratios = extract_values( + mii_file_pattern, sla_token_gen, f + ) client_num_list = sorted(list(mii_goodputs.keys())) mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list] if not args.no_vllm: - vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f) - vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list] + vllm_goodputs, vllm_good_ratios = extract_values( + vllm_file_pattern, sla_token_gen, f + ) + vllm_goodputs_list = [ + vllm_goodputs[client_num] for client_num in client_num_list + ] # print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}") # print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}") # Plotting the scatter plot plt.figure(figsize=(7, 4)) - plt.scatter(client_num_list, mii_goodputs_list, label=f"DeepSpeed-FastGen", marker="o", color="blue") + plt.scatter( + 
client_num_list, + mii_goodputs_list, + label=f"DeepSpeed-FastGen", + marker="o", + color="blue", + ) if not args.no_vllm: - plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange") + plt.scatter( + client_num_list, + vllm_goodputs_list, + label=f"vLLM", + marker="x", + color="orange", + ) fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1) mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4) mii_model_fn = np.poly1d(mii_fit_model) - plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", alpha=0.5, linestyle="--") + plt.plot( + fit_x_list, + mii_model_fn(fit_x_list), + color="blue", + alpha=0.5, + linestyle="--", + ) if not args.no_vllm: vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4) vllm_model_fn = np.poly1d(vllm_fit_model) - plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--") - - title = f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" \ - + f'Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}' + plt.plot( + fit_x_list, + vllm_model_fn(fit_x_list), + color="orange", + alpha=0.5, + linestyle="--", + ) + + title = ( + f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" + + f"Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}" + ) plt.title(title, fontsize=10) - plt.xlabel('Number of clients', fontsize=10) - plt.ylabel('Effective throughput (queries/s)', fontsize=10) + plt.xlabel("Number of clients", fontsize=10) + plt.ylabel("Effective throughput (queries/s)", fontsize=10) # plt.rcParams['figure.subplot.bottom'] = 0.30 plt.ylim(bottom=-0.05) plt.legend() plt.grid(True) # plt.show() - out_file = out_dir / f"goodput_llama{model_size}_SLAp{SLA_PROMPT_TOKENS_PER_SEC}g{sla_token_gen}_tp{tp}_b{bs}_p{prompt}g{gen}_{f[2]}.png" + out_file = ( + out_dir + / f"goodput_llama{model_size}_SLAp{SLA_PROMPT_TOKENS_PER_SEC}g{sla_token_gen}_tp{tp}_b{bs}_p{prompt}g{gen}_{f[2]}.png" + ) plt.savefig(out_file) plt.clf() print(f"Saved {out_file}") - + if __name__ == "__main__": args = get_args() @@ -172,5 +242,13 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out for tp in tps: for prompt, gen in prompt_gen_pairs: for sla_token_gen in SLA_GEN_TOKENS_PER_SEC: - display_results(model_size, tp, RAGGED_BATCH_SIZE, sla_token_gen, prompt, gen, args.log_dir, args.out_dir) - + display_results( + model_size, + tp, + RAGGED_BATCH_SIZE, + sla_token_gen, + prompt, + gen, + args.log_dir, + args.out_dir, + ) diff --git a/benchmarks/inference/mii/plot_latency_percentile.py b/benchmarks/inference/mii/src/plot_latency_percentile.py similarity index 77% rename from benchmarks/inference/mii/plot_latency_percentile.py rename to benchmarks/inference/mii/src/plot_latency_percentile.py index c91c78bf1..cf4f911c9 100644 --- a/benchmarks/inference/mii/plot_latency_percentile.py +++ b/benchmarks/inference/mii/src/plot_latency_percentile.py @@ -10,7 +10,7 @@ bs = 768 SKIP_HEAD_TOKEN_NUM = 2 SKIP_REQUEST_NUM = 100 - + tp_sizes = { "70b": [4], } @@ -23,14 +23,16 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", type=Path, default=".") - parser.add_argument("--out_dir", type=Path, default="charts/percentile_token_latency") + parser.add_argument( + "--out_dir", type=Path, default="charts/percentile_token_latency" + ) args = parser.parse_args() return args def 
extract_values(file_pattern): files = glob.glob(file_pattern) - + latencies = {} for f in files: prof_args, response_details = read_json(f) @@ -38,18 +40,20 @@ def extract_values(file_pattern): response_details.sort(key=lambda r: r.start_time) response_details = response_details[SKIP_REQUEST_NUM:-SKIP_REQUEST_NUM] - token_latencies = [r.token_gen_time[SKIP_HEAD_TOKEN_NUM:-1] for r in response_details] + token_latencies = [ + r.token_gen_time[SKIP_HEAD_TOKEN_NUM:-1] for r in response_details + ] flat_latency_list = list(itertools.chain(*token_latencies)) latencies[client_num] = flat_latency_list return latencies -def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): +def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return - + if not out_dir.exists(): out_dir.mkdir(parents=True, exist_ok=True) @@ -79,7 +83,10 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): # print(f"P95_vllm_val={P95_vllm_val}") # print(f"P95_mii_val={P95_mii_val}") - out_file = out_dir / f"p{percentile}_token_latency_llama{model_size}_c{client_num}_tp{tp}_p{prompt}g{gen}.png" + out_file = ( + out_dir + / f"p{percentile}_token_latency_llama{model_size}_c{client_num}_tp{tp}_p{prompt}g{gen}.png" + ) x1 = [1, 2, 3] y1 = [P50_vllm_val, P90_vllm_val, P95_vllm_val] @@ -87,11 +94,13 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): x2 = [1.3, 2.3, 3.3] y2 = [P50_mii_val, P90_mii_val, P95_mii_val] - label_x = ['P50', 'P90', 'P95'] + label_x = ["P50", "P90", "P95"] - plt.bar(x1, y1, width=0.3, label='vLLM', align="center", color="orange") - plt.bar(x2, y2, width=0.3, label="DeepSpeed-FastGen", align="center", color="blue") - plt.ylabel('Latency', fontsize=14) + plt.bar(x1, y1, width=0.3, label="vLLM", align="center", color="orange") + plt.bar( + x2, y2, width=0.3, label="DeepSpeed-FastGen", align="center", color="blue" + ) + plt.ylabel("Latency", fontsize=14) plt.legend(loc=2) plt.xticks([1.15, 2.15, 3.15], label_x) @@ -102,9 +111,10 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if __name__ == "__main__": args = get_args() - + for model_size, tps in tp_sizes.items(): for tp in tps: for prompt, gen in prompt_gen_pairs: - output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) - + output_charts( + model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir + ) diff --git a/benchmarks/inference/mii/plot_repl_scale.py b/benchmarks/inference/mii/src/plot_repl_scale.py similarity index 87% rename from benchmarks/inference/mii/plot_repl_scale.py rename to benchmarks/inference/mii/src/plot_repl_scale.py index 394c54588..4c96d695f 100644 --- a/benchmarks/inference/mii/plot_repl_scale.py +++ b/benchmarks/inference/mii/src/plot_repl_scale.py @@ -18,6 +18,7 @@ (2600, 60), ] + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", type=Path, default=".") @@ -46,7 +47,7 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return - + if not out_dir.exists(): out_dir.mkdir(parents=True, exist_ok=True) @@ -67,17 +68,19 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): # Plotting the scatter plot plt.figure(figsize=(6, 4)) - + plt.bar(REPLICA_NUMS, throughputs[c], color="blue", alpha=0.9) fit_x_list = np.arange(min(REPLICA_NUMS), max(REPLICA_NUMS), 0.1) mii_fit_model = np.polyfit(REPLICA_NUMS, 
throughputs[c], 1) mii_model_fn = np.poly1d(mii_fit_model) plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", linestyle="--") - - plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}') - plt.xlabel('Number of replicas', fontsize=14) - plt.ylabel('Throughput (queries/s)', fontsize=14) + + plt.title( + f"Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}" + ) + plt.xlabel("Number of replicas", fontsize=14) + plt.ylabel("Throughput (queries/s)", fontsize=14) plt.grid(True) plt.tight_layout() # plt.show() @@ -87,9 +90,10 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if __name__ == "__main__": args = get_args() - + for model_size, tps in tp_sizes.items(): for tp in tps: for prompt, gen in prompt_gen_pairs: - output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) - + output_charts( + model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir + ) diff --git a/benchmarks/inference/mii/plot_th_lat.py b/benchmarks/inference/mii/src/plot_th_lat.py similarity index 100% rename from benchmarks/inference/mii/plot_th_lat.py rename to benchmarks/inference/mii/src/plot_th_lat.py diff --git a/benchmarks/inference/mii/plot_tp_sizes.py b/benchmarks/inference/mii/src/plot_tp_sizes.py similarity index 78% rename from benchmarks/inference/mii/plot_tp_sizes.py rename to benchmarks/inference/mii/src/plot_tp_sizes.py index 546310258..2743e1955 100644 --- a/benchmarks/inference/mii/plot_tp_sizes.py +++ b/benchmarks/inference/mii/src/plot_tp_sizes.py @@ -7,7 +7,7 @@ from postprocess_results import read_json, get_summary bs = 768 - + tp_sizes = { # "7b": [1], "13b": [1, 2, 4], @@ -22,6 +22,7 @@ (2600, 256), ] + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--log_dir", type=Path, default="logs.release") @@ -34,7 +35,7 @@ def extract_values(file_pattern): files = glob.glob(file_pattern) print(f"Found {len(files)}") - print('\n'.join(files)) + print("\n".join(files)) clients = [] throughputs = [] @@ -53,7 +54,7 @@ def output_charts(model_size, tps, bs, prompt, gen, log_dir, out_dir): if not log_dir.exists(): print(f"Log directory {log_dir} does not exist") return - + if not out_dir.exists(): out_dir.mkdir(parents=True, exist_ok=True) @@ -73,26 +74,38 @@ def output_charts(model_size, tps, bs, prompt, gen, log_dir, out_dir): tflops_per_query = n_params * (prompt + gen) * 2 * 1e-3 mii_tflops = [th * tflops_per_query / tp for th in mii_throughputs] - plt.scatter(mii_tflops, mii_latencies, label=f"TP={tp}", marker="o", color=color) + plt.scatter( + mii_tflops, mii_latencies, label=f"TP={tp}", marker="o", color=color + ) fit_mii_x_list = np.arange(min(mii_tflops), max(mii_tflops), 0.01) mii_fit_model = np.polyfit(mii_tflops, mii_latencies, 3) mii_model_fn = np.poly1d(mii_fit_model) - plt.plot(fit_mii_x_list, mii_model_fn(fit_mii_x_list), color=color, alpha=0.5, linestyle="--") - - plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tps}') - plt.xlabel('TFLOPs (per GPU)', fontsize=14) - plt.ylabel('Latency', fontsize=14) + plt.plot( + fit_mii_x_list, + mii_model_fn(fit_mii_x_list), + color=color, + alpha=0.5, + linestyle="--", + ) + + plt.title( + f"Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tps}" + ) + plt.xlabel("TFLOPs (per GPU)", fontsize=14) + plt.ylabel("Latency", fontsize=14) plt.legend() plt.grid(True) # plt.show() - out_file = out_dir / f"tp_sizes_llama{model_size}_tp{'_'.join([str(tp) for tp 
in tps])}_p{prompt}g{gen}.png" + out_file = ( + out_dir + / f"tp_sizes_llama{model_size}_tp{'_'.join([str(tp) for tp in tps])}_p{prompt}g{gen}.png" + ) plt.savefig(out_file) if __name__ == "__main__": args = get_args() - + for model_size, tps in tp_sizes.items(): for prompt, gen in prompt_gen_pairs: output_charts(model_size, tps, bs, prompt, gen, args.log_dir, args.out_dir) - diff --git a/benchmarks/inference/mii/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py similarity index 100% rename from benchmarks/inference/mii/postprocess_results.py rename to benchmarks/inference/mii/src/postprocess_results.py diff --git a/benchmarks/inference/mii/random_query_generator.py b/benchmarks/inference/mii/src/random_query_generator.py similarity index 73% rename from benchmarks/inference/mii/random_query_generator.py rename to benchmarks/inference/mii/src/random_query_generator.py index b8442af4f..8445af156 100644 --- a/benchmarks/inference/mii/random_query_generator.py +++ b/benchmarks/inference/mii/src/random_query_generator.py @@ -3,6 +3,7 @@ import numpy as np import time + class RandomQueryGenerator: def __init__(self, input_text, tokenizer, seed): self.input_text = input_text @@ -14,9 +15,9 @@ def __init__(self, input_text, tokenizer, seed): def get_random_request_text(self, length, variance, max_length, batch): request_text = [] - tokenized_input = self.tokenizer.batch_encode_plus([self.input_text], - return_tensors="pt", - padding=False) + tokenized_input = self.tokenizer.batch_encode_plus( + [self.input_text], return_tensors="pt", padding=False + ) offset = list(range(512)) random.shuffle(offset) @@ -25,6 +26,6 @@ def get_random_request_text(self, length, variance, max_length, batch): # Set max_new_tokens following normal distribution with mean=max_new_tokens and std=0.3*max_new_tokens req_prompt_length = min(int(np.random.normal(length, variance)), max_length) - text = self.tokenizer.decode(text_ids[i:req_prompt_length+i]) + text = self.tokenizer.decode(text_ids[i : req_prompt_length + i]) request_text.append(text) return request_text diff --git a/benchmarks/inference/mii/run_benchmark.py b/benchmarks/inference/mii/src/run_benchmark.py similarity index 100% rename from benchmarks/inference/mii/run_benchmark.py rename to benchmarks/inference/mii/src/run_benchmark.py diff --git a/benchmarks/inference/mii/sample_input.py b/benchmarks/inference/mii/src/sample_input.py similarity index 99% rename from benchmarks/inference/mii/sample_input.py rename to benchmarks/inference/mii/src/sample_input.py index 77d02af5f..72552387b 100644 --- a/benchmarks/inference/mii/sample_input.py +++ b/benchmarks/inference/mii/src/sample_input.py @@ -1,8 +1,7 @@ - # This is a sample input consisting of: # Code & Text -all_text = '''Deep learning involves the use of neural networks, which are computational models inspired by the structure and functioning of the human brain. These networks consist of interconnected nodes called neurons. Each neuron takes input, performs a computation, and produces an output. +all_text = """Deep learning involves the use of neural networks, which are computational models inspired by the structure and functioning of the human brain. These networks consist of interconnected nodes called neurons. Each neuron takes input, performs a computation, and produces an output. During training, the neural network learns to make accurate predictions by adjusting its internal parameters. This adjustment is done using an optimization algorithm called gradient descent. 
Gradient descent calculates the gradients of a loss function, which measures the discrepancy between the predicted output of the network and the desired output. These gradients indicate the direction and magnitude of parameter updates that will minimize the loss. The learning rate is an important hyperparameter in gradient descent. It determines the step size taken during parameter updates. A higher learning rate can lead to faster convergence, but it risks overshooting the optimal solution. On the other hand, a lower learning rate may converge more slowly, but it can result in more precise updates. Activation functions are applied to the output of each neuron in a neural network. They introduce non-linearities, enabling the network to learn complex patterns and relationships in the data. Popular activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). @@ -218,4 +217,4 @@ def top_p_sampling(self, logits, p=0.9): print("Top-k Sampling:", top_k_text) print("Top-p Sampling:", top_p_text) Make sure to adjust the server_url with the appropriate URL of your HTTP server, and ensure that the server is running and accessible before making requests through the API. - ''' \ No newline at end of file + """ diff --git a/benchmarks/inference/mii/server.py b/benchmarks/inference/mii/src/server.py similarity index 100% rename from benchmarks/inference/mii/server.py rename to benchmarks/inference/mii/src/server.py diff --git a/benchmarks/inference/mii/utils.py b/benchmarks/inference/mii/src/utils.py similarity index 98% rename from benchmarks/inference/mii/utils.py rename to benchmarks/inference/mii/src/utils.py index 82af0ef97..a6a36213b 100644 --- a/benchmarks/inference/mii/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -132,7 +132,7 @@ def output_summary(args, response_details): os.makedirs(out_json_path.parent, exist_ok=True) with open(out_json_path, "w") as f: - args_dict["out_json_path"] = str(out_json_path) # Path is not JSON serializable + args_dict["out_json_dir"] = str(out_json_path) # Path is not JSON serializable data = { "args": args_dict, "time": str(datetime.now()), From 9eab5af28317e1dbb636e1c8d02c47827ae6593e Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Jan 2024 15:23:39 -0800 Subject: [PATCH 06/14] update example, small changes --- benchmarks/inference/mii/run_example.sh | 21 ++++++------------- benchmarks/inference/mii/src/client.py | 7 ++----- .../mii/src/plot_effective_throughput.py | 1 + .../mii/src/plot_latency_percentile.py | 1 + .../inference/mii/src/plot_repl_scale.py | 1 + benchmarks/inference/mii/src/run_benchmark.py | 6 ++++-- benchmarks/inference/mii/src/utils.py | 9 ++++---- 7 files changed, 20 insertions(+), 26 deletions(-) diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh index 42a63efd2..cdc0b6f8e 100644 --- a/benchmarks/inference/mii/run_example.sh +++ b/benchmarks/inference/mii/run_example.sh @@ -1,23 +1,14 @@ -### Run the server -echo "Starting the server" -python ./src/server.py \ +# Run benchmark +python ./src/run_benchmark.py \ --model meta-llama/Llama-2-7b-hf \ --tp_size 1 \ --max_ragged_batch_size 768 \ - start - -### This command will run the client with 60 generation steps and input prompt length of 2600 -echo "Running the client" -python ./src/client.py \ --mean_prompt_length 2600 \ - --mean_max_new_tokens 60 - -### Stop the server -echo "Stopping the server" -python ./src/server.py stop -sleep 120 + --mean_max_new_tokens 60 \ + --stream \ + 
--no_model_defaults ### Gernerate the plots python ./src/plot_th_lat.py -echo "Find figures in ./plots/ and log outputs in ./results/" +echo "Find figures in ./plots/ and log outputs in ./results/" \ No newline at end of file diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index cbfb2b582..42a576c51 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -17,7 +17,7 @@ from postprocess_results import ResponseDetails -from utils import parse_args, output_summary, get_args_product, CLIENT_PARAMS +from utils import parse_args, print_summary, get_args_product, CLIENT_PARAMS def call_mii(client, input_tokens, max_new_tokens, stream): @@ -263,10 +263,7 @@ def run_client(args): if __name__ == "__main__": args = parse_args(client_args=True) - if args.out_json_path is not None and not args.out_json_path.parent.exists(): - raise ValueError(f"Parent directory of {args.out_json_path}") - for client_args in get_args_product(args, which=CLIENT_PARAMS): response_details = run_client(client_args) - output_summary(client_args, response_details) + print_summary(client_args, response_details) diff --git a/benchmarks/inference/mii/src/plot_effective_throughput.py b/benchmarks/inference/mii/src/plot_effective_throughput.py index 5a6da8422..8e865c696 100644 --- a/benchmarks/inference/mii/src/plot_effective_throughput.py +++ b/benchmarks/inference/mii/src/plot_effective_throughput.py @@ -229,6 +229,7 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out if __name__ == "__main__": + raise NotImplementedError("This script is not up to date") args = get_args() if args.test: diff --git a/benchmarks/inference/mii/src/plot_latency_percentile.py b/benchmarks/inference/mii/src/plot_latency_percentile.py index cf4f911c9..058dbccd3 100644 --- a/benchmarks/inference/mii/src/plot_latency_percentile.py +++ b/benchmarks/inference/mii/src/plot_latency_percentile.py @@ -110,6 +110,7 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if __name__ == "__main__": + raise NotImplementedError("This script is not up to date") args = get_args() for model_size, tps in tp_sizes.items(): diff --git a/benchmarks/inference/mii/src/plot_repl_scale.py b/benchmarks/inference/mii/src/plot_repl_scale.py index 4c96d695f..b46e2ad87 100644 --- a/benchmarks/inference/mii/src/plot_repl_scale.py +++ b/benchmarks/inference/mii/src/plot_repl_scale.py @@ -89,6 +89,7 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): if __name__ == "__main__": + raise NotImplementedError("This script is not up to date") args = get_args() for model_size, tps in tp_sizes.items(): diff --git a/benchmarks/inference/mii/src/run_benchmark.py b/benchmarks/inference/mii/src/run_benchmark.py index 66814969e..cf8d525dd 100644 --- a/benchmarks/inference/mii/src/run_benchmark.py +++ b/benchmarks/inference/mii/src/run_benchmark.py @@ -1,6 +1,7 @@ from utils import ( parse_args, - output_summary, + print_summary, + save_json_results, get_args_product, SERVER_PARAMS, CLIENT_PARAMS, @@ -27,7 +28,8 @@ def run_benchmark() -> None: for client_args in get_args_product(server_args, which=CLIENT_PARAMS): response_details = run_client(client_args) - output_summary(client_args, response_details) + print_summary(client_args, response_details) + save_json_results(client_args, response_details) stop_server(server_args) diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index a6a36213b..2cbd53720 
100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -113,9 +113,8 @@ def get_results_path(args: argparse.Namespace) -> Path: ) -def output_summary(args, response_details): - args_dict = vars(args) - ps = get_summary(args_dict, response_details) +def print_summary(args, response_details): + ps = get_summary(vars(args), response_details) print( f"Deployment: {args.deployment_name} Clients: {args.num_clients}, " + f"Prompt (mean): {args.mean_prompt_length} tokens, " @@ -127,8 +126,10 @@ def output_summary(args, response_details): + f"First token received: {ps.first_token_latency:.3f} s" ) - out_json_path = get_results_path(args) +def save_json_results(args, response_details): + args_dict = vars(args) + out_json_path = get_results_path(args) os.makedirs(out_json_path.parent, exist_ok=True) with open(out_json_path, "w") as f: From eaaa1ce70a9a27a7edff148f1216fb6a72b61166 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Jan 2024 15:31:36 -0800 Subject: [PATCH 07/14] add license --- benchmarks/inference/mii/run_all.sh | 7 +++++- benchmarks/inference/mii/run_example.sh | 5 ++++ benchmarks/inference/mii/src/__init__.py | 4 +++ benchmarks/inference/mii/src/client.py | 25 +++++++++++-------- .../inference/mii/src/model_defaults.py | 5 ++++ .../mii/src/plot_effective_throughput.py | 5 ++++ .../mii/src/plot_latency_percentile.py | 5 ++++ .../inference/mii/src/plot_repl_scale.py | 5 ++++ benchmarks/inference/mii/src/plot_th_lat.py | 5 ++++ benchmarks/inference/mii/src/plot_tp_sizes.py | 6 +++++ .../inference/mii/src/postprocess_results.py | 13 +++++++--- .../mii/src/random_query_generator.py | 8 ++++-- benchmarks/inference/mii/src/run_benchmark.py | 11 +++++--- benchmarks/inference/mii/src/sample_input.py | 5 ++++ benchmarks/inference/mii/src/server.py | 1 + benchmarks/inference/mii/src/utils.py | 19 +++++++++----- 16 files changed, 102 insertions(+), 27 deletions(-) diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh index 9243b6b5a..67cf80e1f 100644 --- a/benchmarks/inference/mii/run_all.sh +++ b/benchmarks/inference/mii/run_all.sh @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) for MODEL in ${MODELS[@]}; do @@ -7,4 +12,4 @@ done # Extra runs for Mixtral with non-default settings python ./src/run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 -python ./src/run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --vllm +python ./src/run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --vllm \ No newline at end of file diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh index cdc0b6f8e..f995f441b 100644 --- a/benchmarks/inference/mii/run_example.sh +++ b/benchmarks/inference/mii/run_example.sh @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + # Run benchmark python ./src/run_benchmark.py \ --model meta-llama/Llama-2-7b-hf \ diff --git a/benchmarks/inference/mii/src/__init__.py b/benchmarks/inference/mii/src/__init__.py index e69de29bb..208299fb8 100644 --- a/benchmarks/inference/mii/src/__init__.py +++ b/benchmarks/inference/mii/src/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index 42a576c51..190ccd443 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -1,22 +1,25 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import asyncio +import json +import multiprocessing import os -import time -import random import queue -import multiprocessing +import random +import requests import threading +import time from typing import List, Iterable -import numpy as np +import numpy as np from transformers import AutoTokenizer -from random_query_generator import RandomQueryGenerator -from sample_input import all_text -import time -import json -import asyncio -import requests from postprocess_results import ResponseDetails - +from random_query_generator import RandomQueryGenerator +from sample_input import all_text from utils import parse_args, print_summary, get_args_product, CLIENT_PARAMS diff --git a/benchmarks/inference/mii/src/model_defaults.py b/benchmarks/inference/mii/src/model_defaults.py index c22169fc0..33ba5dfef 100644 --- a/benchmarks/inference/mii/src/model_defaults.py +++ b/benchmarks/inference/mii/src/model_defaults.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + MODEL_DEFAULTS = { "meta-llama/Llama-2-7b-hf": { "max_prompt_length": 4000, diff --git a/benchmarks/inference/mii/src/plot_effective_throughput.py b/benchmarks/inference/mii/src/plot_effective_throughput.py index 8e865c696..a81308774 100644 --- a/benchmarks/inference/mii/src/plot_effective_throughput.py +++ b/benchmarks/inference/mii/src/plot_effective_throughput.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import argparse from pathlib import Path import glob diff --git a/benchmarks/inference/mii/src/plot_latency_percentile.py b/benchmarks/inference/mii/src/plot_latency_percentile.py index 058dbccd3..1c1df024b 100644 --- a/benchmarks/inference/mii/src/plot_latency_percentile.py +++ b/benchmarks/inference/mii/src/plot_latency_percentile.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import argparse import glob from pathlib import Path diff --git a/benchmarks/inference/mii/src/plot_repl_scale.py b/benchmarks/inference/mii/src/plot_repl_scale.py index b46e2ad87..e52eb775e 100644 --- a/benchmarks/inference/mii/src/plot_repl_scale.py +++ b/benchmarks/inference/mii/src/plot_repl_scale.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import glob import matplotlib.pyplot as plt import argparse diff --git a/benchmarks/inference/mii/src/plot_th_lat.py b/benchmarks/inference/mii/src/plot_th_lat.py index d7c4f6ccb..87063bd6f 100644 --- a/benchmarks/inference/mii/src/plot_th_lat.py +++ b/benchmarks/inference/mii/src/plot_th_lat.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import glob import matplotlib.pyplot as plt import argparse diff --git a/benchmarks/inference/mii/src/plot_tp_sizes.py b/benchmarks/inference/mii/src/plot_tp_sizes.py index 2743e1955..15dc06bb8 100644 --- a/benchmarks/inference/mii/src/plot_tp_sizes.py +++ b/benchmarks/inference/mii/src/plot_tp_sizes.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import glob import matplotlib.pyplot as plt import argparse @@ -104,6 +109,7 @@ def output_charts(model_size, tps, bs, prompt, gen, log_dir, out_dir): if __name__ == "__main__": + raise NotImplementedError("This script is not up to date") args = get_args() for model_size, tps in tp_sizes.items(): diff --git a/benchmarks/inference/mii/src/postprocess_results.py b/benchmarks/inference/mii/src/postprocess_results.py index b898f7c8b..7e25bfddc 100644 --- a/benchmarks/inference/mii/src/postprocess_results.py +++ b/benchmarks/inference/mii/src/postprocess_results.py @@ -1,12 +1,17 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import argparse -from pathlib import Path import json -import numpy as np -from statistics import mean -from functools import reduce from dataclasses import dataclass +from functools import reduce +from pathlib import Path +from statistics import mean from typing import List +import numpy as np from transformers import AutoTokenizer diff --git a/benchmarks/inference/mii/src/random_query_generator.py b/benchmarks/inference/mii/src/random_query_generator.py index 8445af156..eca16d8ff 100644 --- a/benchmarks/inference/mii/src/random_query_generator.py +++ b/benchmarks/inference/mii/src/random_query_generator.py @@ -1,7 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import numpy as np import torch import random -import numpy as np -import time class RandomQueryGenerator: diff --git a/benchmarks/inference/mii/src/run_benchmark.py b/benchmarks/inference/mii/src/run_benchmark.py index cf8d525dd..d9cb590ad 100644 --- a/benchmarks/inference/mii/src/run_benchmark.py +++ b/benchmarks/inference/mii/src/run_benchmark.py @@ -1,3 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from client import run_client +from model_defaults import MODEL_DEFAULTS +from server import start_server, stop_server from utils import ( parse_args, print_summary, @@ -6,9 +14,6 @@ SERVER_PARAMS, CLIENT_PARAMS, ) -from server import start_server, stop_server -from client import run_client -from model_defaults import MODEL_DEFAULTS def run_benchmark() -> None: diff --git a/benchmarks/inference/mii/src/sample_input.py b/benchmarks/inference/mii/src/sample_input.py index 72552387b..bae18ce62 100644 --- a/benchmarks/inference/mii/src/sample_input.py +++ b/benchmarks/inference/mii/src/sample_input.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + # This is a sample input consisting of: # Code & Text diff --git a/benchmarks/inference/mii/src/server.py b/benchmarks/inference/mii/src/server.py index 7eb6b7fe2..73ee3c4a6 100644 --- a/benchmarks/inference/mii/src/server.py +++ b/benchmarks/inference/mii/src/server.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team + import subprocess import time diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index 2cbd53720..892779c1b 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -1,13 +1,20 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import argparse -from pathlib import Path -from postprocess_results import get_summary, ResponseDetails +import copy +import itertools import json -from datetime import datetime +import os + from dataclasses import asdict -import itertools +from datetime import datetime +from pathlib import Path from typing import Iterator, List -import copy -import os + +from postprocess_results import get_summary, ResponseDetails # For these arguments, users can provide multiple values when running the # benchmark. The benchmark will iterate over all possible combinations. From dfbe75b10ed6c8c1c77fb60647314e00d780b5d2 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Jan 2024 16:28:46 -0800 Subject: [PATCH 08/14] minor changes to simplify running benchmark --- benchmarks/inference/mii/run_example.sh | 4 +- .../src/{model_defaults.py => defaults.py} | 17 ++- benchmarks/inference/mii/src/run_benchmark.py | 21 ++- benchmarks/inference/mii/src/utils.py | 126 +++++++++++++++--- 4 files changed, 130 insertions(+), 38 deletions(-) rename benchmarks/inference/mii/src/{model_defaults.py => defaults.py} (82%) diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh index f995f441b..792b9a188 100644 --- a/benchmarks/inference/mii/run_example.sh +++ b/benchmarks/inference/mii/run_example.sh @@ -7,11 +7,11 @@ python ./src/run_benchmark.py \ --model meta-llama/Llama-2-7b-hf \ --tp_size 1 \ + --num_replicas 1 \ --max_ragged_batch_size 768 \ --mean_prompt_length 2600 \ --mean_max_new_tokens 60 \ - --stream \ - --no_model_defaults + --stream ### Gernerate the plots python ./src/plot_th_lat.py diff --git a/benchmarks/inference/mii/src/model_defaults.py b/benchmarks/inference/mii/src/defaults.py similarity index 82% rename from benchmarks/inference/mii/src/model_defaults.py rename to benchmarks/inference/mii/src/defaults.py index 33ba5dfef..7ebe6f3a7 100644 --- a/benchmarks/inference/mii/src/model_defaults.py +++ b/benchmarks/inference/mii/src/defaults.py @@ -3,12 +3,21 @@ # DeepSpeed Team +ARG_DEFAULTS = { + "tp_size": 1, + "max_ragged_batch_size": 768, + "num_replicas": 1, + "max_prompt_length": 4000, + "mean_prompt_length": 2600, + "mean_max_new_tokens": 60, +} + MODEL_DEFAULTS = { "meta-llama/Llama-2-7b-hf": { "max_prompt_length": 4000, "mean_prompt_length": (1200, 2600), "mean_max_new_tokens": (60, 128), - "tp_size": (1,), + "tp_size": 1, "num_replicas": (1, 8), }, "meta-llama/Llama-13b-hf": { @@ -33,18 +42,18 @@ "max_prompt_length": 2000, "mean_prompt_length": (1200, 1900), "mean_max_new_tokens": (60, 128), - "tp_size": (8,), + "tp_size": 8, }, "microsoft/phi-2": { "max_prompt_length": 2000, "mean_prompt_length": (1200, 1900), "mean_max_new_tokens": (60, 128), - "tp_size": (1,), + "tp_size": 1, }, 
"mistralai/Mixtral-8x7B-v0.1": { "max_prompt_length": 4000, "mean_prompt_length": (1200, 2600), "mean_max_new_tokens": (60, 128), - "tp_size": (4,), + "tp_size": 4, }, } diff --git a/benchmarks/inference/mii/src/run_benchmark.py b/benchmarks/inference/mii/src/run_benchmark.py index d9cb590ad..821e7b004 100644 --- a/benchmarks/inference/mii/src/run_benchmark.py +++ b/benchmarks/inference/mii/src/run_benchmark.py @@ -4,34 +4,31 @@ # DeepSpeed Team from client import run_client -from model_defaults import MODEL_DEFAULTS from server import start_server, stop_server from utils import ( + get_args_product, parse_args, print_summary, + results_exist, save_json_results, - get_args_product, - SERVER_PARAMS, CLIENT_PARAMS, + SERVER_PARAMS, ) def run_benchmark() -> None: args = parse_args(server_args=True, client_args=True) - if not args.no_model_defaults: - if args.model not in MODEL_DEFAULTS: - raise ValueError( - f"Model {args.model} not in MODEL_DEFAULTS. " - f"Please specify arguments manually and use the --no_model_defaults flag." - ) - for k, v in MODEL_DEFAULTS[args.model].items(): - setattr(args, k, v) - for server_args in get_args_product(args, which=SERVER_PARAMS): start_server(server_args) for client_args in get_args_product(server_args, which=CLIENT_PARAMS): + if results_exist(client_args) and not args.overwrite_results: + print( + f"Found existing results and skipping current setting. To ignore existing results, use --overwrite_results" + ) + continue + response_details = run_client(client_args) print_summary(client_args, response_details) save_json_results(client_args, response_details) diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index 892779c1b..e8c9fdda4 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -14,6 +14,7 @@ from pathlib import Path from typing import Iterator, List +from defaults import ARG_DEFAULTS, MODEL_DEFAULTS from postprocess_results import get_summary, ResponseDetails # For these arguments, users can provide multiple values when running the @@ -30,37 +31,87 @@ def parse_args( # Server args server_parser = argparse.ArgumentParser(add_help=False) - server_parser.add_argument("--tp_size", type=int, nargs="+", default=[1]) server_parser.add_argument( - "--max_ragged_batch_size", type=int, nargs="+", default=[768] + "--tp_size", type=int, nargs="+", default=None, help="Tensor parallelism size" ) - server_parser.add_argument("--num_replicas", type=int, nargs="+", default=[1]) server_parser.add_argument( - "cmd", type=str, nargs="?", choices=["start", "stop", "restart"] + "--max_ragged_batch_size", + type=int, + nargs="+", + default=None, + help="Max batch size for ragged batching", + ) + server_parser.add_argument( + "--num_replicas", + type=int, + nargs="+", + default=None, + help="Number of MII model replicas", + ) + server_parser.add_argument( + "cmd", + type=str, + nargs="?", + choices=["start", "stop", "restart"], + help="Command for running server.py to manually start/stop/restart a server", ) # Client args client_parser = argparse.ArgumentParser(add_help=False) client_parser.add_argument( - "--mean_prompt_length", type=int, nargs="+", default=[2600] + "--max_prompt_length", type=int, default=None, help="Max length a prompt can be" ) client_parser.add_argument( - "--mean_max_new_tokens", type=int, nargs="+", default=[60] + "--mean_prompt_length", + type=int, + nargs="+", + default=None, + help="Mean prompt length in tokens", + ) + client_parser.add_argument( + 
"--mean_max_new_tokens", + type=int, + nargs="+", + default=None, + help="Mean number of new tokens to generate per prompt", ) client_parser.add_argument( "--num_clients", type=int, nargs="+", default=[1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32], + help="Number of concurrent clients", + ) + client_parser.add_argument( + "--num_requests", + type=int, + default=512, + help="Number of requests to process by clients", + ) + client_parser.add_argument( + "--prompt_length_var", type=float, default=0.3, help="Variance of prompt length" + ) + client_parser.add_argument( + "--max_new_tokens_var", + type=float, + default=0.3, + help="Variance of max new tokens", + ) + client_parser.add_argument( + "--warmup", type=int, default=1, help="Number of warmup requests to process" + ) + client_parser.add_argument( + "--use_thread", action="store_true", help="Use threads instead of processes" + ) + client_parser.add_argument( + "--stream", action="store_true", help="Stream generated tokens" + ) + client_parser.add_argument( + "--out_json_dir", + type=Path, + default="./results/", + help="Directory to save result JSON files", ) - client_parser.add_argument("--num_requests", type=int, default=512) - client_parser.add_argument("--max_prompt_length", type=int, default=4000) - client_parser.add_argument("--prompt_length_var", type=float, default=0.3) - client_parser.add_argument("--max_new_tokens_var", type=float, default=0.3) - client_parser.add_argument("--warmup", type=int, default=1) - client_parser.add_argument("--use_thread", action="store_true") - client_parser.add_argument("--stream", action="store_true") - client_parser.add_argument("--out_json_dir", type=Path, default="./results/") # Create the parser, inheriting from the server and/or client parsers parents = [] @@ -71,16 +122,40 @@ def parse_args( # Common args parser = argparse.ArgumentParser(parents=parents) - parser.add_argument("--model", type=str, default="meta-llama/Llama-2-7b-hf") parser.add_argument( - "--deployment_name", type=str, default="mii-benchmark-deployment" + "--model", type=str, default="meta-llama/Llama-2-7b-hf", help="Model name" + ) + parser.add_argument( + "--deployment_name", + type=str, + default="mii-benchmark-deployment", + help="Deployment name for MII server", + ) + parser.add_argument("--vllm", action="store_true", help="Use VLLM instead of MII") + parser.add_argument( + "--overwrite_results", action="store_true", help="Overwrite existing results" ) - parser.add_argument("--vllm", action="store_true") - parser.add_argument("--no_model_defaults", action="store_true") # Parse arguments args = parser.parse_args() + # Set default values for model-specific parameters + if args.model in MODEL_DEFAULTS: + for k, v in MODEL_DEFAULTS[args.model].items(): + if getattr(args, k) is None: + setattr(args, k, v) + + # Grab any remaining default values not specified for a model + for k, v in ARG_DEFAULTS.items(): + if getattr(args, k) is None: + setattr(args, k, v) + + if not (server_args and client_args): + # If we are not running the benchmark, we need to make sure to only have one value for the server args + for k in SERVER_PARAMS: + if not isinstance(getattr(args, k), int): + setattr(args, k, getattr(args, k)[0]) + return args @@ -89,6 +164,9 @@ def get_args_product( ) -> Iterator[argparse.Namespace]: if which is None: return copy.deepcopy(args) + for k in which: + if isinstance(getattr(args, k), int): + setattr(args, k, [getattr(args, k)]) arg_values_product = itertools.product(*[getattr(args, k) for k in which]) for arg_values in 
arg_values_product: args_copy = copy.deepcopy(args) @@ -120,7 +198,9 @@ def get_results_path(args: argparse.Namespace) -> Path: ) -def print_summary(args, response_details): +def print_summary( + args: argparse.Namespace, response_details: List[ResponseDetails] +) -> None: ps = get_summary(vars(args), response_details) print( f"Deployment: {args.deployment_name} Clients: {args.num_clients}, " @@ -134,7 +214,9 @@ def print_summary(args, response_details): ) -def save_json_results(args, response_details): +def save_json_results( + args: argparse.Namespace, response_details: List[ResponseDetails] +) -> None: args_dict = vars(args) out_json_path = get_results_path(args) os.makedirs(out_json_path.parent, exist_ok=True) @@ -147,3 +229,7 @@ def save_json_results(args, response_details): "response_details": [asdict(r) for r in response_details], } json.dump(data, f, indent=2) + + +def results_exist(args: argparse.Namespace) -> bool: + return get_results_path(args).exists() From 5d0c607a7f545ae9db42108ad48e267c0b6ddaba Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Jan 2024 16:40:14 -0800 Subject: [PATCH 09/14] catch errors when starting vllm server --- benchmarks/inference/mii/run_all.sh | 2 +- benchmarks/inference/mii/src/server.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh index 67cf80e1f..88ff7c8cb 100644 --- a/benchmarks/inference/mii/run_all.sh +++ b/benchmarks/inference/mii/run_all.sh @@ -3,7 +3,7 @@ # DeepSpeed Team -MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) +MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-40B tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) for MODEL in ${MODELS[@]}; do python ./src/run_benchmark.py --model ${MODEL} --stream diff --git a/benchmarks/inference/mii/src/server.py b/benchmarks/inference/mii/src/server.py index 73ee3c4a6..8bfc9f441 100644 --- a/benchmarks/inference/mii/src/server.py +++ b/benchmarks/inference/mii/src/server.py @@ -49,12 +49,16 @@ def start_vllm_server(model: str, tp_size: int) -> None: ) p = subprocess.Popen(vllm_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) start_time = time.time() - timeout_after = 60 * 2 # 2 minutes + timeout_after = 60 * 5 # 5 minutes while True: line = p.stderr.readline().decode("utf-8") if "Application startup complete" in line: break time.sleep(1) + if "ERROR" in line: + p.terminate() + stop_vllm_server() + raise RuntimeError(f"Error starting VLLM server: {line}") if time.time() - start_time > timeout_after: p.terminate() stop_vllm_server() From c13c327c7e39c47e9ddd9e8a9522382402865d6a Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Thu, 18 Jan 2024 17:12:01 -0800 Subject: [PATCH 10/14] fix error catching with vllm --- benchmarks/inference/mii/src/defaults.py | 1 - benchmarks/inference/mii/src/server.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/inference/mii/src/defaults.py b/benchmarks/inference/mii/src/defaults.py index 7ebe6f3a7..79ce91c97 100644 --- a/benchmarks/inference/mii/src/defaults.py +++ b/benchmarks/inference/mii/src/defaults.py @@ -18,7 +18,6 @@ "mean_prompt_length": (1200, 2600), "mean_max_new_tokens": (60, 128), "tp_size": 1, - "num_replicas": (1, 8), }, "meta-llama/Llama-13b-hf": { "max_prompt_length": 4000, diff --git a/benchmarks/inference/mii/src/server.py 
b/benchmarks/inference/mii/src/server.py index 8bfc9f441..9b5c07740 100644 --- a/benchmarks/inference/mii/src/server.py +++ b/benchmarks/inference/mii/src/server.py
@@ -54,8 +54,7 @@ def start_vllm_server(model: str, tp_size: int) -> None: line = p.stderr.readline().decode("utf-8") if "Application startup complete" in line: break
- time.sleep(1)
- if "ERROR" in line:
+ if "error" in line.lower():
 p.terminate() stop_vllm_server() raise RuntimeError(f"Error starting VLLM server: {line}")
@@ -63,6 +62,7 @@ def start_vllm_server(model: str, tp_size: int) -> None: p.terminate() stop_vllm_server() raise TimeoutError("Timed out waiting for VLLM server to start")
+ time.sleep(0.01)
 def start_mii_server(
From 71447a71b8f90298d31312074d59bdd1c495f480 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 19 Jan 2024 09:37:19 -0800 Subject: [PATCH 11/14] update README
--- benchmarks/inference/mii/README.md | 49 +++++++++++++------ benchmarks/inference/mii/run_all.sh | 8 +-- .../inference/mii/{src => }/run_benchmark.py | 6 +-- benchmarks/inference/mii/run_example.sh | 2 +- 4 files changed, 43 insertions(+), 22 deletions(-) rename benchmarks/inference/mii/{src => }/run_benchmark.py (90%)
diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md index d9e475cdb..092ac4867 100644 --- a/benchmarks/inference/mii/README.md +++ b/benchmarks/inference/mii/README.md
@@ -2,38 +2,59 @@ ## Run the Benchmark
-The benchmarking scripts use DeepSpeed-FastGen in the persistent mode.
-You can start the server with the command below:
+The benchmarking scripts use DeepSpeed-FastGen in the persistent mode. You can
+run the benchmark using `run_benchmark.py`. This script will run several
+combinations of inference servers and clients with different tensor parallel
+size, number of model replicas (MII only), number of clients, prompt length, and
+max new tokens values. By default, the benchmark will run with the `meta-llama/Llama-2-7b-hf` model.
 ```bash
-python server.py [options] start
+python run_benchmark.py
 ```
-Use the -h option to view all available options. To stop the server, use this command:
+Use the -h option to view all available options. Several models have pre-defined
+default values, including `meta-llama/Llama-2-{7|13|70}b-hf`,
+`tiiuae/falcon-{40|180}B`, `microsoft/phi-2`, and `mistralai/Mixtral-8x7B-v0.1`.
+These defaults can be overridden by passing the corresponding arguments to the `run_benchmark.py` script.
+For example, to run `meta-llama/Llama-2-13b-hf` with a tensor parallel size of `1`
+and `2` (instead of the default `1`, `2`, and `4`):
-```bash
-python server.py stop
+```bash
+python run_benchmark.py --tp_size 1 2
 ```
-Once the server is up and running, initiate the client using the command below. The -h option will display all the possible options.
+By default, the benchmark runs with DeepSpeed-MII as the backend inference
+server. To change the backend to vLLM, provide the `--vllm` flag:
 ```bash
-python run_benchmark_client.py [options]
+python run_benchmark.py --vllm
 ```
-The run_all.sh script performs benchmarks across various model sizes and client numbers. For VLLM benchmarks, use the run_all_vllm.sh script. Results are logged in a directory named logs.[BENCHMARK_PARAMETERS].
+The run_all.sh script performs benchmarks across various models, client numbers,
+tensor parallel sizes, etc. This script is intended to be run on a system with
+8xA100 (80GB) GPUs available. 
It will run all the benchmarks (including vLLM)
+and collect the data used in our [DeepSpeed-FastGen
+blogs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen).
+Results are collected in `./results/`.
 ## Analyze the Benchmark Results
-The scripts mentioned below were used for generating the plots featured in our blog. Specify the root directory for log files using --log_dir.
+The scripts mentioned below were used for generating the plots featured in our
+blog. Specify the root directory for log files using `--log_dir`. The generated
+figures will be saved to `./plots/`.
-- `plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts.
-- `plot_effective_throughput.py`: Use this to chart effective throughput.
-- `plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies.
+- `src/plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts.
+- `src/plot_effective_throughput.py`: Use this to chart effective throughput.
+- `src/plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies.
 ## Running an End-to-End Example
-To quickly experience the end-to-end process of running our benchmark and getting results, you can use the `run_example.sh`. This script is designed to execute the benchmark with a specific configuration. The plots below will be generated in the charts directory. These plots show the performance as depicted in figure 8 of our blog [post.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#f-other-hardware-platforms)
+To quickly experience the end-to-end process of running our benchmark and
+getting results, you can use the `run_example.sh` script. This script is designed to
+execute the benchmark with a specific configuration. The plots below will be
+generated in the `./plots/` directory. 
These plots show the performance as +depicted in figure 8 of our blog +[post.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#f-other-hardware-platforms) ```bash bash run_example.sh diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh index 88ff7c8cb..095b3ae12 100644 --- a/benchmarks/inference/mii/run_all.sh +++ b/benchmarks/inference/mii/run_all.sh @@ -6,10 +6,10 @@ MODELS=(meta-llama/Llama-2-7b-hf meta-llama/Llama-2-13b-hf meta-llama/Llama-2-70b-hf tiiuae/falcon-40B tiiuae/falcon-180B microsoft/phi-2 mistralai/Mixtral-8x7B-v0.1) for MODEL in ${MODELS[@]}; do - python ./src/run_benchmark.py --model ${MODEL} --stream - python ./src/run_benchmark.py --model ${MODEL} --stream --vllm + python ./run_benchmark.py --model ${MODEL} --stream + python ./run_benchmark.py --model ${MODEL} --stream --vllm done # Extra runs for Mixtral with non-default settings -python ./src/run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 -python ./src/run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --vllm \ No newline at end of file +python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 +python ./run_benchmark.py --model mistralai/Mixtral-8x7B-v0.1 --stream --tp_size 4 --mean_prompt_length 500 --mean_max_new_tokens 150 500 1024 --vllm \ No newline at end of file diff --git a/benchmarks/inference/mii/src/run_benchmark.py b/benchmarks/inference/mii/run_benchmark.py similarity index 90% rename from benchmarks/inference/mii/src/run_benchmark.py rename to benchmarks/inference/mii/run_benchmark.py index 821e7b004..96e88155f 100644 --- a/benchmarks/inference/mii/src/run_benchmark.py +++ b/benchmarks/inference/mii/run_benchmark.py @@ -3,9 +3,9 @@ # DeepSpeed Team -from client import run_client -from server import start_server, stop_server -from utils import ( +from src.client import run_client +from src.server import start_server, stop_server +from src.utils import ( get_args_product, parse_args, print_summary, diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh index 792b9a188..e80253828 100644 --- a/benchmarks/inference/mii/run_example.sh +++ b/benchmarks/inference/mii/run_example.sh @@ -4,7 +4,7 @@ # DeepSpeed Team # Run benchmark -python ./src/run_benchmark.py \ +python ./run_benchmark.py \ --model meta-llama/Llama-2-7b-hf \ --tp_size 1 \ --num_replicas 1 \ From 682e904641e4dbea526ea825f33b69777752183b Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 19 Jan 2024 09:42:09 -0800 Subject: [PATCH 12/14] added requirement --- benchmarks/inference/mii/requirements.txt | 5 +++++ benchmarks/inference/mii/src/client.py | 8 ++++---- .../inference/mii/src/plot_effective_throughput.py | 2 +- .../inference/mii/src/plot_latency_percentile.py | 2 +- benchmarks/inference/mii/src/plot_repl_scale.py | 2 +- benchmarks/inference/mii/src/plot_th_lat.py | 12 +++++++----- benchmarks/inference/mii/src/plot_tp_sizes.py | 2 +- benchmarks/inference/mii/src/server.py | 2 +- benchmarks/inference/mii/src/utils.py | 4 ++-- 9 files changed, 23 insertions(+), 16 deletions(-) create mode 100644 benchmarks/inference/mii/requirements.txt diff --git a/benchmarks/inference/mii/requirements.txt b/benchmarks/inference/mii/requirements.txt new file mode 100644 index 000000000..7ac014ef8 --- 
/dev/null +++ b/benchmarks/inference/mii/requirements.txt @@ -0,0 +1,5 @@ +transformers +matplotlib +deepspeed-mii>=0.2.0 +vllm>=0.2.7 +numpy \ No newline at end of file diff --git a/benchmarks/inference/mii/src/client.py b/benchmarks/inference/mii/src/client.py index 190ccd443..c440d0b63 100644 --- a/benchmarks/inference/mii/src/client.py +++ b/benchmarks/inference/mii/src/client.py @@ -17,10 +17,10 @@ import numpy as np from transformers import AutoTokenizer -from postprocess_results import ResponseDetails -from random_query_generator import RandomQueryGenerator -from sample_input import all_text -from utils import parse_args, print_summary, get_args_product, CLIENT_PARAMS +from .postprocess_results import ResponseDetails +from .random_query_generator import RandomQueryGenerator +from .sample_input import all_text +from .utils import parse_args, print_summary, get_args_product, CLIENT_PARAMS def call_mii(client, input_tokens, max_new_tokens, stream): diff --git a/benchmarks/inference/mii/src/plot_effective_throughput.py b/benchmarks/inference/mii/src/plot_effective_throughput.py index a81308774..efa471c76 100644 --- a/benchmarks/inference/mii/src/plot_effective_throughput.py +++ b/benchmarks/inference/mii/src/plot_effective_throughput.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd -from postprocess_results import read_json, get_tokenizer +from .postprocess_results import read_json, get_tokenizer RAGGED_BATCH_SIZE = 768 SLA_PROMPT_TOKENS_PER_SEC = 512 diff --git a/benchmarks/inference/mii/src/plot_latency_percentile.py b/benchmarks/inference/mii/src/plot_latency_percentile.py index 1c1df024b..9b08f12da 100644 --- a/benchmarks/inference/mii/src/plot_latency_percentile.py +++ b/benchmarks/inference/mii/src/plot_latency_percentile.py @@ -10,7 +10,7 @@ import numpy as np import itertools -from postprocess_results import read_json, get_token_latency +from .postprocess_results import read_json, get_token_latency bs = 768 SKIP_HEAD_TOKEN_NUM = 2 diff --git a/benchmarks/inference/mii/src/plot_repl_scale.py b/benchmarks/inference/mii/src/plot_repl_scale.py index e52eb775e..7791be0ca 100644 --- a/benchmarks/inference/mii/src/plot_repl_scale.py +++ b/benchmarks/inference/mii/src/plot_repl_scale.py @@ -9,7 +9,7 @@ from pathlib import Path import numpy as np -from postprocess_results import read_json, get_summary +from .postprocess_results import read_json, get_summary bs = 768 diff --git a/benchmarks/inference/mii/src/plot_th_lat.py b/benchmarks/inference/mii/src/plot_th_lat.py index 87063bd6f..b6ac0163e 100644 --- a/benchmarks/inference/mii/src/plot_th_lat.py +++ b/benchmarks/inference/mii/src/plot_th_lat.py @@ -3,14 +3,16 @@ # DeepSpeed Team -import glob -import matplotlib.pyplot as plt import argparse -from pathlib import Path -import numpy as np -from postprocess_results import read_json, get_summary +import glob import os import re +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + +from .postprocess_results import read_json, get_summary def get_args(): diff --git a/benchmarks/inference/mii/src/plot_tp_sizes.py b/benchmarks/inference/mii/src/plot_tp_sizes.py index 15dc06bb8..f02b643f2 100644 --- a/benchmarks/inference/mii/src/plot_tp_sizes.py +++ b/benchmarks/inference/mii/src/plot_tp_sizes.py @@ -9,7 +9,7 @@ from pathlib import Path import numpy as np -from postprocess_results import read_json, get_summary +from .postprocess_results import read_json, get_summary bs = 768 diff --git a/benchmarks/inference/mii/src/server.py 
b/benchmarks/inference/mii/src/server.py index 9b5c07740..5b55f90e1 100644 --- a/benchmarks/inference/mii/src/server.py +++ b/benchmarks/inference/mii/src/server.py @@ -10,7 +10,7 @@ from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig from deepspeed.inference.v2.ragged import DSStateManagerConfig -from utils import parse_args, SERVER_PARAMS +from .utils import parse_args, SERVER_PARAMS def start_server(args): diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index e8c9fdda4..46892ce0b 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -14,8 +14,8 @@ from pathlib import Path from typing import Iterator, List -from defaults import ARG_DEFAULTS, MODEL_DEFAULTS -from postprocess_results import get_summary, ResponseDetails +from .defaults import ARG_DEFAULTS, MODEL_DEFAULTS +from .postprocess_results import get_summary, ResponseDetails # For these arguments, users can provide multiple values when running the # benchmark. The benchmark will iterate over all possible combinations. From fdaa90742113ddeca952b97827201975cbe64a37 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Fri, 19 Jan 2024 10:46:52 -0800 Subject: [PATCH 13/14] fix hang with vllm runs --- benchmarks/inference/mii/src/server.py | 13 +++---------- benchmarks/inference/mii/src/utils.py | 6 +++--- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/benchmarks/inference/mii/src/server.py b/benchmarks/inference/mii/src/server.py index 5b55f90e1..d0ecabaf3 100644 --- a/benchmarks/inference/mii/src/server.py +++ b/benchmarks/inference/mii/src/server.py @@ -47,7 +47,9 @@ def start_vllm_server(model: str, tp_size: int) -> None: "--model", model, ) - p = subprocess.Popen(vllm_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen( + vllm_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, close_fds=True + ) start_time = time.time() timeout_after = 60 * 5 # 5 minutes while True: @@ -109,15 +111,6 @@ def stop_mii_server(deployment_name): if __name__ == "__main__": args = parse_args(server_args=True) - # Make sure only single values were passed for parameters, multiple values - # can be used with the run_benchmark.py script - for param in SERVER_PARAMS: - if len(getattr(args, param)) > 1: - raise ValueError( - f"Cannot specify multiple values for {param} when running server" - ) - setattr(args, param, getattr(args, param)[0]) - if args.cmd == "start": start_server(args) elif args.cmd == "stop": diff --git a/benchmarks/inference/mii/src/utils.py b/benchmarks/inference/mii/src/utils.py index 46892ce0b..6499a54b4 100644 --- a/benchmarks/inference/mii/src/utils.py +++ b/benchmarks/inference/mii/src/utils.py @@ -142,15 +142,15 @@ def parse_args( # Set default values for model-specific parameters if args.model in MODEL_DEFAULTS: for k, v in MODEL_DEFAULTS[args.model].items(): - if getattr(args, k) is None: + if hasattr(args, k) and getattr(args, k) is None: setattr(args, k, v) # Grab any remaining default values not specified for a model for k, v in ARG_DEFAULTS.items(): - if getattr(args, k) is None: + if hasattr(args, k) and getattr(args, k) is None: setattr(args, k, v) - if not (server_args and client_args): + if server_args and not client_args: # If we are not running the benchmark, we need to make sure to only have one value for the server args for k in SERVER_PARAMS: if not isinstance(getattr(args, k), int): From a8e5c5e16e954bd139065782221d3ae278ec7900 Mon Sep 17 00:00:00 2001 From: Michael 
Wyatt Date: Fri, 19 Jan 2024 14:11:37 -0800 Subject: [PATCH 14/14] fix import --- benchmarks/inference/mii/src/plot_th_lat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/inference/mii/src/plot_th_lat.py b/benchmarks/inference/mii/src/plot_th_lat.py index b6ac0163e..9aa292ca6 100644 --- a/benchmarks/inference/mii/src/plot_th_lat.py +++ b/benchmarks/inference/mii/src/plot_th_lat.py @@ -12,7 +12,7 @@ import matplotlib.pyplot as plt import numpy as np -from .postprocess_results import read_json, get_summary +from postprocess_results import read_json, get_summary def get_args():
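
Patches 9, 10, and 13 all touch the vLLM startup check in `src/server.py`. Taken together, they leave it roughly in the shape sketched below; this is an illustrative consolidation rather than the file's exact contents (the function name and `timeout_s` parameter are invented for the sketch, and the `stop_vllm_server()` cleanup calls from the real code are omitted).

```python
import subprocess
import time


def wait_for_vllm_ready(vllm_cmd, timeout_s=60 * 5):
    # Discard stdout: an unread PIPE can fill its buffer and block the child,
    # which is the hang addressed in patch 13. Watch stderr line by line for
    # either the readiness message or an error.
    p = subprocess.Popen(
        vllm_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, close_fds=True
    )
    start_time = time.time()
    while True:
        line = p.stderr.readline().decode("utf-8")
        if "Application startup complete" in line:
            return p
        if "error" in line.lower():
            p.terminate()
            raise RuntimeError(f"Error starting VLLM server: {line}")
        if time.time() - start_time > timeout_s:
            p.terminate()
            raise TimeoutError("Timed out waiting for VLLM server to start")
        time.sleep(0.01)
```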
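
On the argument sweep itself: `run_benchmark.py` iterates over every combination of the server and client parameters via the `get_args_product` helper in `src/utils.py`, shown in the hunks above. The following is a minimal, self-contained sketch of that pattern, not the benchmark's exact code; the example `Namespace` values and the `yield`-based handling of the `which is None` case are illustrative.

```python
import argparse
import copy
import itertools
from typing import Iterator, List, Optional


def get_args_product(
    args: argparse.Namespace, which: Optional[List[str]] = None
) -> Iterator[argparse.Namespace]:
    # Expand the listed fields into every combination, yielding one
    # Namespace per combination.
    if which is None:
        yield copy.deepcopy(args)
        return
    # Promote bare scalars to single-element lists so product() is uniform.
    for k in which:
        if isinstance(getattr(args, k), int):
            setattr(args, k, [getattr(args, k)])
    for values in itertools.product(*[getattr(args, k) for k in which]):
        args_copy = copy.deepcopy(args)
        for k, v in zip(which, values):
            setattr(args_copy, k, v)
        yield args_copy


# Example sweep: 2 tensor-parallel sizes x 2 client counts -> 4 configurations.
ns = argparse.Namespace(
    model="meta-llama/Llama-2-7b-hf", tp_size=[1, 2], num_clients=[1, 16]
)
for cfg in get_args_product(ns, which=["tp_size", "num_clients"]):
    print(cfg.tp_size, cfg.num_clients)
```

Each yielded namespace corresponds to one benchmark configuration, which is how a single `run_benchmark.py` invocation can sweep tensor parallel sizes, replica counts, client counts, prompt lengths, and max new tokens.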