diff --git a/dev/bench/data.js b/dev/bench/data.js index adefe31..161a649 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,54 +1,8 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1737701945375, + "lastUpdate": 1737771196536, "repoUrl": "https://github.com/neuralmagic/nm-vllm-ent", "entries": { "smaller_is_better": [ - { - "commit": { - "author": { - "name": "Andy Linfoot", - "username": "andy-neuma", - "email": "78757007+andy-neuma@users.noreply.github.com" - }, - "committer": { - "name": "GitHub", - "username": "web-flow", - "email": "noreply@github.com" - }, - "id": "0507e27d49749501ebf2db85d210dee03da59315", - "message": "Remove `magic_wand` (#172)\n\nSUMMARY:\r\n* remove \"magic wand\" from \"nm-vllm\"\r\n* update neural magic docker\r\n* update `collect_env.py`\r\n\r\nNOTE: final run was cancelled, since it was just @derekk-nm disabling a\r\ntest ... changes ran green ...\r\nhttps://github.com/neuralmagic/nm-vllm-ent/actions/runs/12602081688\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\n---------\r\n\r\nCo-authored-by: andy-neuma ", - "timestamp": "2025-01-03T19:52:36Z", - "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/0507e27d49749501ebf2db85d210dee03da59315" - }, - "date": 1736318482512, - "tool": "customSmallerIsBetter", - "benches": [ - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 137.0293802022934, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250108\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-08 06:19:34 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 25.2788263601787, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250108\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-08 06:19:34 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" - }, - { - "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 588.6454928200692, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250108\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-08 06:39:55 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" - }, - { - "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", - "value": 20.957360656216675, - "unit": "ms", - "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250108\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - mistralai/Mixtral-8x7B-Instruct-v0.1\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"tokenizer\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'tokenizer': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-08 06:39:55 UTC\", \"model\": \"mistralai/Mixtral-8x7B-Instruct-v0.1\", \"dataset\": \"sharegpt\"}" - } - ] - }, { "commit": { "author": { @@ -2302,6 +2256,52 @@ window.BENCHMARK_DATA = { "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250124\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132), _CudaDeviceProperties(name='NVIDIA H100 80GB HBM3', major=9, minor=0, total_memory=80994MB, multi_processor_count=132)]\", \"cuda_device_names\": [\"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\", \"NVIDIA H100 80GB HBM3\"]}, \"gpu_description\": \"NVIDIA H100 80GB HBM3 x 4\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-70B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 4, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-70B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-70B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 4, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-24 06:37:11 UTC\", \"model\": \"meta-llama/Meta-Llama-3-70B-Instruct\", \"dataset\": \"sharegpt\"}" } ] + }, + { + "commit": { + "author": { + "name": "Domenic Barbuzzi", + "username": "dbarbuzzi", + "email": "domenic@neuralmagic.com" + }, + "committer": { + "name": "GitHub", + "username": "web-flow", + "email": "noreply@github.com" + }, + "id": "4e51ab3e0aac991d2ded2bb55fd5a957e8b1477a", + "message": "Use pytest-nm-releng plugin for reporting (#176)\n\nThis PR updates the command-running action/scripts to use the\n[`pytest-nm-releng`](https://github.com/neuralmagic/pytest-nm-releng)\npytest plugin for the creation of JUnit reports and code coverage\nreports (when enabled).\n\nThe previous method had the command runner script checking if the\ncommand being run was a `pytest` command and, if so, it would append the\nappropriate CLI flags based on what was enabled.\n\nThis was problematic if tests were being executed indirectly; namely, if\nthe command runner script was running something like a Bash script which\nin turn ran the pytest commands. This prevented the command runner from\nbeing able to append the CLI flags to create uniquely named report files\nand, as a result, those tests would not have results captured and\nreported outside the action/job.\n\nWith the new plugin, the new method does away with that and simply sets\n2-3 env vars before running the commands (based on what features are\nenabled) and lets the pytest plugin do the heavy lifting of generating\nunique JUnit report names without needing to append any CLI flags to any\ncommands.", + "timestamp": "2025-01-09T16:54:04Z", + "url": "https://github.com/neuralmagic/nm-vllm-ent/commit/4e51ab3e0aac991d2ded2bb55fd5a957e8b1477a" + }, + "date": 1737771194969, + "tool": "customSmallerIsBetter", + "benches": [ + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 236.603734770005, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250125\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22593MB, multi_processor_count=58)]\", \"cuda_device_names\": [\"NVIDIA L4\"]}, \"gpu_description\": \"NVIDIA L4 x 1\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 1, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-25 02:12:01 UTC\", \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\", \"dataset\": \"sharegpt\"}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 83.35819514630768, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250125\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22593MB, multi_processor_count=58)]\", \"cuda_device_names\": [\"NVIDIA L4\"]}, \"gpu_description\": \"NVIDIA L4 x 1\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\", \"tokenizer\": \"meta-llama/Meta-Llama-3-8B-Instruct\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 1, \"server_args\": \"{'model': 'meta-llama/Meta-Llama-3-8B-Instruct', 'tokenizer': 'meta-llama/Meta-Llama-3-8B-Instruct', 'max-model-len': 4096, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-25 02:12:01 UTC\", \"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\", \"dataset\": \"sharegpt\"}" + }, + { + "name": "{\"name\": \"mean_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 36.501849123334296, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250125\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22593MB, multi_processor_count=58)]\", \"cuda_device_names\": [\"NVIDIA L4\"]}, \"gpu_description\": \"NVIDIA L4 x 1\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"facebook/opt-350m\", \"tokenizer\": \"facebook/opt-350m\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 1, \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-25 02:02:45 UTC\", \"model\": \"facebook/opt-350m\", \"dataset\": \"sharegpt\"}" + }, + { + "name": "{\"name\": \"mean_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA L4 x 1\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\"}", + "value": 5.989375408393708, + "unit": "ms", + "extra": "{\"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"benchmarking_context\": {\"vllm_version\": \"0.6.3.0.20250125\", \"python_version\": \"3.10.12\", \"torch_version\": \"2.4.0+cu121\", \"torch_cuda_version\": \"12.1\", \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22593MB, multi_processor_count=58)]\", \"cuda_device_names\": [\"NVIDIA L4\"]}, \"gpu_description\": \"NVIDIA L4 x 1\", \"script_name\": \"benchmark_serving.py\", \"script_args\": {\"description\": \"VLLM Serving - Dense\\nmodel - facebook/opt-350m\\nmax-model-len - 2048\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"backend\": \"vllm\", \"version\": \"N/A\", \"base_url\": null, \"host\": \"127.0.0.1\", \"port\": 9000, \"endpoint\": \"/generate\", \"dataset\": \"sharegpt\", \"num_input_tokens\": null, \"num_output_tokens\": null, \"model\": \"facebook/opt-350m\", \"tokenizer\": \"facebook/opt-350m\", \"best_of\": 1, \"use_beam_search\": false, \"log_model_io\": false, \"seed\": 0, \"trust_remote_code\": false, \"disable_tqdm\": false, \"save_directory\": \"benchmark-results\", \"num_prompts_\": null, \"request_rate_\": null, \"nr_qps_pair_\": [300, \"1.0\"], \"server_tensor_parallel_size\": 1, \"server_args\": \"{'model': 'facebook/opt-350m', 'tokenizer': 'facebook/opt-350m', 'max-model-len': 2048, 'host': '127.0.0.1', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"}, \"date\": \"2025-01-25 02:02:45 UTC\", \"model\": \"facebook/opt-350m\", \"dataset\": \"sharegpt\"}" + } + ] } ] }