diff --git a/src/fmbench/configs/bedrock/config-bedrock-llama3-1-70b-streaming.yml b/src/fmbench/configs/bedrock/config-bedrock-llama3-1-70b-streaming.yml new file mode 100644 index 00000000..d35b1238 --- /dev/null +++ b/src/fmbench/configs/bedrock/config-bedrock-llama3-1-70b-streaming.yml @@ -0,0 +1,210 @@ +general: + name: "fmbench-bedrock-llama3-1" + model_name: "Llama3-1-70b Model on Amazon Bedrock" + +# AWS and SageMaker settings +aws: + # AWS region, this parameter is templatized, no need to change + region: {region} + # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change + sagemaker_execution_role: {role_arn} + # S3 bucket to which metrics, plots and reports would be written to + bucket: {write_bucket} ## add the name of your desired bucket + +# directory paths in the write bucket, no need to change these +dir_paths: + data_prefix: data + prompts_prefix: prompts + all_prompts_file: all_prompts.csv + metrics_dir: metrics + models_dir: models + metadata_dir: metadata + +# S3 information for reading datasets, scripts and tokenizer +s3_read_data: + # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-region-account_id + read_bucket: {read_bucket} + scripts_prefix: scripts ## add your own scripts in case you are using anything that is not on jumpstart + + # S3 prefix in the read bucket where deployment and inference scripts should be placed + scripts_prefix: scripts + + # deployment and inference script files to be downloaded are placed in this list + # only needed if you are creating a new deployment script or inference script + # your HuggingFace token does need to be in this list and should be called "hf_token.txt" + script_files: + - hf_token.txt + + # configuration files (like this one) are placed in this prefix + configs_prefix: configs + + # list of configuration files to download, for now only pricing.yml needs to be downloaded + config_files: + - pricing.yml + + # S3 prefix for the dataset files + source_data_prefix: source_data + # list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench + source_data_files: + - 2wikimqa_e.jsonl + - 2wikimqa.jsonl + - hotpotqa_e.jsonl + - hotpotqa.jsonl + - narrativeqa.jsonl + - triviaqa_e.jsonl + - triviaqa.jsonl + + # S3 prefix for the tokenizer to be used with the models + # NOTE 1: the same tokenizer is used with all the models being tested through a config file + # NOTE 2: place your model specific tokenizers in a prefix named as _tokenizer + # so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer + tokenizer_prefix: llama3_1_tokenizer + + # S3 prefix for prompt templates + prompt_template_dir: prompt_template + + # prompt template to use, NOTE: same prompt template gets used for all models being tested through a config file + # the FMBench repo already contains a bunch of prompt templates so review those first before creating a new one + prompt_template_file: prompt_template_llama3.txt + +# steps to run, usually all of these would be +# set to yes so nothing needs to change here +# you could, however, bypass some steps for example +# set the 2_deploy_model.ipynb to no if you are re-running +# the same config file and the model is already deployed +run_steps: + 0_setup.ipynb: yes + 1_generate_data.ipynb: yes + 2_deploy_model.ipynb: no + 3_run_inference.ipynb: yes + 4_model_metric_analysis.ipynb: yes + 5_cleanup.ipynb: no + +datasets: + # dataset related 
configuration + prompt_template_keys: + - input + - context + + # if your dataset has multiple languages and it has a language + # field then you could filter it for a language. Similarly, + # you can filter your dataset to only keep prompts between + # a certain token length limit (the token length is determined + # using the tokenizer you provide in the tokenizer_prefix prefix in the + # read S3 bucket). Each of the array entries below create a payload file + # containing prompts matching the language and token length criteria. + filters: + - language: en + min_length_in_tokens: 1 + max_length_in_tokens: 500 + payload_file: payload_en_1-500.jsonl + - language: en + min_length_in_tokens: 500 + max_length_in_tokens: 1000 + payload_file: payload_en_500-1000.jsonl + - language: en + min_length_in_tokens: 1000 + max_length_in_tokens: 2000 + payload_file: payload_en_1000-2000.jsonl + - language: en + min_length_in_tokens: 2000 + max_length_in_tokens: 3000 + payload_file: payload_en_2000-3000.jsonl + - language: en + min_length_in_tokens: 3000 + max_length_in_tokens: 4000 + payload_file: payload_en_3000-4000.jsonl + - language: en + min_length_in_tokens: 3000 + max_length_in_tokens: 3840 + payload_file: payload_en_3000-3840.jsonl + +# While the tests would run on all the datasets +# configured in the experiment entries below but +# the price:performance analysis is only done for 1 +# dataset which is listed below as the dataset_of_interest +metrics: + dataset_of_interest: en_3000-3840 + +# all pricing information is in the pricing.yml file +# this file is provided in the repo. You can add entries +# to this file for new instance types and new Bedrock models +pricing: pricing.yml + +# inference parameters, these are added to the payload +# for each inference request. The list here is not static +# any parameter supported by the inference container can be +# added to the list. Put the sagemaker parameters in the sagemaker +# section, bedrock parameters in the bedrock section (not shown here). +# Use the section name (sagemaker in this example) in the inference_spec.parameter_set +# section under experiments. +inference_parameters: + bedrock: + temperature: 0.1 + max_tokens: 100 + top_p: 0.92 + caching: False + +# Configuration for experiments to be run. The experiments section is an array +# so more than one experiments can be added, these could belong to the same model +# but different instance types, or different models, or even different hosting +# options (such as one experiment is SageMaker and the other is Bedrock). +experiments: + - name: meta.llama3-1-70b-instruct-v1:0 + # model_id is interpreted in conjunction with the deployment_script, so if you + # use a JumpStart model id then set the deployment_script to jumpstart.py. + # if deploying directly from HuggingFace this would be a HuggingFace model id + # see the DJL serving deployment script in the code repo for reference. + model_id: meta.llama3-1-70b-instruct-v1:0 + model_version: + model_name: meta.llama3-1-70b-instruct-v1:0 + ep_name: meta.llama3-1-70b-instruct-v1:0 + instance_type: meta.llama3-1-70b-instruct-v1:0 + image_uri: + deploy: no + # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart + # scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own. + # See repo for details + instance_count: + deployment_script: + # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker + # and Bedrock. You can also add your own. 
See repo for details + inference_script: bedrock_predictor.py + inference_spec: + split_input_and_parameters: no + # this should match one of the sections in the inference_parameters section above + parameter_set: bedrock + # to stream responses, set stream to true. Enter the start and stop token for the + # Time To First Token, Time To Last Token, and Time Per Output Token (TTFT, TTLT, TPOT) + # metrics to be calculated. The responses from bedrock stream is received in chunks, so mention + # the stop token only. + stream: True + start_token: + stop_token: "<|eot_id|>" + # runs are done for each combination of payload file and concurrency level + payload_files: + - payload_en_1-500.jsonl + - payload_en_500-1000.jsonl + - payload_en_1000-2000.jsonl + - payload_en_2000-3000.jsonl + - payload_en_3000-3840.jsonl + # concurrency level refers to number of requests sent in parallel to an endpoint + # the next set of requests is sent once responses for all concurrent requests have + # been received. + + # for streaming responses on bedrock, only a concurrency of 1 is supported on FMBench + concurrency_levels: + - 1 + # Environment variables to be passed to the container + # this is not a fixed list, you can add more parameters as applicable. + env: + +report: + latency_budget: 2 + cost_per_10k_txn_budget: 100 + error_rate_budget: 0 + per_inference_request_file: per_inference_request_results.csv + all_metrics_file: all_metrics.csv + txn_count_for_showing_cost: 10000 + v_shift_w_single_instance: 0.025 + v_shift_w_gt_one_instance: 0.025 diff --git a/src/fmbench/configs/bedrock/config-bedrock-llama3-1-8b-streaming.yml b/src/fmbench/configs/bedrock/config-bedrock-llama3-1-8b-streaming.yml new file mode 100644 index 00000000..759f963c --- /dev/null +++ b/src/fmbench/configs/bedrock/config-bedrock-llama3-1-8b-streaming.yml @@ -0,0 +1,210 @@ +general: + name: "fmbench-bedrock-llama3-1" + model_name: "Llama3-1-8b Model on Amazon Bedrock" + +# AWS and SageMaker settings +aws: + # AWS region, this parameter is templatized, no need to change + region: {region} + # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change + sagemaker_execution_role: {role_arn} + # S3 bucket to which metrics, plots and reports would be written to + bucket: {write_bucket} ## add the name of your desired bucket + +# directory paths in the write bucket, no need to change these +dir_paths: + data_prefix: data + prompts_prefix: prompts + all_prompts_file: all_prompts.csv + metrics_dir: metrics + models_dir: models + metadata_dir: metadata + +# S3 information for reading datasets, scripts and tokenizer +s3_read_data: + # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-region-account_id + read_bucket: {read_bucket} + scripts_prefix: scripts ## add your own scripts in case you are using anything that is not on jumpstart + + # S3 prefix in the read bucket where deployment and inference scripts should be placed + scripts_prefix: scripts + + # deployment and inference script files to be downloaded are placed in this list + # only needed if you are creating a new deployment script or inference script + # your HuggingFace token does need to be in this list and should be called "hf_token.txt" + script_files: + - hf_token.txt + + # configuration files (like this one) are placed in this prefix + configs_prefix: configs + + # list of configuration files to download, for now only pricing.yml needs to be downloaded + config_files: + - pricing.yml + + # S3 prefix for the 
dataset files + source_data_prefix: source_data + # list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench + source_data_files: + - 2wikimqa_e.jsonl + - 2wikimqa.jsonl + - hotpotqa_e.jsonl + - hotpotqa.jsonl + - narrativeqa.jsonl + - triviaqa_e.jsonl + - triviaqa.jsonl + + # S3 prefix for the tokenizer to be used with the models + # NOTE 1: the same tokenizer is used with all the models being tested through a config file + # NOTE 2: place your model specific tokenizers in a prefix named as _tokenizer + # so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer + tokenizer_prefix: llama3_1_tokenizer + + # S3 prefix for prompt templates + prompt_template_dir: prompt_template + + # prompt template to use, NOTE: same prompt template gets used for all models being tested through a config file + # the FMBench repo already contains a bunch of prompt templates so review those first before creating a new one + prompt_template_file: prompt_template_llama3.txt + +# steps to run, usually all of these would be +# set to yes so nothing needs to change here +# you could, however, bypass some steps for example +# set the 2_deploy_model.ipynb to no if you are re-running +# the same config file and the model is already deployed +run_steps: + 0_setup.ipynb: yes + 1_generate_data.ipynb: yes + 2_deploy_model.ipynb: no + 3_run_inference.ipynb: yes + 4_model_metric_analysis.ipynb: yes + 5_cleanup.ipynb: no + +datasets: + # dataset related configuration + prompt_template_keys: + - input + - context + + # if your dataset has multiple languages and it has a language + # field then you could filter it for a language. Similarly, + # you can filter your dataset to only keep prompts between + # a certain token length limit (the token length is determined + # using the tokenizer you provide in the tokenizer_prefix prefix in the + # read S3 bucket). Each of the array entries below create a payload file + # containing prompts matching the language and token length criteria. + filters: + - language: en + min_length_in_tokens: 1 + max_length_in_tokens: 500 + payload_file: payload_en_1-500.jsonl + - language: en + min_length_in_tokens: 500 + max_length_in_tokens: 1000 + payload_file: payload_en_500-1000.jsonl + - language: en + min_length_in_tokens: 1000 + max_length_in_tokens: 2000 + payload_file: payload_en_1000-2000.jsonl + - language: en + min_length_in_tokens: 2000 + max_length_in_tokens: 3000 + payload_file: payload_en_2000-3000.jsonl + - language: en + min_length_in_tokens: 3000 + max_length_in_tokens: 4000 + payload_file: payload_en_3000-4000.jsonl + - language: en + min_length_in_tokens: 3000 + max_length_in_tokens: 3840 + payload_file: payload_en_3000-3840.jsonl + +# While the tests would run on all the datasets +# configured in the experiment entries below but +# the price:performance analysis is only done for 1 +# dataset which is listed below as the dataset_of_interest +metrics: + dataset_of_interest: en_3000-3840 + +# all pricing information is in the pricing.yml file +# this file is provided in the repo. You can add entries +# to this file for new instance types and new Bedrock models +pricing: pricing.yml + +# inference parameters, these are added to the payload +# for each inference request. The list here is not static +# any parameter supported by the inference container can be +# added to the list. 
Put the sagemaker parameters in the sagemaker +# section, bedrock parameters in the bedrock section (not shown here). +# Use the section name (sagemaker in this example) in the inference_spec.parameter_set +# section under experiments. +inference_parameters: + bedrock: + temperature: 0.1 + max_tokens: 100 + top_p: 0.92 + caching: False + +# Configuration for experiments to be run. The experiments section is an array +# so more than one experiments can be added, these could belong to the same model +# but different instance types, or different models, or even different hosting +# options (such as one experiment is SageMaker and the other is Bedrock). +experiments: + - name: meta.llama3-1-8b-instruct-v1:0 + # model_id is interpreted in conjunction with the deployment_script, so if you + # use a JumpStart model id then set the deployment_script to jumpstart.py. + # if deploying directly from HuggingFace this would be a HuggingFace model id + # see the DJL serving deployment script in the code repo for reference. + model_id: meta.llama3-1-8b-instruct-v1:0 + model_version: + model_name: meta.llama3-1-8b-instruct-v1:0 + ep_name: meta.llama3-1-8b-instruct-v1:0 + instance_type: meta.llama3-1-8b-instruct-v1:0 + image_uri: + deploy: no + # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart + # scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own. + # See repo for details + instance_count: + deployment_script: + # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker + # and Bedrock. You can also add your own. See repo for details + inference_script: bedrock_predictor.py + inference_spec: + split_input_and_parameters: no + # this should match one of the sections in the inference_parameters section above + parameter_set: bedrock + # to stream responses, set stream to true. Enter the start and stop token for the + # Time To First Token, Time To Last Token, and Time Per Output Token (TTFT, TTLT, TPOT) + # metrics to be calculated. The responses from bedrock stream is received in chunks, so mention + # the stop token only. + stream: True + start_token: + stop_token: "<|eot_id|>" + # runs are done for each combination of payload file and concurrency level + payload_files: + - payload_en_1-500.jsonl + - payload_en_500-1000.jsonl + - payload_en_1000-2000.jsonl + - payload_en_2000-3000.jsonl + - payload_en_3000-3840.jsonl + # concurrency level refers to number of requests sent in parallel to an endpoint + # the next set of requests is sent once responses for all concurrent requests have + # been received. + + # for streaming responses on bedrock, only a concurrency of 1 is supported on FMBench + concurrency_levels: + - 1 + # Environment variables to be passed to the container + # this is not a fixed list, you can add more parameters as applicable. 
+ env: + +report: + latency_budget: 2 + cost_per_10k_txn_budget: 100 + error_rate_budget: 0 + per_inference_request_file: per_inference_request_results.csv + all_metrics_file: all_metrics.csv + txn_count_for_showing_cost: 10000 + v_shift_w_single_instance: 0.025 + v_shift_w_gt_one_instance: 0.025 diff --git a/src/fmbench/configs/bedrock/config-bedrock-llama3-1.yml b/src/fmbench/configs/bedrock/config-bedrock-llama3-1.yml new file mode 100644 index 00000000..b18402a0 --- /dev/null +++ b/src/fmbench/configs/bedrock/config-bedrock-llama3-1.yml @@ -0,0 +1,262 @@ +general: + name: "fmbench-bedrock-llama3-1" + model_name: "Llama3-1-8b Model on Amazon Bedrock" + +# AWS and SageMaker settings +aws: + # AWS region, this parameter is templatized, no need to change + region: {region} + # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change + sagemaker_execution_role: {role_arn} + # S3 bucket to which metrics, plots and reports would be written to + bucket: {write_bucket} ## add the name of your desired bucket + +# directory paths in the write bucket, no need to change these +dir_paths: + data_prefix: data + prompts_prefix: prompts + all_prompts_file: all_prompts.csv + metrics_dir: metrics + models_dir: models + metadata_dir: metadata + +# S3 information for reading datasets, scripts and tokenizer +s3_read_data: + # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-region-account_id + read_bucket: {read_bucket} + scripts_prefix: scripts ## add your own scripts in case you are using anything that is not on jumpstart + + # S3 prefix in the read bucket where deployment and inference scripts should be placed + scripts_prefix: scripts + + # deployment and inference script files to be downloaded are placed in this list + # only needed if you are creating a new deployment script or inference script + # your HuggingFace token does need to be in this list and should be called "hf_token.txt" + script_files: + - hf_token.txt + + # configuration files (like this one) are placed in this prefix + configs_prefix: configs + + # list of configuration files to download, for now only pricing.yml needs to be downloaded + config_files: + - pricing.yml + + # S3 prefix for the dataset files + source_data_prefix: source_data + # list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench + source_data_files: + - 2wikimqa_e.jsonl + - 2wikimqa.jsonl + - hotpotqa_e.jsonl + - hotpotqa.jsonl + - narrativeqa.jsonl + - triviaqa_e.jsonl + - triviaqa.jsonl + + # S3 prefix for the tokenizer to be used with the models + # NOTE 1: the same tokenizer is used with all the models being tested through a config file + # NOTE 2: place your model specific tokenizers in a prefix named as _tokenizer + # so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer + tokenizer_prefix: llama3_1_tokenizer + + # S3 prefix for prompt templates + prompt_template_dir: prompt_template + + # prompt template to use, NOTE: same prompt template gets used for all models being tested through a config file + # the FMBench repo already contains a bunch of prompt templates so review those first before creating a new one + prompt_template_file: prompt_template_llama3.txt + +# steps to run, usually all of these would be +# set to yes so nothing needs to change here +# you could, however, bypass some steps for example +# set the 2_deploy_model.ipynb to no if you are re-running +# the same config file 
and the model is already deployed +run_steps: + 0_setup.ipynb: yes + 1_generate_data.ipynb: yes + 2_deploy_model.ipynb: no + 3_run_inference.ipynb: yes + 4_model_metric_analysis.ipynb: yes + 5_cleanup.ipynb: no + +datasets: + # dataset related configuration + prompt_template_keys: + - input + - context + + # if your dataset has multiple languages and it has a language + # field then you could filter it for a language. Similarly, + # you can filter your dataset to only keep prompts between + # a certain token length limit (the token length is determined + # using the tokenizer you provide in the tokenizer_prefix prefix in the + # read S3 bucket). Each of the array entries below create a payload file + # containing prompts matching the language and token length criteria. + filters: + - language: en + min_length_in_tokens: 1 + max_length_in_tokens: 500 + payload_file: payload_en_1-500.jsonl + - language: en + min_length_in_tokens: 500 + max_length_in_tokens: 1000 + payload_file: payload_en_500-1000.jsonl + - language: en + min_length_in_tokens: 1000 + max_length_in_tokens: 2000 + payload_file: payload_en_1000-2000.jsonl + - language: en + min_length_in_tokens: 2000 + max_length_in_tokens: 3000 + payload_file: payload_en_2000-3000.jsonl + - language: en + min_length_in_tokens: 3000 + max_length_in_tokens: 4000 + payload_file: payload_en_3000-4000.jsonl + - language: en + min_length_in_tokens: 3000 + max_length_in_tokens: 3840 + payload_file: payload_en_3000-3840.jsonl + +# While the tests would run on all the datasets +# configured in the experiment entries below but +# the price:performance analysis is only done for 1 +# dataset which is listed below as the dataset_of_interest +metrics: + dataset_of_interest: en_3000-3840 + +# all pricing information is in the pricing.yml file +# this file is provided in the repo. You can add entries +# to this file for new instance types and new Bedrock models +pricing: pricing.yml + +# inference parameters, these are added to the payload +# for each inference request. The list here is not static +# any parameter supported by the inference container can be +# added to the list. Put the sagemaker parameters in the sagemaker +# section, bedrock parameters in the bedrock section (not shown here). +# Use the section name (sagemaker in this example) in the inference_spec.parameter_set +# section under experiments. +inference_parameters: + bedrock: + temperature: 0.1 + max_tokens: 100 + top_p: 0.92 + caching: False + +# Configuration for experiments to be run. The experiments section is an array +# so more than one experiments can be added, these could belong to the same model +# but different instance types, or different models, or even different hosting +# options (such as one experiment is SageMaker and the other is Bedrock). +experiments: + - name: meta.llama3-1-70b-instruct-v1:0 + # model_id is interpreted in conjunction with the deployment_script, so if you + # use a JumpStart model id then set the deployment_script to jumpstart.py. + # if deploying directly from HuggingFace this would be a HuggingFace model id + # see the DJL serving deployment script in the code repo for reference. + model_id: meta.llama3-1-70b-instruct-v1:0 + model_version: + model_name: meta.llama3-1-70b-instruct-v1:0 + ep_name: meta.llama3-1-70b-instruct-v1:0 + instance_type: meta.llama3-1-70b-instruct-v1:0 + image_uri: + deploy: no + # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart + # scripts for deploying using DJL DeepSpeed, tensorRT etc. 
You can also add your own. + # See repo for details + instance_count: + deployment_script: + # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker + # and Bedrock. You can also add your own. See repo for details + inference_script: bedrock_predictor.py + inference_spec: + split_input_and_parameters: no + # this should match one of the sections in the inference_parameters section above + parameter_set: bedrock + # to stream responses, set stream to true. Enter the start and stop token for the + # Time To First Token, Time To Last Token, and Time Per Output Token (TTFT, TTLT, TPOT) + # metrics to be calculated. The responses from bedrock stream is received in chunks, so mention + # the stop token only. + stream: False + start_token: + stop_token: "<|eot_id|>" + # runs are done for each combination of payload file and concurrency level + payload_files: + - payload_en_1-500.jsonl + - payload_en_500-1000.jsonl + - payload_en_1000-2000.jsonl + - payload_en_2000-3000.jsonl + - payload_en_3000-3840.jsonl + # concurrency level refers to number of requests sent in parallel to an endpoint + # the next set of requests is sent once responses for all concurrent requests have + # been received. + + # for streaming responses on bedrock, only a concurrency of 1 is supported on FMBench + concurrency_levels: + - 1 + - 2 + - 4 + # Environment variables to be passed to the container + # this is not a fixed list, you can add more parameters as applicable. + env: + - name: meta.llama3-1-8b-instruct-v1:0 + # model_id is interpreted in conjunction with the deployment_script, so if you + # use a JumpStart model id then set the deployment_script to jumpstart.py. + # if deploying directly from HuggingFace this would be a HuggingFace model id + # see the DJL serving deployment script in the code repo for reference. + model_id: meta.llama3-1-8b-instruct-v1:0 + model_version: + model_name: meta.llama3-1-8b-instruct-v1:0 + ep_name: meta.llama3-1-8b-instruct-v1:0 + instance_type: meta.llama3-1-8b-instruct-v1:0 + image_uri: + deploy: no + # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart + # scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own. + # See repo for details + instance_count: + deployment_script: + # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker + # and Bedrock. You can also add your own. See repo for details + inference_script: bedrock_predictor.py + inference_spec: + split_input_and_parameters: no + # this should match one of the sections in the inference_parameters section above + parameter_set: bedrock + # to stream responses, set stream to true. Enter the start and stop token for the + # Time To First Token, Time To Last Token, and Time Per Output Token (TTFT, TTLT, TPOT) + # metrics to be calculated. The responses from bedrock stream is received in chunks, so mention + # the stop token only. + stream: False + start_token: + stop_token: "<|eot_id|>" + # runs are done for each combination of payload file and concurrency level + payload_files: + - payload_en_1-500.jsonl + - payload_en_500-1000.jsonl + - payload_en_1000-2000.jsonl + - payload_en_2000-3000.jsonl + - payload_en_3000-3840.jsonl + # concurrency level refers to number of requests sent in parallel to an endpoint + # the next set of requests is sent once responses for all concurrent requests have + # been received. 
+ + # for streaming responses on bedrock, only a concurrency of 1 is supported on FMBench + concurrency_levels: + - 1 + - 2 + - 4 + # Environment variables to be passed to the container + # this is not a fixed list, you can add more parameters as applicable. + env: + +report: + latency_budget: 2 + cost_per_10k_txn_budget: 100 + error_rate_budget: 0 + per_inference_request_file: per_inference_request_results.csv + all_metrics_file: all_metrics.csv + txn_count_for_showing_cost: 10000 + v_shift_w_single_instance: 0.025 + v_shift_w_gt_one_instance: 0.025 diff --git a/src/fmbench/configs/pricing.yml b/src/fmbench/configs/pricing.yml index e1fa90ca..520c58c8 100644 --- a/src/fmbench/configs/pricing.yml +++ b/src/fmbench/configs/pricing.yml @@ -58,11 +58,11 @@ pricing: output-per-1k-tokens: 0.01500 # Titan amazon.titan-text-lite-v1: - input-per-1k-tokens: 0.0003 - output-per-1k-tokens: 0.0004 + input-per-1k-tokens: 0.00015 + output-per-1k-tokens: 0.0002 amazon.titan-text-express-v1: - input-per-1k-tokens: 0.0008 - output-per-1k-tokens: 0.0016 + input-per-1k-tokens: 0.0002 + output-per-1k-tokens: 0.0006 # Mistral & Mixtral mistral.mistral-7b-instruct-v0:2: input-per-1k-tokens: 0.00015 @@ -71,8 +71,17 @@ pricing: input-per-1k-tokens: 0.00045 output-per-1k-tokens: 0.0007 ## Llama + meta.llama3-1-8b-instruct-v1:0: + input-per-1k-tokens: 0.0003 + output-per-1k-tokens: 0.0006 + meta.llama3-1-70b-instruct-v1:0: + input-per-1k-tokens: 0.00265 + output-per-1k-tokens: 0.0035 + meta.llama3-1-405b-instruct-v1:0: + input-per-1k-tokens: 0.00532 + output-per-1k-tokens: 0.016 meta.llama3-8b-instruct-v1:0: - input-per-1k-tokens: 0.0004 + input-per-1k-tokens: 0.0003 output-per-1k-tokens: 0.0006 meta.llama3-70b-instruct-v1:0: input-per-1k-tokens: 0.00265
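The `filters` section in each of these configs buckets the LongBench prompts into payload files by token length, measured with the tokenizer staged under the `llama3_1_tokenizer` prefix. A simplified sketch of that bucketing is below; `AutoTokenizer` and the local `llama3_1_tokenizer` path are stand-ins for whatever tokenizer files are actually placed in the read bucket, and this is not the code from `1_generate_data.ipynb`.

```python
# Simplified sketch of the token-length bucketing implied by the filters section.
# The tokenizer path and filter list are illustrative, not FMBench internals.
import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("llama3_1_tokenizer")  # local tokenizer dir (assumed)

filters = [
    {"min": 1,    "max": 500,  "payload_file": "payload_en_1-500.jsonl"},
    {"min": 3000, "max": 3840, "payload_file": "payload_en_3000-3840.jsonl"},
]

def bucket(prompt: str) -> str | None:
    """Return the payload file a prompt belongs to, or None if no filter matches."""
    n_tokens = len(tokenizer.encode(prompt))
    for f in filters:
        if f["min"] <= n_tokens < f["max"]:
            return f["payload_file"]
    return None

with open("2wikimqa.jsonl") as fh:          # one of the source_data_files
    for line in fh:
        record = json.loads(line)
        # prompt_template_keys: input and context are combined into the prompt
        prompt = f'{record["input"]}\n{record["context"]}'
        print(bucket(prompt))
```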
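The streaming experiments (`stream: True` with `stop_token: "<|eot_id|>"`) are what drive the TTFT, TPOT and TTLT metrics. A minimal sketch of how such timings can be read off a Bedrock response stream with boto3 follows; it is illustrative only, not the `bedrock_predictor.py` implementation, and the request/response body shape (`prompt`, `max_gen_len`, `generation`) is assumed from the Bedrock native API for Meta Llama models.

```python
# Illustrative timing sketch for a Bedrock streaming call -- not FMBench code.
import json
import time
import boto3

bedrock = boto3.client("bedrock-runtime")   # region comes from the environment/config

body = json.dumps({
    "prompt": "What is Amazon Bedrock?",
    "max_gen_len": 100,     # corresponds to max_tokens in inference_parameters
    "temperature": 0.1,
    "top_p": 0.92,
})

start = time.perf_counter()
response = bedrock.invoke_model_with_response_stream(
    modelId="meta.llama3-1-70b-instruct-v1:0",
    body=body,
)

ttft = None
chunk_times = []
text = ""
for event in response["body"]:
    chunk = json.loads(event["chunk"]["bytes"])
    now = time.perf_counter()
    if ttft is None:
        ttft = now - start                  # Time To First Token
    chunk_times.append(now)
    text += chunk.get("generation", "")
    if "<|eot_id|>" in text:                # stop_token from the config
        break

ttlt = chunk_times[-1] - start              # Time To Last Token
# Time Per Output Token, approximated as the mean gap between chunks
# (a chunk may carry more than one token, so this is an estimate)
tpot = (chunk_times[-1] - chunk_times[0]) / max(len(chunk_times) - 1, 1)
print(f"TTFT={ttft:.3f}s TTLT={ttlt:.3f}s TPOT={tpot:.4f}s")
```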
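The `concurrency_levels` mechanics described in the comments (send N requests in parallel, wait for all of them, then send the next batch) can be pictured with a small thread-pool sketch; `invoke()` here is a hypothetical stand-in for a single Bedrock call, not an FMBench function.

```python
# Sketch of the batch-wise concurrency model described in the config comments.
from concurrent.futures import ThreadPoolExecutor

def invoke(payload: dict) -> dict:
    return {"prompt": payload, "completion": "..."}   # placeholder for one Bedrock call

def run_at_concurrency(payloads: list, concurrency: int) -> list:
    results = []
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        for i in range(0, len(payloads), concurrency):
            batch = payloads[i:i + concurrency]
            # the next batch is submitted only after every response in this batch is back
            results.extend(pool.map(invoke, batch))
    return results

# The streaming configs above run only at concurrency 1; the non-streaming
# config runs the same payload files at concurrency 1, 2 and 4.
```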
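The new pricing.yml entries feed the price:performance analysis and the `cost_per_10k_txn_budget` check in the report section. As a rough worked example, with hypothetical token counts chosen to mirror the `en_3000-3840` dataset of interest and `max_tokens: 100` in this config:

```python
# Back-of-the-envelope cost estimate from the new pricing.yml entries.
# The 3500 input / 100 output token counts are hypothetical.
input_per_1k = 0.00265    # meta.llama3-1-70b-instruct-v1:0, input
output_per_1k = 0.0035    # meta.llama3-1-70b-instruct-v1:0, output

input_tokens_per_txn = 3500
output_tokens_per_txn = 100
txn_count = 10_000        # txn_count_for_showing_cost

cost_per_txn = (input_tokens_per_txn / 1000) * input_per_1k \
             + (output_tokens_per_txn / 1000) * output_per_1k
total = cost_per_txn * txn_count
print(f"cost per txn: ${cost_per_txn:.5f}, per {txn_count} txns: ${total:.2f}")
# -> about $0.00963 per transaction, ~$96 per 10,000 transactions,
#    which sits under the cost_per_10k_txn_budget of 100 set in the report section.
```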