Merge pull request #156 from madhurprash/main
adding support for llama3.1 on Amazon Bedrock
Showing 4 changed files with 696 additions and 5 deletions.

src/fmbench/configs/bedrock/config-bedrock-llama3-1-70b-streaming.yml (210 additions, 0 deletions)

general:
  name: "fmbench-bedrock-llama3-1"
  model_name: "Llama3-1-70b Model on Amazon Bedrock"

# AWS and SageMaker settings
aws:
  # AWS region, this parameter is templatized, no need to change
  region: {region}
  # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
  sagemaker_execution_role: {role_arn}
  # S3 bucket to which metrics, plots and reports are written
  bucket: {write_bucket} ## add the name of your desired bucket

# directory paths in the write bucket, no need to change these
dir_paths:
  data_prefix: data
  prompts_prefix: prompts
  all_prompts_file: all_prompts.csv
  metrics_dir: metrics
  models_dir: models
  metadata_dir: metadata

# S3 information for reading datasets, scripts and tokenizer
s3_read_data:
  # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-<region>-<account_id>
  read_bucket: {read_bucket}

  # S3 prefix in the read bucket where deployment and inference scripts should be placed;
  # add your own scripts here in case you are using anything that is not on JumpStart
  scripts_prefix: scripts

  # deployment and inference script files to be downloaded are placed in this list
  # only needed if you are creating a new deployment script or inference script
  # your Hugging Face token does need to be in this list and should be called "hf_token.txt"
  script_files:
  - hf_token.txt

  # configuration files (like this one) are placed in this prefix
  configs_prefix: configs

  # list of configuration files to download, for now only pricing.yml needs to be downloaded
  config_files:
  - pricing.yml

  # S3 prefix for the dataset files
  source_data_prefix: source_data
  # list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench
  source_data_files:
  - 2wikimqa_e.jsonl
  - 2wikimqa.jsonl
  - hotpotqa_e.jsonl
  - hotpotqa.jsonl
  - narrativeqa.jsonl
  - triviaqa_e.jsonl
  - triviaqa.jsonl

  # S3 prefix for the tokenizer to be used with the models
  # NOTE 1: the same tokenizer is used with all the models being tested through a config file
  # NOTE 2: place your model specific tokenizers in a prefix named <model_name>_tokenizer,
  # so the Mistral tokenizer goes in mistral_tokenizer and the Llama2 tokenizer goes in llama2_tokenizer
  tokenizer_prefix: llama3_1_tokenizer

  # S3 prefix for prompt templates
  prompt_template_dir: prompt_template

  # prompt template to use, NOTE: the same prompt template gets used for all models being tested through a config file
  # the FMBench repo already contains several prompt templates, so review those first before creating a new one
  prompt_template_file: prompt_template_llama3.txt

# steps to run, usually all of these would be
# set to yes so nothing needs to change here
# you could, however, bypass some steps for example
# set the 2_deploy_model.ipynb to no if you are re-running
# the same config file and the model is already deployed
run_steps:
  0_setup.ipynb: yes
  1_generate_data.ipynb: yes
  2_deploy_model.ipynb: no
  3_run_inference.ipynb: yes
  4_model_metric_analysis.ipynb: yes
  5_cleanup.ipynb: no

datasets:
  # dataset related configuration
  prompt_template_keys:
  - input
  - context

  # if your dataset has multiple languages and it has a language
  # field then you can filter it for a language. Similarly,
  # you can filter your dataset to only keep prompts within
  # a certain token length limit (the token length is determined
  # using the tokenizer you provide in the tokenizer_prefix prefix in the
  # read S3 bucket). Each of the array entries below creates a payload file
  # containing prompts matching the language and token length criteria.
  filters:
  - language: en
    min_length_in_tokens: 1
    max_length_in_tokens: 500
    payload_file: payload_en_1-500.jsonl
  - language: en
    min_length_in_tokens: 500
    max_length_in_tokens: 1000
    payload_file: payload_en_500-1000.jsonl
  - language: en
    min_length_in_tokens: 1000
    max_length_in_tokens: 2000
    payload_file: payload_en_1000-2000.jsonl
  - language: en
    min_length_in_tokens: 2000
    max_length_in_tokens: 3000
    payload_file: payload_en_2000-3000.jsonl
  - language: en
    min_length_in_tokens: 3000
    max_length_in_tokens: 4000
    payload_file: payload_en_3000-4000.jsonl
  - language: en
    min_length_in_tokens: 3000
    max_length_in_tokens: 3840
    payload_file: payload_en_3000-3840.jsonl

# while the tests run on all the datasets
# configured in the experiment entries below,
# the price:performance analysis is done only for the one
# dataset listed below as the dataset_of_interest
metrics:
  dataset_of_interest: en_3000-3840

# all pricing information is in the pricing.yml file
# this file is provided in the repo. You can add entries
# to this file for new instance types and new Bedrock models
pricing: pricing.yml

# inference parameters, these are added to the payload
# for each inference request. The list here is not static,
# any parameter supported by the inference container can be
# added to the list. Put SageMaker parameters in a sagemaker
# section and Bedrock parameters in the bedrock section (shown here).
# Use the section name (bedrock in this example) in the
# inference_spec.parameter_set setting under experiments.
inference_parameters:
  bedrock:
    temperature: 0.1
    max_tokens: 100
    top_p: 0.92
    caching: False

# Configuration for experiments to be run. The experiments section is an array,
# so more than one experiment can be added; these could belong to the same model
# but different instance types, or different models, or even different hosting
# options (such as one experiment on SageMaker and the other on Bedrock).
experiments:
- name: meta.llama3-1-70b-instruct-v1:0
  # model_id is interpreted in conjunction with the deployment_script, so if you
  # use a JumpStart model id then set the deployment_script to jumpstart.py.
  # if deploying directly from HuggingFace this would be a HuggingFace model id,
  # see the DJL Serving deployment script in the code repo for reference.
  model_id: meta.llama3-1-70b-instruct-v1:0
  model_version:
  model_name: meta.llama3-1-70b-instruct-v1:0
  ep_name: meta.llama3-1-70b-instruct-v1:0
  instance_type: meta.llama3-1-70b-instruct-v1:0
  image_uri:
  deploy: no
  # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
  # and scripts for deploying using DJL DeepSpeed, TensorRT etc. You can also add your own,
  # see the repo for details.
  instance_count:
  deployment_script:
  # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
  # and Bedrock. You can also add your own, see the repo for details.
  inference_script: bedrock_predictor.py
  inference_spec:
    split_input_and_parameters: no
    # this should match one of the sections in the inference_parameters section above
    parameter_set: bedrock
    # to stream responses, set stream to true. Enter the start and stop tokens for the
    # Time To First Token, Time To Last Token, and Time Per Output Token (TTFT, TTLT, TPOT)
    # metrics to be calculated. The response from a Bedrock stream arrives in chunks,
    # so specify the stop token only.
    stream: True
    start_token:
    stop_token: "<|eot_id|>"
  # runs are done for each combination of payload file and concurrency level
  payload_files:
  - payload_en_1-500.jsonl
  - payload_en_500-1000.jsonl
  - payload_en_1000-2000.jsonl
  - payload_en_2000-3000.jsonl
  - payload_en_3000-3840.jsonl
  # concurrency level refers to the number of requests sent in parallel to an endpoint,
  # the next set of requests is sent once responses for all concurrent requests have
  # been received

  # for streaming responses on Bedrock, only a concurrency of 1 is supported by FMBench
  concurrency_levels:
  - 1
  # environment variables to be passed to the container
  # this is not a fixed list, you can add more parameters as applicable
  env:

report:
  latency_budget: 2
  cost_per_10k_txn_budget: 100
  error_rate_budget: 0
  per_inference_request_file: per_inference_request_results.csv
  all_metrics_file: all_metrics.csv
  txn_count_for_showing_cost: 10000
  v_shift_w_single_instance: 0.025
  v_shift_w_gt_one_instance: 0.025
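
For reference, the prompt_template_keys in the datasets section (input and context) are the placeholders that get substituted into the prompt template. The actual prompt_template_llama3.txt ships with the FMBench repo; the Python sketch below is only a plausible illustration of its shape, using the Llama 3 instruct chat markers:

# illustrative template shape only -- the real file is prompt_template_llama3.txt in the repo
LLAMA3_TEMPLATE = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "Answer the question based only on the context below.\n\n"
    "Context: {context}\n\nQuestion: {input}"
    "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)

def render_prompt(record: dict) -> str:
    # record is one dataset row, e.g. a LongBench JSONL line with
    # "input" and "context" fields matching prompt_template_keys
    return LLAMA3_TEMPLATE.format(context=record["context"], input=record["input"])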
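
The filters entries bucket prompts by token count, measured with the tokenizer stored under tokenizer_prefix. Here is a minimal sketch of that bucketing, assuming the tokenizer files have been downloaded locally and that payload records use an "inputs" key (both are assumptions for illustration, not FMBench internals):

import json
from transformers import AutoTokenizer

# assumes the llama3_1_tokenizer files were downloaded from the read bucket
tokenizer = AutoTokenizer.from_pretrained("./llama3_1_tokenizer")

def within_bounds(prompt: str, min_tokens: int, max_tokens: int) -> bool:
    return min_tokens <= len(tokenizer.encode(prompt)) <= max_tokens

# one payload file per filter entry, e.g. the en_3000-3840 bucket
with open("2wikimqa.jsonl") as src, open("payload_en_3000-3840.jsonl", "w") as dst:
    for line in src:
        rec = json.loads(line)
        prompt = render_prompt(rec)  # from the template sketch above
        if within_bounds(prompt, 3000, 3840):
            dst.write(json.dumps({"inputs": prompt}) + "\n")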
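
The inference_parameters.bedrock section is attached to every request by the inference script. Meta Llama models on Bedrock natively take max_gen_len in the request body, so the mapping from this config's max_tokens is an assumption about what bedrock_predictor.py does; a boto3 sketch of a single non-streaming call:

import json
import boto3

bedrock = boto3.client("bedrock-runtime")

# values from inference_parameters.bedrock above; the max_tokens ->
# max_gen_len rename is an assumption, since the native Llama request
# body on Bedrock uses max_gen_len
PARAMS = {"temperature": 0.1, "top_p": 0.92, "max_gen_len": 100}

def invoke(prompt: str) -> str:
    body = json.dumps({"prompt": prompt, **PARAMS})
    resp = bedrock.invoke_model(modelId="meta.llama3-1-70b-instruct-v1:0", body=body)
    return json.loads(resp["body"].read())["generation"]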
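
With stream: True, FMBench computes TTFT, TPOT and TTLT from the chunked response, and the configured stop_token ("<|eot_id|>") is what marks the end of the generation once the chunks are reassembled. A hedged sketch of taking those timings off invoke_model_with_response_stream (the bookkeeping is illustrative, not FMBench's implementation; in particular, treating one chunk as roughly one token for TPOT is an approximation):

import json
import time
import boto3

bedrock = boto3.client("bedrock-runtime")

def stream_with_timings(prompt: str) -> dict:
    body = json.dumps({"prompt": prompt, "temperature": 0.1,
                       "top_p": 0.92, "max_gen_len": 100})
    start = time.perf_counter()
    resp = bedrock.invoke_model_with_response_stream(
        modelId="meta.llama3-1-70b-instruct-v1:0", body=body)
    ttft, pieces = None, []
    for event in resp["body"]:
        chunk = json.loads(event["chunk"]["bytes"])
        if ttft is None:
            ttft = time.perf_counter() - start  # time to first token
        pieces.append(chunk.get("generation", ""))
    ttlt = time.perf_counter() - start  # time to last token
    tpot = (ttlt - ttft) / max(len(pieces) - 1, 1)  # per-chunk approximation
    return {"ttft": ttft, "ttlt": ttlt, "tpot": tpot, "completion": "".join(pieces)}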
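
Finally, the concurrency_levels setting: a level of N means N requests in flight at once, with the next batch sent only after all N responses are back (this config keeps N at 1 because that is all FMBench supports for Bedrock streaming). A generic sketch of that batching pattern, not FMBench's actual loop:

from concurrent.futures import ThreadPoolExecutor
from typing import Callable

def run_at_concurrency(prompts: list[str], concurrency: int,
                       invoke: Callable[[str], str]) -> list[str]:
    results: list[str] = []
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        # send `concurrency` requests in parallel; the next batch starts
        # only after every response in the current batch has been received
        for i in range(0, len(prompts), concurrency):
            results.extend(pool.map(invoke, prompts[i:i + concurrency]))
    return results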