Merge pull request #156 from madhurprash/main
adding support for llama3.1 on Amazon Bedrock
aarora79 committed Aug 6, 2024
2 parents afe351d + d76fbdb commit 04cc80e
Showing 4 changed files with 696 additions and 5 deletions.
210 changes: 210 additions & 0 deletions src/fmbench/configs/bedrock/config-bedrock-llama3-1-70b-streaming.yml
@@ -0,0 +1,210 @@
general:
name: "fmbench-bedrock-llama3-1"
model_name: "Llama3-1-70b Model on Amazon Bedrock"

# AWS and SageMaker settings
aws:
# AWS region, this parameter is templatized, no need to change
region: {region}
# SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
sagemaker_execution_role: {role_arn}
# S3 bucket to which metrics, plots and reports would be written to
bucket: {write_bucket} ## add the name of your desired bucket
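
The {region}, {role_arn}, and {write_bucket} values above (and {read_bucket} below) are placeholders that FMBench fills in at run time. A minimal sketch of how such substitution could work; the sample values and the str.format mechanism are assumptions for illustration, not FMBench's actual code:

import yaml

# Hypothetical values; FMBench derives the real ones from the AWS environment.
params = {
    "region": "us-east-1",
    "role_arn": "arn:aws:iam::111122223333:role/fmbench-role",
    "write_bucket": "sagemaker-fmbench-write-us-east-1-111122223333",
    "read_bucket": "sagemaker-fmbench-read-us-east-1-111122223333",
}

with open("config-bedrock-llama3-1-70b-streaming.yml") as f:
    config = yaml.safe_load(f.read().format(**params))

print(config["aws"]["region"])  # -> us-east-1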

# directory paths in the write bucket, no need to change these
dir_paths:
data_prefix: data
prompts_prefix: prompts
all_prompts_file: all_prompts.csv
metrics_dir: metrics
models_dir: models
metadata_dir: metadata

# S3 information for reading datasets, scripts and tokenizer
s3_read_data:
# read bucket name, templatized; if left unchanged it will default to sagemaker-fmbench-read-region-account_id
read_bucket: {read_bucket}

# S3 prefix in the read bucket where deployment and inference scripts should be placed;
# add your own scripts here in case you are using anything that is not on JumpStart
scripts_prefix: scripts

# deployment and inference script files to be downloaded are placed in this list
# only needed if you are creating a new deployment script or inference script
# your HuggingFace token file must be in this list and should be named "hf_token.txt"
script_files:
- hf_token.txt

# configuration files (like this one) are placed in this prefix
configs_prefix: configs

# list of configuration files to download, for now only pricing.yml needs to be downloaded
config_files:
- pricing.yml

# S3 prefix for the dataset files
source_data_prefix: source_data
# list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench
source_data_files:
- 2wikimqa_e.jsonl
- 2wikimqa.jsonl
- hotpotqa_e.jsonl
- hotpotqa.jsonl
- narrativeqa.jsonl
- triviaqa_e.jsonl
- triviaqa.jsonl

# S3 prefix for the tokenizer to be used with the models
# NOTE 1: the same tokenizer is used with all the models being tested through a config file
# NOTE 2: place your model specific tokenizers in a prefix named as <model_name>_tokenizer
# so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer
tokenizer_prefix: llama3_1_tokenizer

# S3 prefix for prompt templates
prompt_template_dir: prompt_template

# prompt template to use; NOTE: the same prompt template gets used for all models being tested through a config file
# the FMBench repo already contains several prompt templates, so review those first before creating a new one
prompt_template_file: prompt_template_llama3.txt

# steps to run; usually all of these would be
# set to yes so nothing needs to change here.
# you could, however, bypass some steps: for example,
# set 2_deploy_model.ipynb to no if you are re-running
# the same config file and the model is already deployed
# (it is no here because Bedrock models require no deployment)
run_steps:
0_setup.ipynb: yes
1_generate_data.ipynb: yes
2_deploy_model.ipynb: no
3_run_inference.ipynb: yes
4_model_metric_analysis.ipynb: yes
5_cleanup.ipynb: no
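
Each entry in run_steps maps to a notebook in the FMBench pipeline, and a step marked no is skipped. A minimal sketch of how such gating could be driven from this config (illustrative only; FMBench's actual runner may differ, and the papermill mention is just one way to execute a notebook):

import yaml

with open("config-bedrock-llama3-1-70b-streaming.yml") as f:  # assumes placeholders already resolved
    config = yaml.safe_load(f)

for notebook, enabled in config["run_steps"].items():
    # PyYAML parses the yes/no values in this file as booleans
    if enabled:
        print(f"running {notebook}")   # e.g. papermill.execute_notebook(...)
    else:
        print(f"skipping {notebook}")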

datasets:
# dataset related configuration
prompt_template_keys:
- input
- context

# if your dataset has multiple languages and a language
# field, you can filter it by language. Similarly,
# you can filter your dataset to only keep prompts within
# a certain token-length range (the token length is determined
# using the tokenizer you provide in the tokenizer_prefix prefix in the
# read S3 bucket). Each of the array entries below creates a payload file
# containing prompts matching the language and token-length criteria
# (a sketch of applying one filter follows this list).
filters:
- language: en
min_length_in_tokens: 1
max_length_in_tokens: 500
payload_file: payload_en_1-500.jsonl
- language: en
min_length_in_tokens: 500
max_length_in_tokens: 1000
payload_file: payload_en_500-1000.jsonl
- language: en
min_length_in_tokens: 1000
max_length_in_tokens: 2000
payload_file: payload_en_1000-2000.jsonl
- language: en
min_length_in_tokens: 2000
max_length_in_tokens: 3000
payload_file: payload_en_2000-3000.jsonl
- language: en
min_length_in_tokens: 3000
max_length_in_tokens: 4000
payload_file: payload_en_3000-4000.jsonl
- language: en
min_length_in_tokens: 3000
max_length_in_tokens: 3840
payload_file: payload_en_3000-3840.jsonl
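
For instance, the last entry above keeps English prompts of 3000 to 3840 tokens in payload_en_3000-3840.jsonl. A sketch of applying one filter entry to a LongBench file; it assumes a HuggingFace-style tokenizer saved locally under llama3_1_tokenizer (per tokenizer_prefix above), and the prompt assembly is simplified relative to whatever FMBench actually does:

import json
from transformers import AutoTokenizer

# tokenizer files come from the tokenizer_prefix location in the read bucket
tokenizer = AutoTokenizer.from_pretrained("llama3_1_tokenizer")

flt = {"language": "en", "min_length_in_tokens": 3000,
       "max_length_in_tokens": 3840, "payload_file": "payload_en_3000-3840.jsonl"}

with open("2wikimqa.jsonl") as src, open(flt["payload_file"], "w") as dst:
    for line in src:
        rec = json.loads(line)
        # LongBench records carry a language field plus the input/context fields
        # named in prompt_template_keys; the real prompt is rendered from
        # prompt_template_llama3.txt, simplified here to a concatenation.
        prompt = f"{rec['input']}\n{rec['context']}"
        n_tokens = len(tokenizer.encode(prompt))
        if (rec.get("language") == flt["language"]
                and flt["min_length_in_tokens"] <= n_tokens <= flt["max_length_in_tokens"]):
            dst.write(line)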

# While the tests run on all the datasets
# configured in the experiment entries below,
# the price:performance analysis is done for only the one
# dataset listed below as the dataset_of_interest
metrics:
dataset_of_interest: en_3000-3840

# all pricing information is in the pricing.yml file
# this file is provided in the repo. You can add entries
# to this file for new instance types and new Bedrock models
pricing: pricing.yml

# inference parameters, these are added to the payload
# for each inference request. The list here is not static;
# any parameter supported by the model or inference container can be
# added. Put Bedrock parameters in the bedrock section below and
# SageMaker parameters in a sagemaker section (not shown here).
# Use the section name (bedrock in this example) in the inference_spec.parameter_set
# field under experiments.
inference_parameters:
bedrock:
temperature: 0.1
max_tokens: 100
top_p: 0.92
caching: False
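
These parameters ride along with every request that the inference script sends. A minimal sketch of one non-streaming Bedrock invocation with this parameter set; the body follows the Meta Llama request schema on Bedrock, and mapping the config's max_tokens to max_gen_len (with caching handled client-side) is an assumption about what bedrock_predictor.py does:

import json
import boto3

client = boto3.client("bedrock-runtime", region_name="us-east-1")

body = {
    "prompt": "What is Amazon Bedrock?",
    "temperature": 0.1,
    "top_p": 0.92,
    "max_gen_len": 100,  # assumed mapping of the config's max_tokens
}
resp = client.invoke_model(
    modelId="meta.llama3-1-70b-instruct-v1:0",
    body=json.dumps(body),
)
print(json.loads(resp["body"].read())["generation"])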

# Configuration for experiments to be run. The experiments section is an array,
# so more than one experiment can be added; these could belong to the same model
# but different instance types, or to different models, or even to different hosting
# options (such as one experiment on SageMaker and another on Bedrock).
experiments:
- name: meta.llama3-1-70b-instruct-v1:0
# model_id is interpreted in conjunction with the deployment_script, so if you
# use a JumpStart model id then set the deployment_script to jumpstart.py;
# if deploying directly from HuggingFace this would be a HuggingFace model id.
# See the DJL serving deployment script in the code repo for reference.
model_id: meta.llama3-1-70b-instruct-v1:0
model_version:
model_name: meta.llama3-1-70b-instruct-v1:0
ep_name: meta.llama3-1-70b-instruct-v1:0
instance_type: meta.llama3-1-70b-instruct-v1:0
image_uri:
deploy: no
# FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
# scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
# See repo for details
instance_count:
deployment_script:
# FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
# and Bedrock. You can also add your own. See repo for details
inference_script: bedrock_predictor.py
inference_spec:
split_input_and_parameters: no
# this should match one of the sections in the inference_parameters section above
parameter_set: bedrock
# to stream responses, set stream to True. Enter the start and stop tokens used to
# calculate the Time To First Token, Time To Last Token, and Time Per Output Token
# (TTFT, TTLT, TPOT) metrics. Responses from the Bedrock stream are received in chunks,
# so specify only the stop token (see the streaming sketch after this experiments section).
stream: True
start_token:
stop_token: "<|eot_id|>"
# runs are done for each combination of payload file and concurrency level
payload_files:
- payload_en_1-500.jsonl
- payload_en_500-1000.jsonl
- payload_en_1000-2000.jsonl
- payload_en_2000-3000.jsonl
- payload_en_3000-3840.jsonl
# the concurrency level refers to the number of requests sent in parallel to an endpoint;
# the next set of requests is sent once responses for all concurrent requests have
# been received.
# for streaming responses on Bedrock, FMBench supports only a concurrency of 1
concurrency_levels:
- 1
# Environment variables to be passed to the container
# this is not a fixed list, you can add more parameters as applicable.
env:
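
With stream set to True above, responses arrive chunk by chunk and the TTFT, TTLT, and TPOT metrics fall out of the chunk arrival times. A minimal sketch of that measurement using the Bedrock invoke_model_with_response_stream API (the exact bookkeeping and chunk parsing in bedrock_predictor.py may differ):

import json
import time
import boto3

client = boto3.client("bedrock-runtime", region_name="us-east-1")
body = {"prompt": "What is Amazon Bedrock?", "temperature": 0.1,
        "top_p": 0.92, "max_gen_len": 100}

start = time.perf_counter()
resp = client.invoke_model_with_response_stream(
    modelId="meta.llama3-1-70b-instruct-v1:0", body=json.dumps(body))

ttft, n_chunks = None, 0
for event in resp["body"]:
    chunk = json.loads(event["chunk"]["bytes"])  # Llama chunks carry "generation" text
    if ttft is None:
        ttft = time.perf_counter() - start       # Time To First Token
    n_chunks += 1

ttlt = time.perf_counter() - start               # Time To Last Token
tpot = (ttlt - ttft) / max(n_chunks - 1, 1)      # Time Per Output Token (approx.)
print(f"TTFT={ttft:.3f}s TTLT={ttlt:.3f}s TPOT={tpot:.4f}s")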

report:
latency_budget: 2
cost_per_10k_txn_budget: 100
error_rate_budget: 0
per_inference_request_file: per_inference_request_results.csv
all_metrics_file: all_metrics.csv
txn_count_for_showing_cost: 10000
v_shift_w_single_instance: 0.025
v_shift_w_gt_one_instance: 0.025
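
The budgets above (latency presumably in seconds, cost in dollars per 10,000 transactions) set pass/fail thresholds for the final report. A sketch of how a run could be scored against them; the CSV column names are assumptions about FMBench's output schema, not confirmed from its source:

import pandas as pd

budgets = {"latency": 2.0, "cost_per_10k_txn": 100.0, "error_rate": 0.0}

df = pd.read_csv("per_inference_request_results.csv")
# assumed columns: latency (seconds), cost ($ per request), error (0/1)
measured = {
    "latency": df["latency"].mean(),
    "cost_per_10k_txn": df["cost"].mean() * 10_000,  # txn_count_for_showing_cost
    "error_rate": df["error"].mean(),
}
for name, value in measured.items():
    verdict = "within" if value <= budgets[name] else "over"
    print(f"{name}: {value:.4f} ({verdict} budget)")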
