Merge pull request #156 from madhurprash/main
adding support for llama3.1 on Amazon Bedrock
aarora79 committed Aug 6, 2024
2 parents afe351d + d76fbdb commit 04cc80e
Showing 4 changed files with 696 additions and 5 deletions.
210 changes: 210 additions & 0 deletions src/fmbench/configs/bedrock/config-bedrock-llama3-1-70b-streaming.yml
@@ -0,0 +1,210 @@
general:
name: "fmbench-bedrock-llama3-1"
model_name: "Llama3-1-70b Model on Amazon Bedrock"

# AWS and SageMaker settings
aws:
# AWS region, this parameter is templatized, no need to change
region: {region}
# SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
sagemaker_execution_role: {role_arn}
# S3 bucket to which metrics, plots and reports would be written to
bucket: {write_bucket} ## add the name of your desired bucket
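
The {region}, {role_arn}, and {write_bucket} values above (and {read_bucket} below) are placeholders that FMBench fills in at run time. A minimal sketch of how such substitution could work; the sample values and the str.format mechanism are assumptions for illustration, not FMBench's actual code:

import yaml

# Hypothetical values; FMBench derives the real ones from the AWS environment.
params = {
    "region": "us-east-1",
    "role_arn": "arn:aws:iam::111122223333:role/fmbench-role",
    "write_bucket": "sagemaker-fmbench-write-us-east-1-111122223333",
    "read_bucket": "sagemaker-fmbench-read-us-east-1-111122223333",
}

with open("config-bedrock-llama3-1-70b-streaming.yml") as f:
    config = yaml.safe_load(f.read().format(**params))

print(config["aws"]["region"])  # -> us-east-1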

# directory paths in the write bucket, no need to change these
dir_paths:
data_prefix: data
prompts_prefix: prompts
all_prompts_file: all_prompts.csv
metrics_dir: metrics
models_dir: models
metadata_dir: metadata

# S3 information for reading datasets, scripts and tokenizer
s3_read_data:
# read bucket name, templatized; if left unchanged it will default to sagemaker-fmbench-read-region-account_id
read_bucket: {read_bucket}

# S3 prefix in the read bucket where deployment and inference scripts should be placed;
# add your own scripts here in case you are using anything that is not on JumpStart
scripts_prefix: scripts

# deployment and inference script files to be downloaded are placed in this list
# only needed if you are creating a new deployment script or inference script
# your HuggingFace token file must be in this list and should be named "hf_token.txt"
script_files:
- hf_token.txt

# configuration files (like this one) are placed in this prefix
configs_prefix: configs

# list of configuration files to download, for now only pricing.yml needs to be downloaded
config_files:
- pricing.yml

# S3 prefix for the dataset files
source_data_prefix: source_data
# list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench
source_data_files:
- 2wikimqa_e.jsonl
- 2wikimqa.jsonl
- hotpotqa_e.jsonl
- hotpotqa.jsonl
- narrativeqa.jsonl
- triviaqa_e.jsonl
- triviaqa.jsonl

# S3 prefix for the tokenizer to be used with the models
# NOTE 1: the same tokenizer is used with all the models being tested through a config file
# NOTE 2: place your model specific tokenizers in a prefix named as <model_name>_tokenizer
# so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer
tokenizer_prefix: llama3_1_tokenizer

# S3 prefix for prompt templates
prompt_template_dir: prompt_template

# prompt template to use; NOTE: the same prompt template gets used for all models being tested through a config file
# the FMBench repo already contains several prompt templates, so review those first before creating a new one
prompt_template_file: prompt_template_llama3.txt

# steps to run; usually all of these would be
# set to yes so nothing needs to change here.
# you could, however, bypass some steps: for example,
# set 2_deploy_model.ipynb to no if you are re-running
# the same config file and the model is already deployed
# (it is no here because Bedrock models require no deployment)
run_steps:
0_setup.ipynb: yes
1_generate_data.ipynb: yes
2_deploy_model.ipynb: no
3_run_inference.ipynb: yes
4_model_metric_analysis.ipynb: yes
5_cleanup.ipynb: no
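
Each entry in run_steps maps to a notebook in the FMBench pipeline, and a step marked no is skipped. A minimal sketch of how such gating could be driven from this config (illustrative only; FMBench's actual runner may differ, and the papermill mention is just one way to execute a notebook):

import yaml

with open("config-bedrock-llama3-1-70b-streaming.yml") as f:  # assumes placeholders already resolved
    config = yaml.safe_load(f)

for notebook, enabled in config["run_steps"].items():
    # PyYAML parses the yes/no values in this file as booleans
    if enabled:
        print(f"running {notebook}")   # e.g. papermill.execute_notebook(...)
    else:
        print(f"skipping {notebook}")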

datasets:
# dataset related configuration
prompt_template_keys:
- input
- context

# if your dataset has multiple languages and a language
# field, you can filter it by language. Similarly,
# you can filter your dataset to only keep prompts within
# a certain token-length range (the token length is determined
# using the tokenizer you provide in the tokenizer_prefix prefix in the
# read S3 bucket). Each of the array entries below creates a payload file
# containing prompts matching the language and token-length criteria
# (a sketch of applying one filter follows this list).
filters:
- language: en
min_length_in_tokens: 1
max_length_in_tokens: 500
payload_file: payload_en_1-500.jsonl
- language: en
min_length_in_tokens: 500
max_length_in_tokens: 1000
payload_file: payload_en_500-1000.jsonl
- language: en
min_length_in_tokens: 1000
max_length_in_tokens: 2000
payload_file: payload_en_1000-2000.jsonl
- language: en
min_length_in_tokens: 2000
max_length_in_tokens: 3000
payload_file: payload_en_2000-3000.jsonl
- language: en
min_length_in_tokens: 3000
max_length_in_tokens: 4000
payload_file: payload_en_3000-4000.jsonl
- language: en
min_length_in_tokens: 3000
max_length_in_tokens: 3840
payload_file: payload_en_3000-3840.jsonl
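
For instance, the last entry above keeps English prompts of 3000 to 3840 tokens in payload_en_3000-3840.jsonl. A sketch of applying one filter entry to a LongBench file; it assumes a HuggingFace-style tokenizer saved locally under llama3_1_tokenizer (per tokenizer_prefix above), and the prompt assembly is simplified relative to whatever FMBench actually does:

import json
from transformers import AutoTokenizer

# tokenizer files come from the tokenizer_prefix location in the read bucket
tokenizer = AutoTokenizer.from_pretrained("llama3_1_tokenizer")

flt = {"language": "en", "min_length_in_tokens": 3000,
       "max_length_in_tokens": 3840, "payload_file": "payload_en_3000-3840.jsonl"}

with open("2wikimqa.jsonl") as src, open(flt["payload_file"], "w") as dst:
    for line in src:
        rec = json.loads(line)
        # LongBench records carry a language field plus the input/context fields
        # named in prompt_template_keys; the real prompt is rendered from
        # prompt_template_llama3.txt, simplified here to a concatenation.
        prompt = f"{rec['input']}\n{rec['context']}"
        n_tokens = len(tokenizer.encode(prompt))
        if (rec.get("language") == flt["language"]
                and flt["min_length_in_tokens"] <= n_tokens <= flt["max_length_in_tokens"]):
            dst.write(line)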

# While the tests run on all the datasets
# configured in the experiment entries below,
# the price:performance analysis is done for only the one
# dataset listed below as the dataset_of_interest
metrics:
dataset_of_interest: en_3000-3840

# all pricing information is in the pricing.yml file
# this file is provided in the repo. You can add entries
# to this file for new instance types and new Bedrock models
pricing: pricing.yml

# inference parameters, these are added to the payload
# for each inference request. The list here is not static;
# any parameter supported by the model or inference container can be
# added. Put Bedrock parameters in the bedrock section below and
# SageMaker parameters in a sagemaker section (not shown here).
# Use the section name (bedrock in this example) in the inference_spec.parameter_set
# field under experiments.
inference_parameters:
bedrock:
temperature: 0.1
max_tokens: 100
top_p: 0.92
caching: False
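
These parameters ride along with every request that the inference script sends. A minimal sketch of one non-streaming Bedrock invocation with this parameter set; the body follows the Meta Llama request schema on Bedrock, and mapping the config's max_tokens to max_gen_len (with caching handled client-side) is an assumption about what bedrock_predictor.py does:

import json
import boto3

client = boto3.client("bedrock-runtime", region_name="us-east-1")

body = {
    "prompt": "What is Amazon Bedrock?",
    "temperature": 0.1,
    "top_p": 0.92,
    "max_gen_len": 100,  # assumed mapping of the config's max_tokens
}
resp = client.invoke_model(
    modelId="meta.llama3-1-70b-instruct-v1:0",
    body=json.dumps(body),
)
print(json.loads(resp["body"].read())["generation"])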

# Configuration for experiments to be run. The experiments section is an array,
# so more than one experiment can be added; these could belong to the same model
# but different instance types, or to different models, or even to different hosting
# options (such as one experiment on SageMaker and another on Bedrock).
experiments:
- name: meta.llama3-1-70b-instruct-v1:0
# model_id is interpreted in conjunction with the deployment_script, so if you
# use a JumpStart model id then set the deployment_script to jumpstart.py;
# if deploying directly from HuggingFace this would be a HuggingFace model id.
# See the DJL serving deployment script in the code repo for reference.
model_id: meta.llama3-1-70b-instruct-v1:0
model_version:
model_name: meta.llama3-1-70b-instruct-v1:0
ep_name: meta.llama3-1-70b-instruct-v1:0
instance_type: meta.llama3-1-70b-instruct-v1:0
image_uri:
deploy: no
# FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
# scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
# See repo for details
instance_count:
deployment_script:
# FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
# and Bedrock. You can also add your own. See repo for details
inference_script: bedrock_predictor.py
inference_spec:
split_input_and_parameters: no
# this should match one of the sections in the inference_parameters section above
parameter_set: bedrock
# to stream responses, set stream to True. Enter the start and stop tokens used to
# calculate the Time To First Token, Time To Last Token, and Time Per Output Token
# (TTFT, TTLT, TPOT) metrics. Responses from the Bedrock stream are received in chunks,
# so specify only the stop token (see the streaming sketch after this experiments section).
stream: True
start_token:
stop_token: "<|eot_id|>"
# runs are done for each combination of payload file and concurrency level
payload_files:
- payload_en_1-500.jsonl
- payload_en_500-1000.jsonl
- payload_en_1000-2000.jsonl
- payload_en_2000-3000.jsonl
- payload_en_3000-3840.jsonl
# the concurrency level refers to the number of requests sent in parallel to an endpoint;
# the next set of requests is sent once responses for all concurrent requests have
# been received.
# for streaming responses on Bedrock, FMBench supports only a concurrency of 1
concurrency_levels:
- 1
# Environment variables to be passed to the container
# this is not a fixed list, you can add more parameters as applicable.
env:
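
With stream set to True above, responses arrive chunk by chunk and the TTFT, TTLT, and TPOT metrics fall out of the chunk arrival times. A minimal sketch of that measurement using the Bedrock invoke_model_with_response_stream API (the exact bookkeeping and chunk parsing in bedrock_predictor.py may differ):

import json
import time
import boto3

client = boto3.client("bedrock-runtime", region_name="us-east-1")
body = {"prompt": "What is Amazon Bedrock?", "temperature": 0.1,
        "top_p": 0.92, "max_gen_len": 100}

start = time.perf_counter()
resp = client.invoke_model_with_response_stream(
    modelId="meta.llama3-1-70b-instruct-v1:0", body=json.dumps(body))

ttft, n_chunks = None, 0
for event in resp["body"]:
    chunk = json.loads(event["chunk"]["bytes"])  # Llama chunks carry "generation" text
    if ttft is None:
        ttft = time.perf_counter() - start       # Time To First Token
    n_chunks += 1

ttlt = time.perf_counter() - start               # Time To Last Token
tpot = (ttlt - ttft) / max(n_chunks - 1, 1)      # Time Per Output Token (approx.)
print(f"TTFT={ttft:.3f}s TTLT={ttlt:.3f}s TPOT={tpot:.4f}s")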

report:
latency_budget: 2
cost_per_10k_txn_budget: 100
error_rate_budget: 0
per_inference_request_file: per_inference_request_results.csv
all_metrics_file: all_metrics.csv
txn_count_for_showing_cost: 10000
v_shift_w_single_instance: 0.025
v_shift_w_gt_one_instance: 0.025
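
The budgets above (latency presumably in seconds, cost in dollars per 10,000 transactions) set pass/fail thresholds for the final report. A sketch of how a run could be scored against them; the CSV column names are assumptions about FMBench's output schema, not confirmed from its source:

import pandas as pd

budgets = {"latency": 2.0, "cost_per_10k_txn": 100.0, "error_rate": 0.0}

df = pd.read_csv("per_inference_request_results.csv")
# assumed columns: latency (seconds), cost ($ per request), error (0/1)
measured = {
    "latency": df["latency"].mean(),
    "cost_per_10k_txn": df["cost"].mean() * 10_000,  # txn_count_for_showing_cost
    "error_rate": df["error"].mean(),
}
for name, value in measured.items():
    verdict = "within" if value <= budgets[name] else "over"
    print(f"{name}: {value:.4f} ({verdict} budget)")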
