
Commit a566160

Authored and committed by maxtext authors
Merge pull request #578 from google:nina/gpu_xlml_llama_test
PiperOrigin-RevId: 623614558
Parents: 24d24b6 + fc216ff

23 files changed: +1065 -0 lines

end_to_end/gpu/a3/test_llama2_7b.sh

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
#!/bin/bash

# This file is both an integration test that runs once a day on an A3 and documentation for how to get started with Llama2-7b.

# The flow of this file is as follows:
# 1. Download the checkpoint from Meta (https://llama.meta.com/llama-downloads/) into your local directory. Convert this PyTorch checkpoint into Orbax checkpoint format for use in MaxText.
# 2. Run training of Llama2-7b.
# 3. Run decoding from the trained checkpoint.


set -ex
idx=$(date +%Y-%m-%d-%H-%M)

# Non-Googlers please remember to point `BASE_OUTPUT_DIRECTORY` to a GCS bucket that you own; this bucket will store all the files generated by MaxText during a run.
export BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs
export ASYNC_CHECKPOINTING=false

# We install the CPU build of torch because the checkpoint conversion script MaxText/llama_or_mistral_ckpt.py does not need a TPU/GPU.
pip install torch --index-url https://download.pytorch.org/whl/cpu

# We define a variable for the path to the Meta checkpoint. Non-Googlers please remember to point `META_CHECKPOINT_PATH` to the GCS bucket where you have your Meta checkpoint.
export META_CHECKPOINT_PATH=gs://maxtext-llama/llama2-7b/meta-ckpt

# In the following command, we copy Meta's checkpoint into the local directory `/tmp/`.
# You can use a different local directory than /tmp/; if you do so, please use the same local path for `base-model-path` when running `python3 MaxText/llama_or_mistral_ckpt.py`.
gcloud storage cp -r ${META_CHECKPOINT_PATH} /tmp/

# `CONVERTED_CHECKPOINT_PATH` is the path to the GCS bucket where we want to save our converted (Orbax) checkpoint. Non-Googlers please remember to point `CONVERTED_CHECKPOINT_PATH` to a GCS bucket that you own.
export CONVERTED_CHECKPOINT_PATH=gs://maxtext-llama/test/${idx}/decode-ckpt-maxtext-gpu

# Next, run the conversion script `MaxText/llama_or_mistral_ckpt.py` to convert Meta's PyTorch checkpoint in `base-model-path` and save the new converted (Orbax) checkpoint in `maxtext-model-path`.
python3 MaxText/llama_or_mistral_ckpt.py --base-model-path /tmp/meta-ckpt --model-size llama2-7b --maxtext-model-path ${CONVERTED_CHECKPOINT_PATH}

# We define `CONVERTED_CHECKPOINT` to refer to the checkpoint subdirectory inside `CONVERTED_CHECKPOINT_PATH`. This makes it easier to use this path in the `train.py` and `decode.py` commands.
export CONVERTED_CHECKPOINT=${CONVERTED_CHECKPOINT_PATH}/0/items

# Note that `CONVERTED_CHECKPOINT` is in a `scanned` format, which is great for training, but for efficient decoding performance we want the checkpoint in an `unscanned` format.
# We can do this by running `MaxText/generate_param_only_checkpoint.py` on `CONVERTED_CHECKPOINT` with `force_unroll=true`.
export DIRECT_PARAMETER_CHECKPOINT_RUN=direct_generate_param_only_checkpoint_${idx}
python3 MaxText/generate_param_only_checkpoint.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_DIRECTORY} load_parameters_path=${CONVERTED_CHECKPOINT} run_name=${DIRECT_PARAMETER_CHECKPOINT_RUN} model_name='llama2-7b' hardware=gpu async_checkpointing=${ASYNC_CHECKPOINTING}

export RUN_NAME="llama-2-1vm-$(date +%Y-%m-%d-%H-%M)"

# Set environment variables from any KEY=VALUE arguments passed to this script.
for ARGUMENT in "$@"; do
    IFS='=' read -r KEY VALUE <<< "$ARGUMENT"
    export "$KEY"="$VALUE"
done

export XLA_PYTHON_CLIENT_MEM_FRACTION=0.85
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_FUSED_ATTN=1
export NCCL_DEBUG=VERSION

export XLA_FLAGS="--xla_dump_to=$BASE_OUTPUT_DIRECTORY/$RUN_NAME/HLO_dumps/
--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions
--xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
--xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
--xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false
--xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false
--xla_disable_hlo_passes=rematerialization"

python MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME hardware=gpu steps=30 dcn_data_parallelism=1 ici_fsdp_parallelism=8 per_device_batch_size=4 max_target_length=4096 model_name=llama2-7b enable_checkpointing=true attention=cudnn_flash_te remat_policy=minimal_flash use_iota_embed=true scan_layers=false dataset_type=synthetic async_checkpointing=${ASYNC_CHECKPOINTING} base_output_directory=$BASE_OUTPUT_DIRECTORY enable_profiler=false

export XLA_PYTHON_CLIENT_MEM_FRACTION=0.65
export TF_FORCE_GPU_ALLOW_GROWTH=true

python3 MaxText/decode.py MaxText/configs/base.yml load_parameters_path=${BASE_OUTPUT_DIRECTORY}/${RUN_NAME}/checkpoints/0/items run_name=runner_decode_finetuned_${idx} base_output_directory=${BASE_OUTPUT_DIRECTORY} per_device_batch_size=1 model_name='llama2-7b' ici_autoregressive_parallelism=4 max_prefill_predict_length=4 max_target_length=16 prompt="I love to" attention=dot_product scan_layers=false hardware=gpu async_checkpointing=${ASYNC_CHECKPOINTING}
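
As an illustration (not a line from this script): because the `for ARGUMENT in "$@"` loop exports any KEY=VALUE arguments after the defaults are set, variables that are only read after the loop, such as `RUN_NAME`, can be overridden from the command line. The run name below is a hypothetical placeholder; variables used earlier in the script (for example `BASE_OUTPUT_DIRECTORY`) still need to be edited in the file itself.

# Hypothetical invocation: RUN_NAME is picked up by the KEY=VALUE loop before train.py and decode.py run.
bash end_to_end/gpu/a3/test_llama2_7b.sh RUN_NAME=my-llama2-7b-a3-run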

end_to_end/tpu/eval_assert.py

Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
"""
Copyright 2023 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

# pylint: skip-file
"""Reads and asserts over target values"""
from absl import app
from typing import Sequence
from math import isclose
from google.cloud import storage
import json


def compute_avg_metric(metrics_file, target, start_line=10):
  """Reads the metrics file and computes the average of the target value.
  If start_line is negative, the last |start_line| lines are used, i.e. averaging starts at len(lines) + start_line."""

  avg = 0
  i = 0
  with open(metrics_file, 'r', encoding='utf8') as file:
    lines = file.readlines()
    if start_line < 0:
      start_line = len(lines) + start_line
    for line in lines:
      # Skip the first start_line lines for burn-in.
      if i >= start_line:
        vals = json.loads(line)
        avg += vals[target]
      i += 1
    avg /= (i - start_line)

  return avg


def assert_metric_average(metrics_file, threshold, target):
  avg_value = compute_avg_metric(metrics_file, target)
  # Checks for acceptable performance by asserting that the average metric (e.g. TFLOPs)
  # is greater than the threshold.
  print(f'avg value of target {target} is {avg_value}')
  assert avg_value >= float(threshold)
  print('assert metric average passed.')

def test_final_loss(metrics_file, target_loss):
  target_loss = float(target_loss)
  with open(metrics_file, 'r', encoding='utf8') as metrics:
    use_last_n_data = 10
    avg_final_loss = compute_avg_metric(metrics_file, 'learning/loss', start_line=-1 * use_last_n_data)
    print(f"Mean of last {use_last_n_data} losses is {avg_final_loss}")
    print(f"Target loss is {target_loss}")
    assert avg_final_loss < target_loss
    print('Final loss test passed.')

def test_checkpointing(metrics_file, target, dataset_type):
  """Asserts over loss values from a loaded checkpoint"""
  metrics_file_saved = 'saved_' + metrics_file
  metrics_file_restored = 'restored_' + metrics_file

  with open(metrics_file_saved, 'r', encoding='utf8') as saved,\
       open(metrics_file_restored, 'r', encoding='utf8') as restored:
    saved_loss = json.loads(saved.readlines()[-1])[target]
    restored_loss = json.loads(restored.readlines()[0])[target]
    # Checks that the checkpoint restore was successful by comparing the loss of the last
    # step in the saved run to the loss of the first step in the restored run.
    print("saved loss: ", saved_loss)
    print("restored loss: ", restored_loss)
    if dataset_type == 'c4':
      assert isclose(saved_loss, restored_loss, rel_tol=0.1)
    elif dataset_type == 'c4-array_record':
      assert saved_loss == restored_loss
    else:
      raise ValueError(f"Unknown dataset_type {dataset_type}. dataset_type must be c4 or c4-array_record")
    print('checkpointing test passed.')

def test_determinism(metrics_file, target):
  """Asserts that two runs produced the same loss values"""
  run_1 = 'run_1_' + metrics_file
  run_2 = 'run_2_' + metrics_file

  with open(run_1, 'r', encoding='utf8') as run_1_file,\
       open(run_2, 'r', encoding='utf8') as run_2_file:
    run_1_loss = json.loads(run_1_file.readlines()[-1])[target]
    run_2_loss = json.loads(run_2_file.readlines()[-1])[target]
    # Check that the two runs have the same loss.
    print(f"Run 1 loss: {run_1_loss}", flush=True)
    print(f"Run 2 loss: {run_2_loss}", flush=True)
    assert run_1_loss == run_2_loss
    print('determinism test passed.')

def test_vocab_creation(target):
  bucket_name = target.split("/")[2]
  vocab_path = "/".join(target.split("/")[3:])
  storage_client = storage.Client()
  assert storage.Blob(bucket=storage_client.bucket(bucket_name), name=vocab_path).exists(storage_client)
  print('vocab creation test passed.')

def test_start_step(metrics_file, start_step_target):
  with open(metrics_file, 'r', encoding='utf8') as metrics:
    start_step = json.loads(metrics.readlines()[0])["step"]
  print(f"Start step is {start_step}, start step target is {start_step_target}")
  assert start_step == float(start_step_target)
  print("Start step test passed.")

def main(argv: Sequence[str]) -> None:

  _, test_scenario, *test_vars = argv

  if test_scenario == 'metrics_average':
    assert_metric_average(*test_vars)
  elif test_scenario == 'checkpoint_save_restore':
    test_checkpointing(*test_vars, dataset_type='c4')
  elif test_scenario == 'grain_checkpoint_save_restore':
    test_checkpointing(*test_vars, dataset_type='c4-array_record')
  elif test_scenario == 'determinism':
    test_determinism(*test_vars)
  elif test_scenario == 'vocab_creation':
    test_vocab_creation(*test_vars)
  elif test_scenario == 'final_loss':
    test_final_loss(*test_vars)
  elif test_scenario == 'test_start_step':
    test_start_step(*test_vars)
  else:
    raise ValueError(f"Unrecognized test_scenario {test_scenario}")


if __name__ == "__main__":
  app.run(main)
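
For reference, eval_assert.py dispatches on its first positional argument (`test_scenario`) and forwards the remaining arguments to the matching test function, which reads a metrics file containing one JSON object per line (with keys such as 'learning/loss' and 'step'). The invocations below are a hedged sketch: the file name `metrics.txt`, the numeric thresholds, and the `perf/per_device_tflops_per_sec` key are hypothetical placeholders rather than values taken from this commit.

# Hypothetical invocations; file names and numeric targets are placeholders.
# metrics_average -> assert_metric_average(metrics_file, threshold, target): average of target must be >= threshold.
python3 end_to_end/tpu/eval_assert.py metrics_average metrics.txt 100 perf/per_device_tflops_per_sec
# final_loss -> test_final_loss(metrics_file, target_loss): mean of the last 10 'learning/loss' values must be below 2.5.
python3 end_to_end/tpu/eval_assert.py final_loss metrics.txt 2.5
# checkpoint_save_restore -> test_checkpointing(metrics_file, target): expects saved_metrics.txt and restored_metrics.txt.
python3 end_to_end/tpu/eval_assert.py checkpoint_save_restore metrics.txt learning/loss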

end_to_end/tpu/gemma/2b/test_gemma.sh

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
#!/bin/bash

# This file is both an integration test that runs once a day on a v4-8 and documentation for how to get started with Gemma-2b.

# The flow of this file is as follows:
# 1. Convert the checkpoint downloaded from Kaggle to make it compatible with MaxText.
# 2. Run decoding and finetuning of Gemma 2B with the converted checkpoint. Also, run pretraining of Gemma 2B.
# 3. Convert the scanned checkpoint from step 1 into unscanned checkpoint format and run more efficient decoding.
# 4. Run decoding from the finetuned checkpoint from step 2.
# 5. Run Ahead-of-Time Compilation for running Gemma 2B on v5e-256.


set -ex
idx=$(date +%Y-%m-%d-%H-%M)
export MODEL_VARIATION='2b'

# After downloading checkpoints, copy them to the GCS bucket at $CHKPT_BUCKET.
# Non-Googlers please remember to use separate GCS paths for uploading model weights from Kaggle ($CHKPT_BUCKET) and MaxText-compatible weights ($MODEL_BUCKET).
# Non-Googlers please remember to point these variables to GCS buckets that you own; this script uses internal buckets for testing.
export CHKPT_BUCKET=gs://maxtext-gemma/flax
export MODEL_BUCKET=gs://maxtext-gemma
python MaxText/convert_gemma_chkpt.py --base_model_path ${CHKPT_BUCKET}/${MODEL_VARIATION} --maxtext_model_path ${MODEL_BUCKET}/${MODEL_VARIATION}/${idx} --model_size ${MODEL_VARIATION}

# Non-Googlers please remember to point `DATASET_PATH` to the GCS bucket where you have your training data.
export DATASET_PATH=gs://maxtext-dataset
# Non-Googlers please remember to point `BASE_OUTPUT_DIRECTORY` to a GCS bucket that you own; this bucket will store all the files generated by MaxText during a run.
export BASE_OUTPUT_DIRECTORY=gs://runner-maxtext-logs
# We define `CONVERTED_CHECKPOINT` to refer to the checkpoint subdirectory. This makes it easier to use this path in the `train.py` and `decode.py` commands.
export CONVERTED_CHECKPOINT=${MODEL_BUCKET}/${MODEL_VARIATION}/${idx}/0/items
export RUN_NAME=unscanned_chkpt_${idx}
# Note that `CONVERTED_CHECKPOINT` is in a `scanned` format, which is great for training, but for efficient decoding performance we want the checkpoint in an `unscanned` format.
# We can do this by running `MaxText/generate_param_only_checkpoint.py` on `CONVERTED_CHECKPOINT` with `force_unroll=true`.
python MaxText/generate_param_only_checkpoint.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_DIRECTORY} load_parameters_path=${CONVERTED_CHECKPOINT} run_name=${RUN_NAME} model_name='gemma-2b' force_unroll=true

export UNSCANNED_CKPT_PATH=${BASE_OUTPUT_DIRECTORY}/${RUN_NAME}/checkpoints/0/items

# We run decoding on `UNSCANNED_CKPT_PATH` for efficient decoding on the unscanned version of the checkpoint. Note that this checkpoint only has parameters and no optimizer state.
# So, we use it by specifying `load_parameters_path=${UNSCANNED_CKPT_PATH}`.
# We compare our decoded results against golden outputs using `autoregressive_decode_assert`.
python MaxText/decode.py MaxText/configs/base.yml tokenizer_path=assets/tokenizer.gemma load_parameters_path=${UNSCANNED_CKPT_PATH} per_device_batch_size=1 run_name=runner_$(date +%Y-%m-%d-%H-%M) max_prefill_predict_length=8 max_target_length=16 dataset_type=synthetic steps=10 async_checkpointing=false scan_layers=false model_name=gemma-2b attention=dot_product prompt="I love to" autoregressive_decode_assert=" travel and I love to write about it"

# We can also run decoding (albeit in a somewhat unoptimized way) by using the scanned converted checkpoint located at `CONVERTED_CHECKPOINT`. Note again that this checkpoint only has parameters and no optimizer state. So, we use it by specifying `load_parameters_path=${CONVERTED_CHECKPOINT}`.
python MaxText/decode.py MaxText/configs/base.yml tokenizer_path=assets/tokenizer.gemma load_parameters_path=${CONVERTED_CHECKPOINT} per_device_batch_size=1 run_name=runner_$(date +%Y-%m-%d-%H-%M) max_prefill_predict_length=8 max_target_length=16 dataset_type=synthetic steps=10 async_checkpointing=false model_name=gemma-2b attention=dot_product prompt="I love to" autoregressive_decode_assert=" cook and bake. I love to eat"

# Alternatively, we can skip ahead to finetuning by using the scanned converted checkpoint located at `CONVERTED_CHECKPOINT`. Again, we use it by specifying `load_parameters_path=${CONVERTED_CHECKPOINT}`. Note that the scanned checkpoint helps with efficient finetuning.
export FINETUNE_RUN_NAME=runner_finetune_${idx}
python MaxText/train.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_DIRECTORY} dataset_path=${DATASET_PATH} tokenizer_path=assets/tokenizer.gemma load_parameters_path=${CONVERTED_CHECKPOINT} per_device_batch_size=1 run_name=${FINETUNE_RUN_NAME} max_target_length=8192 steps=10 async_checkpointing=false model_name=gemma-2b checkpoint_period=5

# We also run pretraining; this is similar to the finetuning command except we don't pass any checkpoint directory to load parameters from.
python MaxText/train.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_DIRECTORY} dataset_path=${DATASET_PATH} tokenizer_path=assets/tokenizer.gemma per_device_batch_size=1 run_name=runner_pretrain_${idx} max_target_length=8192 steps=5 enable_checkpointing=false model_name=gemma-2b

# Note that the finetune run checkpoint contains the `full state`, which has both parameters and optimizer state. For decoding, we only need the parameters.
# So, we can use `MaxText/generate_param_only_checkpoint.py` to convert the full-state checkpoint into a parameter-only checkpoint for more efficient memory use. Note that the path provided to the flag `load_full_state_path` is the path to the checkpoint subdirectory inside the `BASE_OUTPUT_DIRECTORY` from our previous finetuning run.
# `force_unroll=true` converts the output parameter-only checkpoint into an unscanned format for efficient decoding.
export PARAM_RUN_NAME=param_chkpt_${idx}
python MaxText/generate_param_only_checkpoint.py MaxText/configs/base.yml base_output_directory=${BASE_OUTPUT_DIRECTORY} load_full_state_path=${BASE_OUTPUT_DIRECTORY}/${FINETUNE_RUN_NAME}/checkpoints/5/items run_name=${PARAM_RUN_NAME} model_name='gemma-2b' force_unroll=true

# Now, run decoding on the checkpoint generated from our finetune run.
python MaxText/decode.py MaxText/configs/base.yml tokenizer_path=assets/tokenizer.gemma load_parameters_path=${BASE_OUTPUT_DIRECTORY}/${PARAM_RUN_NAME}/checkpoints/0/items per_device_batch_size=1 run_name=runner_$(date +%Y-%m-%d-%H-%M) max_prefill_predict_length=8 max_target_length=16 dataset_type=synthetic steps=10 async_checkpointing=false scan_layers=false model_name=gemma-2b attention=dot_product prompt="I love to"

# We recommend training/finetuning Gemma on v5e-256 using the following sharding strategy to achieve optimal performance.
# The command below does Ahead-of-Time cross-compilation (https://github.com/google/maxtext?tab=readme-ov-file#ahead-of-time-compilation-aot) for our recommended v5e-256 configuration for Gemma 2B.
# To actually run it on a real v5e-256, simply replace train_compile.py with train.py and remove the compile_topology args.
python MaxText/train_compile.py MaxText/configs/base.yml model_name=gemma-2b ici_fsdp_transpose_parallelism=16 per_device_batch_size=2 compile_topology=v5e-256 compile_topology_num_slices=1
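
Following the comment above, a hedged sketch of what the corresponding run on a real v5e-256 might look like: train_compile.py is swapped for train.py and the compile_topology arguments are dropped. The run_name, steps, dataset, tokenizer, and output flags added here are illustrative assumptions, not part of this script.

# Hypothetical sketch of the actual v5e-256 training run described in the comment above.
python MaxText/train.py MaxText/configs/base.yml model_name=gemma-2b ici_fsdp_transpose_parallelism=16 per_device_batch_size=2 base_output_directory=${BASE_OUTPUT_DIRECTORY} dataset_path=${DATASET_PATH} tokenizer_path=assets/tokenizer.gemma run_name=runner_gemma_2b_v5e_256_${idx} steps=10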
