diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
index cd1cafe8e6799..ea8d2f5f806f8 100644
--- a/.github/workflows/amd-mi200.yml
+++ b/.github/workflows/amd-mi200.yml
@@ -32,7 +32,7 @@ jobs:
- name: Install pytorch
run: |
- pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm5.6
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
diff --git a/.github/workflows/cpu-torch-latest.yml b/.github/workflows/cpu-torch-latest.yml
index 213421590ad6e..bb2b002b1a174 100644
--- a/.github/workflows/cpu-torch-latest.yml
+++ b/.github/workflows/cpu-torch-latest.yml
@@ -19,7 +19,7 @@ concurrency:
jobs:
unit-tests:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
@@ -50,5 +50,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.3"
- HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.3"
+ HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.4"
+ HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.4"
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
index d2554b7c0038c..e33da160aaf3b 100644
--- a/.github/workflows/formatting.yml
+++ b/.github/workflows/formatting.yml
@@ -18,7 +18,7 @@ jobs:
# formatting and basic install on cpu-only machine
unit-tests:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml
index f81e690e835bb..4e9ceb32b6b16 100644
--- a/.github/workflows/hpu-gaudi2.yml
+++ b/.github/workflows/hpu-gaudi2.yml
@@ -39,13 +39,14 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
- image: vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
+ image: vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
env:
PT_HPU_LAZY_MODE: 0
+ TORCHINDUCTOR_COMPILE_THREADS: 1
TEST_LIST: |
test_accelerator.py
test_autotuning.py
@@ -67,7 +68,6 @@ jobs:
(test_flops_profiler.py and test_flops_profiler_in_inference)
test_get_optim_files.py
test_groups.py
- test_init_on_device.py
test_partition_balanced.py
(test_adamw.py and TestAdamConfigs)
test_coalesced_collectives.py
@@ -103,7 +103,7 @@ jobs:
- name: Check container state
run: |
ldd --version
- hl-smi
+ hl-smi -L
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -128,7 +128,7 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
+ export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
echo "TEST_LIST ${TEST_LIST}"
- echo "PT_HPU_LAZY_MODE ${PT_HPU_LAZY_MODE}"
pytest --verbose unit/ -k "${TEST_LIST}"
diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml
index 8a60823050aba..d394b7e24bd60 100644
--- a/.github/workflows/nv-mii.yml
+++ b/.github/workflows/nv-mii.yml
@@ -37,7 +37,7 @@ jobs:
- name: Install pytorch
run: |
- pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
+ pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -46,7 +46,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
- git checkout bdf36dc
+ git checkout v4.42.4
git rev-parse --short HEAD
pip install .
diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
index b1e8c042214fe..8658ff5d2348a 100644
--- a/.github/workflows/nv-nightly.yml
+++ b/.github/workflows/nv-nightly.yml
@@ -2,6 +2,9 @@ name: nv-nightly
on:
workflow_dispatch:
+ pull_request:
+ paths:
+ - '.github/workflows/nv-nightly.yml'
schedule:
- cron: "0 0 * * *"
@@ -25,7 +28,7 @@ jobs:
- name: Install pytorch
run: |
- pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu117
+ pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
@@ -34,7 +37,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
- # git checkout 1cc453d33
+ git checkout v4.42.4
git rev-parse --short HEAD
pip install .
@@ -55,7 +58,7 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.7"
+ pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.4" --cuda_ver="11.8"
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml
index 6afc11fddaab9..72ba8abbd95df 100644
--- a/.github/workflows/nv-pre-compile-ops.yml
+++ b/.github/workflows/nv-pre-compile-ops.yml
@@ -21,7 +21,7 @@ concurrency:
jobs:
unit-tests:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container:
image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
@@ -36,7 +36,7 @@ jobs:
#python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Compile DeepSpeed Ops
run: |
- DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
+ DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
- name: DS Report
run: |
ds_report
diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
index 3109f6060944f..ebef2d35c2780 100644
--- a/.github/workflows/nv-torch-latest-v100.yml
+++ b/.github/workflows/nv-torch-latest-v100.yml
@@ -55,5 +55,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
- pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.3" --cuda_ver="11.8"
- pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.3" --cuda_ver="11.8"
+ pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="11.8"
+ pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="11.8"
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index ba8a79f63469c..3103e3f36e84f 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -24,7 +24,7 @@ jobs:
pyVersion: ["3.7", "3.8", "3.9", "3.10"]
fail-fast: false
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container:
image: deepspeed/gh-builder:py${{ matrix.pyVersion }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2f571a14b228c..02881ef12f39b 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -7,7 +7,7 @@ on:
jobs:
deploy:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
environment: release-env
steps:
diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml
index c5a23fe3f53f7..1042db100a21f 100644
--- a/.github/workflows/xpu-max1100.yml
+++ b/.github/workflows/xpu-max1100.yml
@@ -36,38 +36,36 @@ jobs:
unit-tests:
runs-on: [self-hosted, intel, xpu]
container:
- image: intel/intel-extension-for-pytorch:2.1.30-xpu
+ image: intel/oneapi-basekit:2024.1.1-devel-ubuntu22.04
ports:
- 80
options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL
steps:
- uses: actions/checkout@v4
- - name: Check container state
- shell: bash
- run: |
- ldd --version
- python -c "import torch; print('torch:', torch.__version__, torch)"
- python -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
-
- - name: Install deepspeed
+ - name: Install prerequisite
run: |
- pip install py-cpuinfo
+ apt-get update
+ apt-get install clinfo libaio-dev python3-pip -y
+ pip install torch==2.1.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
+ pip install intel-extension-for-pytorch==2.1.30+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
+ pip install intel-extension-for-pytorch-deepspeed==2.1.30 -f https://developer.intel.com/ipex-whl-stable-xpu
+ pip install oneccl_bind_pt==2.1.300+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
+ pip install torchvision==0.16.0.post2 -f https://developer.intel.com/ipex-whl-stable-xpu
+ pip install py-cpuinfo numpy==1.26
pip install .[dev,autotuning]
- ds_report
- python -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
- - name: Python environment
+ - name: Check container state
run: |
+ ldd --version
+ ds_report
+ python3 -c "import torch; print('torch:', torch.__version__, torch)"
+ python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())"
+ python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)"
pip list
- name: Unit tests
run: |
- pip install pytest pytest-timeout tabulate tensorboard wandb
- export ONEAPI_ROOT=/opt/intel/oneapi/redist
- export FI_PROVIDER_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib/prov
- export LD_LIBRARY_PATH=$ONEAPI_ROOT/opt/mpi/libfabric/lib:$LD_LIBRARY_PATH
- export LD_LIBRARY_PATH=$ONEAPI_ROOT/lib:$LD_LIBRARY_PATH
cd tests/unit
pytest --verbose accelerator/*
pytest --verbose autotuning/*
@@ -75,8 +73,10 @@ jobs:
pytest --verbose checkpoint/test_moe_checkpoint.py
pytest --verbose checkpoint/test_shared_weights.py
pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py
+ pytest --verbose model_parallelism/*
pytest --verbose moe/test_moe_tp.py
pytest --verbose monitor/*
+ pytest --verbose utils/*
pytest --verbose runtime/test_ds_config_model.py
pytest --verbose runtime/pipe/test_pipe_schedule.py
pytest --verbose runtime/zero/test_zero_config.py
diff --git a/README.md b/README.md
index 5f990fd70d7d1..304169b567777 100755
--- a/README.md
+++ b/README.md
@@ -15,6 +15,9 @@
## Latest News
DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) .
+
+
+* [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/README.md) [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-gds/japanese/README.md)]
* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md) [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
* [2024/03] [DeepSpeed-FP6:The power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
* [2024/01] [DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py
index dd87461696cf6..1f407e86787ee 100644
--- a/accelerator/hpu_accelerator.py
+++ b/accelerator/hpu_accelerator.py
@@ -3,6 +3,7 @@
# DeepSpeed Team
+import functools
import os
import pkgutil
import importlib
@@ -41,9 +42,8 @@ def handles_memory_backpressure(self):
return True
def device_name(self, device_index=None):
- if device_index is None:
- return 'hpu'
- return 'hpu:{}'.format(device_index)
+ # device_index is intentionally ignored; always return the generic 'hpu' device name.
+ return 'hpu'
def device(self, device_index=None):
return torch.device(self.device_name(device_index))
@@ -196,31 +196,31 @@ def replay_graph(self, graph):
# Tensor operations
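+ # The tensor-type factories below emulate the legacy torch.*Tensor constructors by
+ # binding torch.tensor to a fixed dtype and device='hpu' via functools.partial.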
@property
def BFloat16Tensor(self):
- return self.hpu.BFloat16Tensor
+ return functools.partial(torch.tensor, dtype=torch.bfloat16, device='hpu')
@property
def ByteTensor(self):
- return self.hpu.ByteTensor
+ return functools.partial(torch.tensor, dtype=torch.uint8, device='hpu')
@property
def DoubleTensor(self):
- return self.hpu.DoubleTensor
+ return functools.partial(torch.tensor, dtype=torch.double, device='hpu')
@property
def FloatTensor(self):
- return self.hpu.FloatTensor
+ return functools.partial(torch.tensor, dtype=torch.float, device='hpu')
@property
def HalfTensor(self):
- return self.hpu.HalfTensor
+ return functools.partial(torch.tensor, dtype=torch.half, device='hpu')
@property
def IntTensor(self):
- return self.hpu.IntTensor
+ return functools.partial(torch.tensor, dtype=torch.int, device='hpu')
@property
def LongTensor(self):
- return self.hpu.LongTensor
+ return functools.partial(torch.tensor, dtype=torch.long, device='hpu')
def pin_memory(self, tensor, align_bytes=1):
return tensor.pin_memory(self.device())
@@ -297,7 +297,11 @@ def export_envs(self):
return []
def visible_devices_envs(self):
- return ['HABANA_VISIBLE_MODULES']
+ # The way DeepSpeed currently sets this env var is not applicable to all HPU instances.
+ # Users have to follow the instructions in:
+ # https://docs.habana.ai/en/latest/PyTorch/Reference/PT_Multiple_Tenants_on_HPU/Multiple_Workloads_Single_Docker.html
+ # keeping CUDA_VISIBLE_DEVICES
+ return ['CUDA_VISIBLE_DEVICES'] #['HABANA_VISIBLE_MODULES']
def set_visible_devices_envs(self, current_env, local_accelerator_ids):
for env in self.visible_devices_envs():
diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py
index 4fb107b0fa630..6da48000dafa4 100644
--- a/accelerator/xpu_accelerator.py
+++ b/accelerator/xpu_accelerator.py
@@ -26,7 +26,11 @@ def is_synchronized_device(self):
return False
def use_host_timers(self):
- return self.is_synchronized_device()
+ # Workaround: XPU event-based timing will only be consolidated in IPEX 2.5, so use host timers until then.
+ if ipex.__version__ < '2.5':
+ return True
+ else:
+ return self.is_synchronized_device()
def resolves_data_dependency(self):
return self.is_synchronized_device()
diff --git a/blogs/deepspeed-fastgen/README.md b/blogs/deepspeed-fastgen/README.md
index e25340619a211..e287af2540ed4 100644
--- a/blogs/deepspeed-fastgen/README.md
+++ b/blogs/deepspeed-fastgen/README.md
@@ -231,7 +231,10 @@ We currently support the following model architectures in this alpha release of
* [Falcon](https://huggingface.co/models?other=falcon)
* [Mixtral](https://huggingface.co/models?other=mixtral)
* [Phi-2](https://huggingface.co/models?other=phi-msft)
+* [Phi-3](https://huggingface.co/models?other=phi3)
* [Qwen](https://huggingface.co/models?other=qwen)
+* [Qwen2](https://huggingface.co/models?other=qwen2)
+* [Qwen2-MoE](https://huggingface.co/models?other=qwen2_moe)
All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.
diff --git a/blogs/deepspeed-fastgen/chinese/README.md b/blogs/deepspeed-fastgen/chinese/README.md
index fb9cc7319ab62..1e92e4169450b 100644
--- a/blogs/deepspeed-fastgen/chinese/README.md
+++ b/blogs/deepspeed-fastgen/chinese/README.md
@@ -226,6 +226,10 @@ DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII
* [LLaMA](https://huggingface.co/models?other=llama) 和 [LLaMA-2](https://huggingface.co/models?other=llama-2)
* [Mistral](https://huggingface.co/models?other=mistral)
* [OPT](https://huggingface.co/models?other=opt)
+* [Falcon](https://huggingface.co/models?other=falcon)
+* [Mixtral](https://huggingface.co/models?other=mixtral)
+* [Phi-2](https://huggingface.co/models?other=phi-msft)
+* [Qwen](https://huggingface.co/models?other=qwen)
所有当前模型都利用了后端的 [HuggingFace](https://github.com/huggingface) API 来提供模型权重和模型对应的分词器。
diff --git a/blogs/deepspeed-gds/README.md b/blogs/deepspeed-gds/README.md
new file mode 100644
index 0000000000000..34416c07ea4d6
--- /dev/null
+++ b/blogs/deepspeed-gds/README.md
@@ -0,0 +1,88 @@
+
+
+# DeepNVMe: Improving DL Applications through I/O Optimizations
+
+
+
+# Introduction
+
+Deep Learning (DL) continues to drive unprecedented advancements across important
+Artificial Intelligence domains including language, speech, video, and multimodal applications.
+A key factor in these advancements is dramatic scalability on multiple dimensions including model size,
+sequence length, and hardware parallelism. From a system perspective, DL scalability puts significant
+pressure on essential subsystems including computation, memory, communication, and storage. However,
+existing DL optimization efforts have mostly neglected the storage subsystem, making I/O operations such
+as data loading, model checkpointing, and offloading the main bottlenecks of large-scale DL. To address
+this problem, DeepSpeed has created a suite of I/O optimizations collectively called DeepNVMe.
+
+DeepNVMe improves the performance and efficiency of I/O-bound DL applications by accelerating I/O operations
+and reducing hardware requirements. It achieves this by leveraging storage innovations such as Non-Volatile
+Memory Express (NVMe) Solid State Drives (SSDs) and NVIDIA Magnum IO™ GPUDirect® Storage (GDS). In this
+blog we show the benefits of DeepNVMe using microbenchmarks and an inference application. In experiments
+conducted on an Azure NC96ads\_A100\_v4 VM, we observed that DeepNVMe saturates available NVMe bandwidth for
+data transfers with GPU or CPU memory, achieving up to 10GB/sec reads and 5GB/sec writes.
+
+# Background
+High-performance access to persistent storage is a common challenge in many computing domains, including DL. Thus, a significant number of hardware and software solutions have been proposed. DeepNVMe builds on three such solutions: (1) NVMe SSDs, (2) NVIDIA GDS, and (3) Linux Asynchronous I/O (libaio). We will briefly describe each of these technologies.
+
+NVMe SSDs are Flash-based storage devices that are replacing much slower hard disk drives (HDD) as primary persistent storage in modern servers. For example, an Azure NC96ads\_A100\_v4 VM is equipped with four NVMe SSDs which are individually capable of 3.25 GB/sec reads and can be combined in a RAID-0 configuration for a theoretical aggregate read bandwidth of 13 GB/sec. NVIDIA GDS enables direct transfers between NVMe and GPU memory thus avoiding the inefficiencies of the traditional approach of using intermediate CPU memory (bounce buffer). NVIDIA GDS is generally available in CUDA versions 11.4 and above. Finally, libaio is an asynchronous I/O stack introduced in Linux to better extract raw performance of fast storage devices like NVMe SSDs compared to the traditional I/O stack.
+
+# DeepNVMe: an Optimization Module for Deep Learning I/O
+
+DeepNVMe is a Python module that we developed with two key design principles. First, it leverages the storage technologies discussed above to implement powerful optimizations such as non-blocking I/O operations, bulk submission of I/O operations, parallelization of an individual I/O operation, and a lightweight runtime. Second, it exposes these I/O optimizations through a simple POSIX-like interface to foster easy integration into DL applications while avoiding the complexities of the underlying technologies.
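+
+To make the interface concrete, below is a minimal sketch of driving the DeepNVMe handle from Python. The constructor arguments and the `sync_pwrite`/`async_pread`/`wait` calls mirror the handle bindings in this release; the NVMe file path and buffer size are illustrative assumptions only.
+
+```python
+import torch
+from deepspeed.ops.op_builder import AsyncIOBuilder
+
+aio_ops = AsyncIOBuilder().load()
+# Arguments: block_size, queue_depth, single_submit, overlap_events, num_threads.
+handle = aio_ops.aio_handle(1024 * 1024, 128, False, False, 1)
+
+# Page-locked CPU buffer so the engine can skip its internal bounce buffer.
+buffer = torch.empty(64 * 1024 * 1024, dtype=torch.uint8).pin_memory()
+
+handle.sync_pwrite(buffer, '/local_nvme/scratch.bin')  # blocking parallel write
+handle.async_pread(buffer, '/local_nvme/scratch.bin')  # non-blocking parallel read
+handle.wait()                                          # block until outstanding ops complete
+```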
+
+# Evaluation
+
+Our experiments are conducted on an Azure NC96ads\_A100\_v4 VM with setup details summarized in Table 1. For multi-device experiments, the SSDs are combined in a RAID-0 configuration.
+
+
+
+
+Table 1: Experimental setup details
+
+
+## Microbenchmark Performance
+
+We used three benchmarking tools for our evaluations. The first is fio, the popular I/O benchmarking tool written in C. The second is gdsio from NVIDIA for benchmarking GDS performance. The third is ds\_io, a Python tool that we created for easy integration with DeepNVMe and to be more representative of DL applications which are commonly Python-based.
+
+## High-Performance I/O with CPU Buffers via NVMe Scaling
+
+Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
+
+
+
+
+Figure 1: Using DeepNVMe to scale data transfers between NVMe and CPU buffer
+
+
+## High-Performance I/O with GPU Buffers via NVMe Scaling
+
+Our second set of microbenchmark evaluations used gdsio and ds\_io to measure the performance of 1GB data transfer between NVMe and GPU memory. For this experiment, we configure ds\_io to use both the traditional bounce buffer approach and the more efficient GDS approach. The results are summarized in Figure 2, from which we make three observations. First, we see that GDS improves performance in DeepNVMe compared to the traditional bounce buffer approach, with up to 37% speedup. Second, DeepNVMe demonstrates high performance by matching (and sometimes surpassing) gdsio despite being more representative of DL applications. Third, we see that DeepNVMe, with and without GDS, scales I/O performance with available NVMe bandwidth. With GDS, DeepNVMe achieves a maximum of 9.6GB/sec reads and 5GB/sec writes, and without GDS achieves 7GB/sec reads and 4GB/sec writes.
+
+
+
+
+Figure 2: Using DeepNVMe to scale data transfers between NVMe and GPU memory
+
+
+## ZeRO-Inference: Generative AI Performance
+
+ZeRO-Inference is an AI democratization technology that reduces the hardware cost of inferencing massive models by using DeepNVMe to offload model weights to CPU or NVMe memory. ZeRO-Inference is well suited for throughput-oriented applications, such as offline inferencing, and for scenarios with a limited hardware budget. We use a token generation workload to evaluate DeepNVMe performance for NVMe offloading.
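+
+For reference, NVMe offloading of this kind is enabled purely through the DeepSpeed configuration. The sketch below shows the relevant `zero_optimization` and `aio` sections; the mount point and tuning values are placeholders to adapt to your own SSD layout, not the settings used in these experiments.
+
+```python
+# Hedged sketch of a ZeRO-Inference style config with parameter offload to NVMe.
+ds_config = {
+    "train_micro_batch_size_per_gpu": 1,
+    "zero_optimization": {
+        "stage": 3,
+        "offload_param": {
+            "device": "nvme",
+            "nvme_path": "/local_nvme",  # e.g. a RAID-0 mount of the NVMe SSDs (assumption)
+            "pin_memory": True,
+        },
+    },
+    # DeepNVMe settings used for the offloaded parameter reads.
+    "aio": {
+        "block_size": 1048576,
+        "queue_depth": 8,
+        "single_submit": False,
+        "overlap_events": True,
+    },
+}
+```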
+
+## High-Performance Offloading via NVMe Scaling
+
+We measure the generation throughput of inferencing a LLAMA3-70B model on a single NVIDIA A100-80GB with a prompt length of 512, generation length of 32, and batch size of 96. We scale the number of NVMe SSDs from 1 to 4 and present the results for ZeRO-Inference with and without GDS in Figure 3. We make two observations from these results. First, GDS consistently provides better performance compared to the bounce buffer approach, achieving 10-18% faster token generation. Second, DeepNVMe, with and without GDS, scales generation performance with available NVMe bandwidth. With four NVMe SSDs, DeepNVMe achieves generation throughput rates of 7 tokens per second with GDS and 6 tokens per second without GDS. Our profiling results suggest that DeepNVMe will continue to scale with more NVMe bandwidth, making it an economical option for boosting generative application performance.
+
+
+
+
+Figure 3: Using DeepNVMe to scale LLAMA3-70B token generation performance with NVMe offloading.
+
+
+# Summary
+
+In this blog post, we introduced DeepNVMe, an I/O optimization technology created to tackle the emergence of I/O operations as key bottlenecks of Deep Learning scalability. DeepNVMe enables fast and efficient data transfers between persistent storage and DL application memory through optimizations built on popular storage technologies such as NVMe SSDs and NVIDIA GDS. We showed the benefits of using DeepNVMe for LLAMA3-70B token generation on a single A100-80GB GPU with NVMe offloading, for which it achieves up to 7 tokens per second in generation throughput on an Azure NC96ads\_A100\_v4 VM. DeepNVMe will be open-sourced and generally available in DeepSpeed versions >= [0.15.0](https://github.com/microsoft/DeepSpeed/releases/tag/v0.15.0). In future blogs, we will report DeepNVMe improvements for other I/O-bound DL applications such as model checkpointing and data loading.
+
+
+# Acknowledgements
+This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from NVIDIA.
diff --git a/blogs/deepspeed-gds/japanese/README.md b/blogs/deepspeed-gds/japanese/README.md
new file mode 100644
index 0000000000000..8d65d5225b160
--- /dev/null
+++ b/blogs/deepspeed-gds/japanese/README.md
@@ -0,0 +1,77 @@
+
+
+# DeepNVMe: I/O最適化による深層学習アプリケーションの高速化
+
+
+
+# はじめに
+
+深層学習(Deep Learning)は、言語、音声、ビデオ、マルチモーダルアプリケーションなどの重要なAIの応用領域において、かつてない進歩を続けています。この進歩の鍵となる要因は、モデルサイズ、シーケンス長、ハードウェア並列性などの複数の次元での劇的なスケーラビリティです。システムの観点から見ると、深層学習のスケーラビリティは計算、メモリ、通信、ストレージなどの重要なサブシステムに大きな負荷をかけます。しかし、既存の取り組みは、ストレージサブシステムの最適化はほとんど扱われておらず、データロード、モデルチェックポイント、オフロードなどのI/O操作が大規模な深層学習の主要なボトルネックとなっています。この問題に対処するために、DeepSpeedは一連のI/O最適化機能を「DeepNVMe」と呼ばれる形で提供します。
+
+DeepNVMeは、I/O操作の高速化とハードウェア要件の緩和によって、I/Oがボトルネックとなる深層学習アプリケーションのパフォーマンスと効率を向上させます。これを実現するために、Non-Volatile Memory Express(NVMe)やSSD、NVIDIA Magnum IO™ GPUDirect® Storage(GDS)などのストレージ技術を活用しています。このブログでは、マイクロベンチマークと推論アプリケーションの性能評価結果に基づいて、DeepNVMeの利点を示します。Azure NC96ads_A100_v4 VMで実施された実験では、DeepNVMeがGPUまたはCPUメモリへのデータ転送で利用可能なNVMe帯域幅を最大限に活用し、最大10GB/秒の読み取りと5GB/秒の書き込みを達成しました。
+
+# 背景
+
+永続ストレージへの高性能アクセスは、深層学習を含む多くのコンピューティングドメインで共通の課題です。これに対して、多くのハードウェアおよびソフトウェアソリューションが提案されています。DeepNVMeは、以下の3つのソリューションを基に構築されています。(1) NVMe SSD、(2) NVIDIA GDS、(3) Linux非同期I/O(libaio)。これらの技術について簡単に説明します。
+
+NVMe SSDは、現代のサーバーで主要な永続ストレージとして、従来の遅いハードディスクドライブ(HDD)に取って代わるフラッシュベースのストレージデバイスです。たとえば、Azure NC96ads_A100_v4 VMには4つのNVMe SSDが装備されており、それぞれが3.25 GB/秒の読み取り速度を持ち、RAID-0構成で組み合わせると理論上の合計読み取り帯域幅は13 GB/秒となります。NVIDIA GDSは、NVMeとGPUメモリ間の直接転送を可能にすることで、中間のCPUメモリ(バウンスバッファ)を使用する従来のアプローチの非効率を回避します。NVIDIA GDSは、CUDAバージョン11.4以上で利用可能です。最後に、libaioは、従来のI/Oスタックと比較して、NVMe SSDのような高速ストレージデバイスの性能をより引き出すためにLinuxに導入された非同期I/Oスタックです。
+
+# DeepNVMe: 深層学習のためのI/O最適化モジュール
+
+DeepNVMeは、以下の2つの主要な設計原則に基づいて開発されたPythonモジュールです。第一に、上記のストレージ技術を活用して、ノンブロッキングI/O操作、I/O操作の一括送信、個々のI/O操作の並列化、軽量なランタイムなどの最適化を実装しています。第二に、これらのI/O最適化をシンプルなPOSIXライクなインターフェースを通じて提供し、深層学習アプリケーションへの容易な統合を促進し、基盤となっている複雑な技術を直接扱うことなく、その性能を活用することを可能にします。
+
+# 評価
+
+実験は、Azure NC96ads_A100_v4 VMで実施されました。設定の詳細は表1の通りです。
+
+
+
+
+表1: 実験設定の詳細
+
+
+## マイクロベンチマーク
+
+評価には3つのベンチマークツールを使用しました。一つ目は、C言語で書かれた一般的なI/Oベンチマークツールであるfioです。次に、GDSパフォーマンスのベンチマークを行うためのNVIDIAのgdsioです。最後に、DeepNVMeとの容易な統合のために我々が作成したds_ioです。ds_ioは、深層学習アプリケーションで代表的に使用されるPythonで作成されています。
+
+## CPUバッファを使用したNVMeスケーリングによる高性能I/O
+
+最初のマイクロベンチマーク評価では、fioとds_ioを使用して、NVMeとCPUメモリ間で1GBのデータを転送するパフォーマンスを測定しました。これらの実験ではfioをlibaioバックエンドに設定しました。結果は図1の通りです。ここから、2つの点が読み取れます。第一に、DeepNVMeは、深層学習アプリケーションにおける性能改善を目指したものであるにも関わらず、このマイクロベンチマークでもfioに匹敵する高性能を示しています。第二に、DeepNVMeは、利用可能なNVMe帯域幅にほぼ線形にスケールし、10GB/秒の読み取りおよび5GB/秒の書き込み速度を達成しています。
+
+
+
+
+図1: DeepNVMeを使用したNVMeとCPUバッファ間のデータ転送のスケーリング
+
+
+## GPUバッファを使用したNVMeスケーリングによる高性能I/O
+
+二つ目のマイクロベンチマーク評価では、gdsioとds_ioを使用して、NVMeとGPUメモリ間で1GBのデータ転送のパフォーマンスを測定しました。この実験では、ds_ioを従来のバウンスバッファアプローチとより効率的なGDSアプローチの両方で設定します。結果は図2の通りです。ここから、次の3点が観察できます。第一にGDSを用いるケースで、従来のバウンスバッファアプローチと比較して、DeepNVMeは最大で37%のスピードアップを実現しています。第二に、DeepNVMeは、深層学習アプリケーションのために作成されたものであるにも関わらず、gdsioに匹敵する(時にはそれを上回る)高性能を示します。第三に、DeepNVMeは、GDSの有無にかかわらず、NVMe帯域幅を最大限に活用できます。GDSを使用した場合、DeepNVMeは最大9.6GB/秒の読み取りおよび5GB/秒の書き込み速度を達成し、GDSを使用しない場合は7GB/秒の読み取りおよび4GB/秒の書き込み速度を達成します。
+
+
+
+
+図2: DeepNVMeを使用したNVMeとGPUメモリ間のデータ転送のスケーリング
+
+
+## ZeRO-Inference: 生成AIパフォーマンス
+
+ZeRO-Inferenceは、モデルの重み(パラメータ)をCPUまたはNVMeメモリにオフロードすることで、大規模モデルの推論に必要なハードウェアコストを削減し、限られたハードウェア資源しかないユーザでも大規模モデルを活用できるようにするための技術です。ZeRO-Inferenceは、オフライン推論などのスループット指向のアプリケーションや、ハードウェア予算が限られているシナリオに適しています。DeepNVMeのNVMeオフロードのパフォーマンスを評価するために、トークン生成ワークロードを使用します。
+
+## NVMeスケーリングによる高性能オフロード
+
+LLAMA3-70Bモデルの推論を単一のNVIDIA A100-80GBで、プロンプト長512、生成長32、バッチサイズ96で実行し、生成スループットを測定します。NVMe SSDの数を1から4までスケーリングし、GDSの有無でZeRO-Inferenceの結果を図3に示します。この結果から、2つの観察ができます。第一に、GDSはバウンスバッファアプローチと比較して一貫して優れたパフォーマンスを提供し、トークン生成を10-18%高速化します。第二に、DeepNVMeは、GDSの有無にかかわらず、利用可能なNVMe帯域幅にスケールします。4つのNVMe SSDを使用する場合、DeepNVMeはGDSを使用して1秒あたり7トークン、GDSを使用しない場合は1秒あたり6トークンの生成スループットを達成します。プロファイリング結果は、DeepNVMeがより多くのNVMe帯域幅で引き続きスケールし、生成アプリケーションのパフォーマンスを低コストで向上できることを示しています。
+
+
+
+
+図3: DeepNVMeを使用したLLAMA3-70Bトークン生成パフォーマンスのNVMeオフロードによるスケーリング
+
+
+# まとめ
+
+このブログ記事では、深層学習のスケーラビリティにおいて主要なボトルネックとなるI/O操作を最適化する、DeepNVMeを紹介しました。DeepNVMeは、NVMe SSDやNVIDIA GDSなどのストレージ技術に基づいた最適化を通じて、永続ストレージと深層学習アプリケーションのデータ転送を高速かつ効率的に実現します。Azure NC96ads_A100_v4 VMでの単一A100-80GB GPUを使用したLLAMA3-70Bトークン生成において、DeepNVMeを使用することで、NVMeオフロードで最大7トークン/秒の生成スループットを達成しました。DeepNVMeはオープンソース化され、DeepSpeedバージョン[0.15.0](https://github.com/microsoft/DeepSpeed/releases/tag/v0.15.0).以上で利用可能です。今後のブログでは、モデルチェックポイントやデータロードなどの他のI/Oがボトルネックとなる深層学習アプリケーションに対するDeepNVMeの改善について報告します。
+
+# 謝辞
+
+この成果は、MicrosoftとNVIDIAの協力によるものです。MicrosoftからはJoe Mayer、Martin Cai、Olatunji Ruwase、NVIDIAからはKiran Modukuri、Vahid Noormofidi、Sourab Gupta、Sandeep Joshiが貢献しました。
diff --git a/blogs/deepspeed-gds/media/figure1.png b/blogs/deepspeed-gds/media/figure1.png
new file mode 100755
index 0000000000000..08db7d2f8afaf
Binary files /dev/null and b/blogs/deepspeed-gds/media/figure1.png differ
diff --git a/blogs/deepspeed-gds/media/figure2.png b/blogs/deepspeed-gds/media/figure2.png
new file mode 100755
index 0000000000000..35be5d4c40158
Binary files /dev/null and b/blogs/deepspeed-gds/media/figure2.png differ
diff --git a/blogs/deepspeed-gds/media/figure3.png b/blogs/deepspeed-gds/media/figure3.png
new file mode 100755
index 0000000000000..7175236f886b3
Binary files /dev/null and b/blogs/deepspeed-gds/media/figure3.png differ
diff --git a/blogs/deepspeed-gds/media/table1.png b/blogs/deepspeed-gds/media/table1.png
new file mode 100755
index 0000000000000..bba5713699320
Binary files /dev/null and b/blogs/deepspeed-gds/media/table1.png differ
diff --git a/blogs/windows/08-2024/README.md b/blogs/windows/08-2024/README.md
new file mode 100644
index 0000000000000..34e11bd47792b
--- /dev/null
+++ b/blogs/windows/08-2024/README.md
@@ -0,0 +1,101 @@
+
+
+# DeepSpeed on Windows
+
+
+
+# Introduction
+
+DeepSpeed is a popular open-source deep learning optimization library that makes distributed training and inference easy, efficient, and effective. DeepSpeed has been widely used to train a variety of state-of-the-art models, including Phi-3, Megatron-Turing-530B, BLOOM-176B, and Arctic because of its rich suite of sophisticated optimizations (e.g., ZeRO, 3D parallelism, MoE, etc.). However, the lack of native support for Microsoft Windows, the most popular operating system, means that DeepSpeed innovations are inaccessible to many AI developers and users. To address this problem, we started an effort to make DeepSpeed run natively with full features on Windows, while ensuring the same ease-of-use enjoyed on Linux.
+
+In this blog, we are pleased to announce some early achievements on this journey: DeepSpeed can now be installed in Windows and run natively for single-GPU training, finetuning, and inferencing. Importantly, both the installation and usage experiences are identical to those on Linux. Furthermore, the finetuning and inferencing workloads demonstrate the functioning of three critical DeepSpeed features, HuggingFace Transformers integration, LoRA support, and CPU Offloading. DeepSpeed on Windows is available in DeepSpeed versions 0.14.5 and above. In the rest of this blog, we present examples to demonstrate these achievements.
+
+# Evaluation Environment
+We conducted the experiments on a Surface Laptop Studio 2 running Windows 11 Version 23H2 and Build 22631.3880. The laptop is equipped with a single NVIDIA RTX A2000 GPU with 4GB VRAM. We used Pytorch version 2.3.0 and HuggingFace Transformers version 4.41.2. The example scripts used are from the [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples), therefore you need to clone the repo before running any of the following examples.
+
+# Installation
+DeepSpeed can be installed on Windows in one of two ways. The easier way is to use the pip package manager, while the other is to build from source. The prerequisites in both cases are Python 3.x and PyTorch with CUDA support.
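+
+A quick way to confirm the prerequisites is the same check used by the DeepSpeed CI workflows (a minimal sanity-check snippet, not a required step):
+
+```python
+import sys
+import torch
+
+print('python:', sys.version)
+print('torch:', torch.__version__)
+print('CUDA available:', torch.cuda.is_available())
+```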
+
+## Installing via pip
+To install DeepSpeed, simply run: `pip install deepspeed`. This will install the latest version of DeepSpeed (0.14.5 at this time). Unlike the Linux counterpart, the Windows version comes with all the operators already prebuilt, so there is no need to have a CUDA SDK or C++ compiler installed.
+
+
+
+
+
+
+ pip installation of DeepSpeed on Windows.
+
+
+
+## Building from Source
+To build DeepSpeed from source, you need to clone the DeepSpeed repository and run the `build_win.bat` compilation script.
+
+
+## Validating Installation
+Regardless of the installation method, you can check that the installation was successful by running `ds_report`. The output should look like this:
+
+
+
+
+
+
+
+ ds_report output confirming Windows installation of DeepSpeed.
+
+
+# Pretraining Examples
+We use an image classification model, CIFAR10, and a language model, BERT, to demonstrate pretraining on Windows with DeepSpeed.
+
+## Pretraining CIFAR10
+The scripts and codes required for the CIFAR10 pretraining example are available in the following path: DeepSpeedExamples\training\cifar. You can launch the CIFAR10 pretraining experiment using the following command: `deepspeed cifar10_deepspeed.py --deepspeed`. The final output should look something like this:
+
+
+
+
+
+ Pretraining CIFAR10 model on Windows using DeepSpeed.
+
+
+## Pretraining BERT
+The scripts and codes for the BERT pretraining example are available in the following path: DeepSpeedExamples\training\HelloDeepSpeed. You can launch the BERT pretraining experiment using the following command: `deepspeed train_bert_ds.py --checkpoint_dir experiment_deepspeed`. The final output should look like this:
+
+
+
+
+
+
+ Pretraining BERT model on Windows using DeepSpeed.
+
+
+# Fine Tuning Example
+We demonstrate fine tuning capability by using the supervised fine tuning (SFT) step of the DeepSpeed-Chat application. We conduct SFT of the HuggingFace facebook/opt-125m model while enabling the LoRA and CPU offloading memory optimizations. The command line for running this example is as follows:
+`deepspeed training\step1_supervised_finetuning\main.py --model_name_or_path facebook/opt-125m --gradient_accumulation_steps 8 --lora_dim 128 --only_optimize_lora --print_loss --zero_stage 2 --deepspeed --dtype bf16 --offload --output_dir output`
+The output should look like this:
+
+
+
+
+
+
+ Supervised Finetuning of facebook/opt-125m model on Windows using DeepSpeed.
+
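+For reference, the `--zero_stage 2`, `--offload`, and `--dtype bf16` flags above correspond roughly to a DeepSpeed configuration of the following shape (an illustrative sketch, not the exact dictionary generated by the DeepSpeed-Chat scripts; the batch size is a placeholder):
+
+```python
+ds_config = {
+    "train_micro_batch_size_per_gpu": 8,           # illustrative value
+    "gradient_accumulation_steps": 8,              # --gradient_accumulation_steps 8
+    "bf16": {"enabled": True},                     # --dtype bf16
+    "zero_optimization": {
+        "stage": 2,                                # --zero_stage 2
+        "offload_optimizer": {"device": "cpu"},    # --offload
+    },
+}
+```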
+
+# Inference Example
+We demonstrate inference capability by using ZeRO-Inference for token generation. ZeRO-Inference reduces the hardware cost of inferencing by offloading to CPU or NVMe memory. We use the example scripts here to run token generation using the Llama-2-7B model from HuggingFace. We offload the model weights to CPU memory since the 4GB VRAM is insufficient to host both the model and the generation working set. We use the following command line to generate 32 tokens from a prompt of 8 tokens:
+`deepspeed run_model.py --model meta-llama/Llama-2-7b-hf --batch-size 64 --prompt-len 8 --gen-len 32 --cpu-offload`
+The output will look something like this:
+
+
+
+
+
+
+ LLAMA2-7B token generation on Windows using ZeRO-Inference.
+
+
+# Summary
+Enabling DeepSpeed, a popular deep learning framework, to run natively on Windows, the most popular operating system, is a crucial step towards empowering every person and every organization to benefit from the ongoing AI revolution. In this blog, we have shared early results of our work towards this goal. Although Windows support of DeepSpeed is a work-in-progress, we hope that the above updates are encouraging and already useful to users. The next items on our roadmap include running on multiple GPUs, weight quantization, and performance studies.
+
+# Acknowledgements
+This work is a result of significant contributions from current and former DeepSpeed members including Costin Eseanu, Logan Adams, Elton Zheng, Reza Yazdani Aminabadi, Martin Cai, and Olatunji Ruwase. We also acknowledge the valuable contributions of DeepSpeed users who righteously demanded this feature, provided critical workarounds, partial solutions, and constructive feedback, and most importantly, stuck with us.
diff --git a/blogs/windows/08-2024/media/bert_training.png b/blogs/windows/08-2024/media/bert_training.png
new file mode 100644
index 0000000000000..c5935e47747e8
Binary files /dev/null and b/blogs/windows/08-2024/media/bert_training.png differ
diff --git a/blogs/windows/08-2024/media/cifar10_training.png b/blogs/windows/08-2024/media/cifar10_training.png
new file mode 100644
index 0000000000000..99f3fa25bc708
Binary files /dev/null and b/blogs/windows/08-2024/media/cifar10_training.png differ
diff --git a/blogs/windows/08-2024/media/ds_report.png b/blogs/windows/08-2024/media/ds_report.png
new file mode 100644
index 0000000000000..43d82d724ed24
Binary files /dev/null and b/blogs/windows/08-2024/media/ds_report.png differ
diff --git a/blogs/windows/08-2024/media/llama2-7b_inference.png b/blogs/windows/08-2024/media/llama2-7b_inference.png
new file mode 100644
index 0000000000000..f5874468a8543
Binary files /dev/null and b/blogs/windows/08-2024/media/llama2-7b_inference.png differ
diff --git a/blogs/windows/08-2024/media/opt125m_finetuning.png b/blogs/windows/08-2024/media/opt125m_finetuning.png
new file mode 100644
index 0000000000000..ed6d1522e3b3f
Binary files /dev/null and b/blogs/windows/08-2024/media/opt125m_finetuning.png differ
diff --git a/blogs/windows/08-2024/media/win_pip_install_deepspeed.png b/blogs/windows/08-2024/media/win_pip_install_deepspeed.png
new file mode 100644
index 0000000000000..3b87c95ef1447
Binary files /dev/null and b/blogs/windows/08-2024/media/win_pip_install_deepspeed.png differ
diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp
index 0f2895dfa328b..a65cc500cc821 100644
--- a/csrc/aio/common/deepspeed_aio_common.cpp
+++ b/csrc/aio/common/deepspeed_aio_common.cpp
@@ -301,9 +301,8 @@ int regular_read(const char* filename, std::vector<char>& buffer)
} while (r > 0);
if (read_bytes != num_bytes) {
- std::cerr << "read error "
- << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes
- << std::endl;
+ std::cerr << "read error " << " read_bytes (read) = " << read_bytes
+ << " num_bytes (fstat) = " << num_bytes << std::endl;
}
assert(read_bytes == num_bytes);
close(fd);
diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
new file mode 100644
index 0000000000000..dc820be528d05
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include "deepspeed_aio_op_desc.h"
+
+using namespace std;
+
+io_op_desc_t::io_op_desc_t(const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const int num_threads,
+ const bool validate)
+ : _read_op(read_op),
+ _buffer(buffer),
+ _fd(fd),
+ _filename(filename),
+ _file_num_bytes(file_num_bytes),
+ _num_threads(num_threads),
+ _num_bytes_per_thread(file_num_bytes / num_threads),
+ _validate(validate)
+{
+}
+
+char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
+
+void io_op_desc_t::finish() {}
+
+void io_op_desc_t::validate() {}
+
+void io_op_desc_t::run(const int tid,
+ std::unique_ptr<aio_context>& aio_ctxt,
+ deepspeed_aio_config_t* aio_config)
+{
+}
diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h
new file mode 100644
index 0000000000000..7305f6920c91a
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h
@@ -0,0 +1,41 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#ifndef _IO_OP_DESC_T_
+#define _IO_OP_DESC_T_
+#include <memory>
+#include <queue>
+#include "deepspeed_py_aio.h"
+
+struct io_op_desc_t {
+ const bool _read_op;
+ torch::Tensor _buffer;
+ int _fd;
+ const std::string _filename;
+ const long long int _file_num_bytes;
+ const int _num_threads;
+ const int _num_bytes_per_thread;
+ torch::Tensor _contiguous_buffer;
+ const bool _validate;
+
+ io_op_desc_t(const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const int num_threads,
+ const bool validate);
+
+ virtual void run(const int tid,
+ std::unique_ptr<aio_context>& aio_ctxt,
+ deepspeed_aio_config_t* aio_config);
+
+ virtual char* data_ptr() const;
+
+ virtual void validate();
+
+ virtual void finish();
+};
+#endif // _IO_OP_DESC_T_
diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp
index c852711a28c03..30c3b49143979 100644
--- a/csrc/aio/py_lib/deepspeed_aio_thread.cpp
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp
@@ -9,50 +9,8 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
#include "deepspeed_aio_thread.h"
-#if defined(__ENABLE_CANN__)
-#include "torch_npu/csrc/framework/utils/OpAdapter.h"
-#include "torch_npu/csrc/framework/utils/UtilForOpAdapter.h"
-#endif
-
using namespace std;
-io_op_desc_t::io_op_desc_t(const bool read_op,
- const torch::Tensor& buffer,
- const int fd,
- const char* filename,
- const long long int num_bytes,
- const bool validate)
- : _read_op(read_op),
- _buffer(buffer),
- _fd(fd),
- _filename(filename),
- _num_bytes(num_bytes),
- _validate(validate)
-{
- _cpu_buffer = (_buffer.is_cuda() || _buffer.is_xpu()
-#if defined(__ENABLE_CANN__)
- || torch_npu::utils::is_npu(_buffer)
-#endif
- )
- ? _buffer.to(torch::kCPU).pin_memory()
- : _buffer;
- _contiguous_buffer = _cpu_buffer.contiguous();
-}
-
-char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
-
-void io_op_desc_t::fini()
-{
- if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
- if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); }
-#if defined(__ENABLE_CANN__)
- if (_read_op && torch_npu::utils::is_npu(_buffer)) {
- auto device = at::Device("npu:0");
- _buffer.copy_(_cpu_buffer.to(device));
- }
-#endif
-}
-
deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config)
: _tid(tid),
_aio_config(aio_config),
@@ -79,18 +37,7 @@ void deepspeed_aio_thread_t::run()
}
if (next_io_op) {
- const auto base_offset = next_io_op->_num_bytes * _tid;
-
- std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(
- next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr()));
-
- if (_aio_config._overlap_events) {
- do_aio_operation_overlap(
- next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
- } else {
- do_aio_operation_sequential(
- next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
- }
+ next_io_op->run(_tid, _aio_ctxt, &_aio_config);
{
std::lock_guard<std::mutex> lock(_complete_sync._mutex);
diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h
index 20799ecbb018e..a192804db13d8 100644
--- a/csrc/aio/py_lib/deepspeed_aio_thread.h
+++ b/csrc/aio/py_lib/deepspeed_aio_thread.h
@@ -10,28 +10,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
#include <condition_variable>
#include <memory>
#include <queue>
-#include "deepspeed_py_aio.h"
-
-struct io_op_desc_t {
- const bool _read_op;
- torch::Tensor _buffer;
- int _fd;
- const std::string _filename;
- const long long int _num_bytes;
- torch::Tensor _cpu_buffer;
- torch::Tensor _contiguous_buffer;
- const bool _validate;
-
- io_op_desc_t(const bool read_op,
- const torch::Tensor& buffer,
- const int fd,
- const char* filename,
- const long long int num_bytes,
- const bool validate);
-
- char* data_ptr() const;
- void fini();
-};
+#include "deepspeed_cpu_op.h"
struct thread_sync_t {
std::mutex _mutex;
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
new file mode 100644
index 0000000000000..41790b99bb889
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp
@@ -0,0 +1,72 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include "deepspeed_cpu_op.h"
+
+using namespace std;
+
+cpu_op_desc_t::cpu_op_desc_t(const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const int num_threads,
+ const bool validate)
+ : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate),
+ _cpu_buffer(buffer)
+{
+ // Need to use CPU bounce buffer if buffer is not a page-locked DRAM memory.
+ _use_bounce_buffer = !(_buffer.is_cpu() && _buffer.is_pinned());
+ if (_use_bounce_buffer) {
+ if (_read_op) {
+ auto options = torch::TensorOptions()
+ .dtype(_buffer.dtype())
+ .layout(_buffer.layout())
+ .device(torch::kCPU);
+ _cpu_buffer = torch::empty(_buffer.nbytes(), options).pin_memory();
+ } else {
+ _cpu_buffer = _buffer.to(torch::kCPU).pin_memory();
+ }
+ }
+ _contiguous_buffer = _cpu_buffer.contiguous();
+}
+
+char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
+
+void cpu_op_desc_t::finish()
+{
+ if (_read_op) {
+ if (_buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); }
+ if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); }
+#if defined(__ENABLE_CANN__)
+ if (torch_npu::utils::is_npu(_buffer)) {
+ auto device = at::Device("npu:0");
+ _buffer.copy_(_cpu_buffer.to(device));
+ }
+#endif
+ }
+}
+
+void cpu_op_desc_t::validate()
+{
+ validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes);
+}
+
+void cpu_op_desc_t::run(const int tid,
+ std::unique_ptr& aio_ctxt,
+ deepspeed_aio_config_t* aio_config)
+{
+ assert(tid < _num_threads);
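+ // Each thread transfers its own contiguous slice of the file:
+ // bytes [tid * _num_bytes_per_thread, (tid + 1) * _num_bytes_per_thread).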
+ const auto base_offset = _num_bytes_per_thread * tid;
+
+ std::unique_ptr<io_xfer_ctxt> xfer_ctxt(
+ new io_xfer_ctxt(_fd, base_offset, _num_bytes_per_thread, data_ptr()));
+
+ if (aio_config->_overlap_events) {
+ do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr);
+ } else {
+ do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr);
+ }
+}
diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h
new file mode 100644
index 0000000000000..da96dd2b1d50b
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_cpu_op.h
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include <memory>
+#include <queue>
+#include "deepspeed_aio_op_desc.h"
+
+struct cpu_op_desc_t : io_op_desc_t {
+ torch::Tensor _cpu_buffer;
+ bool _use_bounce_buffer;
+
+ cpu_op_desc_t(const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const int num_threads,
+ const bool validate);
+
+ void run(const int tid,
+ std::unique_ptr<aio_context>& aio_ctxt,
+ deepspeed_aio_config_t* aio_config);
+
+ char* data_ptr() const;
+
+ void validate();
+
+ void finish();
+};
diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp
index 387b713f2bfc2..eac268d334338 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp
@@ -4,9 +4,6 @@
// DeepSpeed Team
/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
@@ -72,9 +69,8 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer,
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
- std::cout << "Elapsed time(usec): "
- << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
- << std::endl;
+ std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+ << " call = " << fn_time.count() * 1e6 << std::endl;
return 0;
}
@@ -118,8 +114,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer,
const std::chrono::duration<double> fn_time =
std::chrono::high_resolution_clock::now() - start_time;
- std::cout << "Elapsed time(usec): "
- << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
- << std::endl;
+ std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+ << " call = " << fn_time.count() * 1e6 << std::endl;
return 0;
}
diff --git a/csrc/aio/py_lib/deepspeed_py_aio.h b/csrc/aio/py_lib/deepspeed_py_aio.h
index 11d5225de9f10..ba794db5440d5 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio.h
@@ -4,10 +4,7 @@
// DeepSpeed Team
/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
-Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+Functionality for swapping tensors to/from (NVMe) storage devices.
*/
#include
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
index c21e92de94496..c7ca5e82afdef 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp
@@ -4,295 +4,21 @@
// DeepSpeed Team
/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_aio_handle.h"
+#include
using namespace std;
-static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
-
deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size,
const int queue_depth,
const bool single_submit,
const bool overlap_events,
const int num_threads)
- : _aio_ctxt(new aio_context(block_size, queue_depth)),
- _single_submit(single_submit),
- _overlap_events(overlap_events),
- _num_threads(num_threads),
- _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
- _num_pending_ops(0),
- _pinned_tensor_mgr(new deepspeed_pin_tensor_t())
-{
- for (auto i = 0; i < num_threads; ++i) {
- _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
- }
-
- for (auto& ctxt : _thread_contexts) {
- _threads.push_back(std::thread(_start_aio_thread, ctxt));
- }
-}
-
-deepspeed_aio_handle_t::~deepspeed_aio_handle_t()
-{
- _stop_threads();
- for (auto& thr : _threads) { thr.join(); }
-}
-
-const int deepspeed_aio_handle_t::get_block_size() const
-{
- return _aio_ctxt ? _aio_ctxt->_block_size : -1;
-}
-
-const int deepspeed_aio_handle_t::get_queue_depth() const
-{
- return _aio_ctxt ? _aio_ctxt->_queue_depth : -1;
-}
-
-const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; }
-
-const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; }
-
-const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; }
-
-int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
-{
- const auto start_time = std::chrono::high_resolution_clock::now();
-
- assert(_aio_ctxt);
-
- long long num_file_bytes;
- if (-1 == get_file_size(filename, num_file_bytes)) {
- const auto error_code = errno;
- report_file_error(filename, " fstat for read", error_code);
- return -1;
- }
- assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
-
- const auto fd = open_file(filename, true);
- if (fd == -1) { return -1; }
-
- auto read_buffer = (char*)buffer.data_ptr();
- std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
-
- if (_aio_config._overlap_events) {
- do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
- } else {
- do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
- }
-
- close(fd);
- const std::chrono::duration<double> aio_time =
- std::chrono::high_resolution_clock::now() - start_time;
-
- if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
- const std::chrono::duration<double> fn_time =
- std::chrono::high_resolution_clock::now() - start_time;
- std::cout << "Elapsed time(usec): "
- << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
- << std::endl;
- return 0;
-}
-
-int deepspeed_aio_handle_t::write(const torch::Tensor& buffer,
- const char* filename,
- const bool validate)
+ : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
{
- assert(_aio_ctxt);
-
- const auto start_time = std::chrono::high_resolution_clock::now();
-
- const auto fd = open_file(filename, false);
- if (fd == -1) { return -1; }
-
- auto write_buffer = (char*)buffer.data_ptr();
- const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
- std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
-
- if (_aio_config._overlap_events) {
- do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
- } else {
- do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
- }
- const std::chrono::duration<double> aio_time =
- std::chrono::high_resolution_clock::now() - start_time;
-
- close(fd);
-
- if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }
-
- const std::chrono::duration<double> fn_time =
- std::chrono::high_resolution_clock::now() - start_time;
- std::cout << "Elapsed time(usec): "
- << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6
- << std::endl;
- return 0;
}
-void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr<io_op_desc_t> scheduled_op)
-{
- for (auto& ctxt : _thread_contexts) {
- {
- std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
- ctxt->_work_queue.push(scheduled_op);
- }
- ctxt->_work_sync._cond_var.notify_one();
- }
- _num_pending_ops++;
-}
-
-std::shared_ptr<io_op_desc_t> deepspeed_aio_handle_t::_wait_for_aio_work()
-{
- std::shared_ptr<io_op_desc_t> completed_op = nullptr;
- for (auto& ctxt : _thread_contexts) {
- std::unique_lock<std::mutex> lock(ctxt->_complete_sync._mutex);
- ctxt->_complete_sync._cond_var.wait(lock,
- [ctxt] { return !ctxt->_complete_queue.empty(); });
- completed_op = ctxt->_complete_queue.front();
- ctxt->_complete_queue.pop();
- }
- return completed_op;
-}
-
-void deepspeed_aio_handle_t::_stop_threads()
-{
- assert(0 == _num_pending_ops);
- for (auto& ctxt : _thread_contexts) {
- {
- std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
- ctxt->_time_to_exit = true;
- }
- ctxt->_work_sync._cond_var.notify_one();
- }
-}
-
-int deepspeed_aio_handle_t::wait()
-{
- assert(_num_pending_ops > 0);
- auto num_completed_ops = 0;
-
- while (_num_pending_ops > 0) {
- auto completed_op = _wait_for_aio_work();
-
- completed_op->fini();
-
- close(completed_op->_fd);
-
- if (completed_op->_validate) {
- validate_aio_operation(completed_op->_read_op,
- completed_op->_filename.c_str(),
- completed_op->data_ptr(),
- _num_threads * completed_op->_num_bytes);
- }
- --_num_pending_ops;
- ++num_completed_ops;
- }
-
- return num_completed_ops;
-}
-
-bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op,
- const long long int num_bytes)
-{
- const auto op_string = read_op ? "Read" : "Write";
- if (num_bytes % get_thread_count()) {
- std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
- << " not divisible by thread count = " << get_thread_count() << std::endl;
- return false;
- }
-
- return true;
-}
-
-int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer,
- const char* filename,
- const bool validate,
- const bool async)
-{
- long long num_file_bytes;
- if (-1 == get_file_size(filename, num_file_bytes)) {
- const auto error_code = errno;
- report_file_error(filename, " fstat for read", error_code);
- return -1;
- }
- const auto buffer_bytes = static_cast<long long int>(buffer.nbytes());
- if (buffer_bytes != num_file_bytes) {
- std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
- << " != " << num_file_bytes << std::endl;
- }
- assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
- assert((num_file_bytes % _num_threads) == 0);
-
- if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }
-
- const auto fd = open_file(filename, true);
- if (fd == -1) { return -1; }
-
- auto scheduled_op = std::make_shared<io_op_desc_t>(
- true, buffer, fd, filename, (num_file_bytes / _num_threads), validate);
-
- _schedule_aio_work(scheduled_op);
-
- if (async) { return 0; }
-
- return wait();
-}
-
-int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer,
- const char* filename,
- const bool validate,
- const bool async)
-{
- const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
- assert((num_write_bytes % _num_threads) == 0);
-
- if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
-
- const auto fd = open_file(filename, false);
- if (fd == -1) { return -1; }
-
- auto scheduled_op = std::make_shared<io_op_desc_t>(
- false, buffer, fd, filename, (num_write_bytes / _num_threads), validate);
-
- _schedule_aio_work(scheduled_op);
-
- if (async) { return 0; }
-
- return wait();
-}
-
-int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
-{
- return pread(buffer, filename, false, false);
-}
-
-int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
-{
- return pwrite(buffer, filename, false, false);
-}
-
-int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
-{
- return pread(buffer, filename, false, true);
-}
-
-int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
-{
- return pwrite(buffer, filename, false, true);
-}
-
-at::Tensor deepspeed_aio_handle_t::new_cpu_locked_tensor(const size_t num_elem,
- const torch::Tensor& example_tensor)
-{
- return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type());
-}
-
-bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor)
-{
- return _pinned_tensor_mgr->free(locked_tensor);
-}
+deepspeed_aio_handle_t::~deepspeed_aio_handle_t() {}
diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
index 3a254c3814a27..eb6b90ea22f03 100644
--- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h
+++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h
@@ -9,21 +9,9 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
#include <condition_variable>
#include <memory>
-#include "deepspeed_aio_thread.h"
-#include "deepspeed_pin_tensor.h"
-
-struct deepspeed_aio_handle_t {
- std::unique_ptr<struct aio_context> _aio_ctxt;
- const bool _single_submit;
- const bool _overlap_events;
- const int _num_threads;
- deepspeed_aio_config_t _aio_config;
-
- std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
- std::vector<std::thread> _threads;
- int _num_pending_ops;
- std::unique_ptr<struct deepspeed_pin_tensor_t> _pinned_tensor_mgr;
+#include "deepspeed_py_io_handle.h"
+struct deepspeed_aio_handle_t : deepspeed_io_handle_t {
deepspeed_aio_handle_t(const int block_size,
const int queue_depth,
const bool single_submit,
@@ -31,47 +19,4 @@ struct deepspeed_aio_handle_t {
const int num_threads);
~deepspeed_aio_handle_t();
-
- const int get_block_size() const;
- const int get_queue_depth() const;
- const bool get_single_submit() const;
- const bool get_overlap_events() const;
- const int get_thread_count() const;
-
- int read(torch::Tensor& buffer, const char* filename, const bool validate);
-
- int write(const torch::Tensor& buffer, const char* filename, const bool validate);
-
- int pread(const torch::Tensor& buffer,
- const char* filename,
- const bool validate,
- const bool async);
-
- int pwrite(const torch::Tensor& buffer,
- const char* filename,
- const bool validate,
- const bool async);
-
- int sync_pread(torch::Tensor& buffer, const char* filename);
-
- int sync_pwrite(const torch::Tensor& buffer, const char* filename);
-
- int async_pread(torch::Tensor& buffer, const char* filename);
-
- int async_pwrite(const torch::Tensor& buffer, const char* filename);
-
- // TODO: Make API's args to be shape and dtype.
- torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor);
-
- bool free_cpu_locked_tensor(torch::Tensor&);
-
- int wait();
-
- void _stop_threads();
-
- void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);
-
- std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();
-
- bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
};
diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp
index 8a59107dd3474..f5480e9d9d836 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.cpp
+++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp
@@ -4,13 +4,13 @@
// DeepSpeed Team
/*
-Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+Functionality for swapping tensors to/from (NVMe) storage devices.
*/
#include "deepspeed_py_copy.h"
#include <omp.h>
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
#if defined(__AVX512__) or defined(__AVX256__)
union AVX_Data {
diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h
index 19ba28317d008..f443571a3e7b3 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.h
+++ b/csrc/aio/py_lib/deepspeed_py_copy.h
@@ -4,9 +4,6 @@
// DeepSpeed Team
/*
-Copyright 2020 The Microsoft DeepSpeed Team
-Licensed under the MIT license.
-
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/
diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
new file mode 100644
index 0000000000000..bdf2a858d797a
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp
@@ -0,0 +1,300 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include "deepspeed_py_io_handle.h"
+#include <cstdlib>
+
+using namespace std;
+
+static void _start_aio_thread(std::shared_ptr<struct deepspeed_aio_thread_t> ctxt) { ctxt->run(); }
+
+deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size,
+ const int queue_depth,
+ const bool single_submit,
+ const bool overlap_events,
+ const int num_threads)
+ : _aio_ctxt(new aio_context(block_size, queue_depth)),
+ _single_submit(single_submit),
+ _overlap_events(overlap_events),
+ _num_threads(num_threads),
+ _aio_config(block_size, queue_depth, single_submit, overlap_events, false),
+ _num_pending_ops(0),
+ _pinned_tensor_mgr(new deepspeed_pin_tensor_t())
+{
+ for (auto i = 0; i < num_threads; ++i) {
+ _thread_contexts.push_back(std::make_shared<deepspeed_aio_thread_t>(i, _aio_config));
+ }
+
+ for (auto& ctxt : _thread_contexts) {
+ _threads.push_back(std::thread(_start_aio_thread, ctxt));
+ }
+}
+
+deepspeed_io_handle_t::~deepspeed_io_handle_t()
+{
+ _stop_threads();
+ for (auto& thr : _threads) { thr.join(); }
+}
+
+const int deepspeed_io_handle_t::get_block_size() const
+{
+ return _aio_ctxt ? _aio_ctxt->_block_size : -1;
+}
+
+const int deepspeed_io_handle_t::get_queue_depth() const
+{
+ return _aio_ctxt ? _aio_ctxt->_queue_depth : -1;
+}
+
+const bool deepspeed_io_handle_t::get_single_submit() const { return _single_submit; }
+
+const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_events; }
+
+const int deepspeed_io_handle_t::get_thread_count() const { return _num_threads; }
+
+int deepspeed_io_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate)
+{
+ const auto start_time = std::chrono::high_resolution_clock::now();
+
+ assert(_aio_ctxt);
+
+ long long num_file_bytes;
+ if (-1 == get_file_size(filename, num_file_bytes)) {
+ const auto error_code = errno;
+ report_file_error(filename, " fstat for read", error_code);
+ return -1;
+ }
+ assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
+
+ const auto fd = open_file(filename, true);
+ if (fd == -1) { return -1; }
+
+ auto read_buffer = (char*)buffer.data_ptr();
+ std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer));
+
+ if (_aio_config._overlap_events) {
+ do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
+ } else {
+ do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
+ }
+
+ close(fd);
+ const std::chrono::duration<double> aio_time =
+ std::chrono::high_resolution_clock::now() - start_time;
+
+ if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); }
+ const std::chrono::duration<double> fn_time =
+ std::chrono::high_resolution_clock::now() - start_time;
+ std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+ << " call = " << fn_time.count() * 1e6 << std::endl;
+ return 0;
+}
+
+int deepspeed_io_handle_t::write(const torch::Tensor& buffer,
+ const char* filename,
+ const bool validate)
+{
+ assert(_aio_ctxt);
+
+ const auto start_time = std::chrono::high_resolution_clock::now();
+
+ const auto fd = open_file(filename, false);
+ if (fd == -1) { return -1; }
+
+ auto write_buffer = (char*)buffer.data_ptr();
+ const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
+ std::unique_ptr<io_xfer_ctxt> xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer));
+
+ if (_aio_config._overlap_events) {
+ do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
+ } else {
+ do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr);
+ }
+ const std::chrono::duration<double> aio_time =
+ std::chrono::high_resolution_clock::now() - start_time;
+
+ close(fd);
+
+ if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); }
+
+ const std::chrono::duration<double> fn_time =
+ std::chrono::high_resolution_clock::now() - start_time;
+ std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6
+ << " call = " << fn_time.count() * 1e6 << std::endl;
+ return 0;
+}
+
+void deepspeed_io_handle_t::_schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op)
+{
+ for (auto& ctxt : _thread_contexts) {
+ {
+ std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
+ ctxt->_work_queue.push(scheduled_op);
+ }
+ ctxt->_work_sync._cond_var.notify_one();
+ }
+ _num_pending_ops++;
+}
+
+std::shared_ptr<struct io_op_desc_t> deepspeed_io_handle_t::_wait_for_aio_work()
+{
+ std::shared_ptr<struct io_op_desc_t> completed_op = nullptr;
+ for (auto& ctxt : _thread_contexts) {
+ std::unique_lock<std::mutex> lock(ctxt->_complete_sync._mutex);
+ ctxt->_complete_sync._cond_var.wait(lock,
+ [ctxt] { return !ctxt->_complete_queue.empty(); });
+ completed_op = ctxt->_complete_queue.front();
+ ctxt->_complete_queue.pop();
+ }
+ return completed_op;
+}
+
+void deepspeed_io_handle_t::_stop_threads()
+{
+ assert(0 == _num_pending_ops);
+ for (auto& ctxt : _thread_contexts) {
+ {
+ std::lock_guard<std::mutex> lock(ctxt->_work_sync._mutex);
+ ctxt->_time_to_exit = true;
+ }
+ ctxt->_work_sync._cond_var.notify_one();
+ }
+}
+
+int deepspeed_io_handle_t::wait()
+{
+ assert(_num_pending_ops > 0);
+ auto num_completed_ops = 0;
+
+ while (_num_pending_ops > 0) {
+ auto completed_op = _wait_for_aio_work();
+
+ if (completed_op->_validate) { completed_op->validate(); }
+
+ completed_op->finish();
+
+ close(completed_op->_fd);
+
+ --_num_pending_ops;
+ ++num_completed_ops;
+ }
+
+ return num_completed_ops;
+}
+
+bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op,
+ const long long int num_bytes)
+{
+ const auto op_string = read_op ? "Read" : "Write";
+ if (num_bytes % get_thread_count()) {
+ std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes
+ << " not divisible by thread count = " << get_thread_count() << std::endl;
+ return false;
+ }
+
+ return true;
+}
+
+std::shared_ptr<struct io_op_desc_t> deepspeed_io_handle_t::_create_io_op_desc(
+ const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const bool validate)
+{
+ return std::make_shared<cpu_op_desc_t>(
+ read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate);
+}
+
+int deepspeed_io_handle_t::pread(const torch::Tensor& buffer,
+ const char* filename,
+ const bool validate,
+ const bool async)
+{
+ long long num_file_bytes;
+ if (-1 == get_file_size(filename, num_file_bytes)) {
+ const auto error_code = errno;
+ report_file_error(filename, " fstat for read", error_code);
+ return -1;
+ }
+ const auto buffer_bytes = static_cast<long long int>(buffer.nbytes());
+ if (buffer_bytes != num_file_bytes) {
+ std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes
+ << " != " << num_file_bytes << std::endl;
+ }
+ assert(static_cast<long long int>(buffer.nbytes()) == num_file_bytes);
+ assert((num_file_bytes % _num_threads) == 0);
+
+ if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; }
+
+ const auto fd = open_file(filename, true);
+ if (fd == -1) { return -1; }
+
+ auto scheduled_op = _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate);
+
+ _schedule_aio_work(scheduled_op);
+
+ if (async) { return 0; }
+
+ return wait();
+}
+
+int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer,
+ const char* filename,
+ const bool validate,
+ const bool async)
+{
+ const auto num_write_bytes = static_cast<long long int>(buffer.nbytes());
+ assert((num_write_bytes % _num_threads) == 0);
+
+ if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; }
+
+ const auto fd = open_file(filename, false);
+ if (fd == -1) { return -1; }
+
+ auto scheduled_op = _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate);
+
+ _schedule_aio_work(scheduled_op);
+
+ if (async) { return 0; }
+
+ return wait();
+}
+
+int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer, const char* filename)
+{
+ return pread(buffer, filename, false, false);
+}
+
+int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename)
+{
+ return pwrite(buffer, filename, false, false);
+}
+
+int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer, const char* filename)
+{
+ return pread(buffer, filename, false, true);
+}
+
+int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename)
+{
+ return pwrite(buffer, filename, false, true);
+}
+
+at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const size_t num_elem,
+ const torch::Tensor& example_tensor)
+{
+ return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type());
+}
+
+bool deepspeed_io_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor)
+{
+ return _pinned_tensor_mgr->free(locked_tensor);
+}
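The base class above owns the thread pool plus per-thread work and completion queues. For readers following the refactor, here is a minimal Python sketch, not part of the patch, of the fan-out/fan-in pattern that `_schedule_aio_work`, `_wait_for_aio_work`, and `wait()` implement: one scheduled op is pushed to every worker thread, and an op only counts as complete after one entry has been popped from every worker's completion queue. The names `Worker` and `IoHandle` are illustrative only.

```python
# Sketch of the fan-out/fan-in scheduling pattern used by deepspeed_io_handle_t.
import queue
import threading

class Worker:
    def __init__(self, wid):
        self.wid = wid
        self.work_q = queue.Queue()
        self.done_q = queue.Queue()
        threading.Thread(target=self._run, daemon=True).start()

    def _run(self):
        while True:
            op = self.work_q.get()
            if op is None:                      # sentinel, mirrors _time_to_exit
                break
            op['results'][self.wid] = f"worker {self.wid} handled shard of {op['name']}"
            self.done_q.put(op)                 # completion queue

class IoHandle:
    def __init__(self, num_threads=4):
        self.workers = [Worker(i) for i in range(num_threads)]
        self.pending = 0

    def schedule(self, name):
        op = {'name': name, 'results': {}}
        for w in self.workers:                  # fan-out: same op to every worker
            w.work_q.put(op)
        self.pending += 1

    def wait(self):
        completed = 0
        while self.pending:
            for w in self.workers:              # fan-in: one completion per worker
                w.done_q.get()
            self.pending -= 1
            completed += 1
        return completed

handle = IoHandle(num_threads=4)
handle.schedule('pread(checkpoint.pt)')
print('completed ops:', handle.wait())
```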
diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h
new file mode 100644
index 0000000000000..2974ebe87bfc1
--- /dev/null
+++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h
@@ -0,0 +1,85 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include <condition_variable>
+#include <memory>
+#include "deepspeed_aio_thread.h"
+#include "deepspeed_pin_tensor.h"
+
+struct deepspeed_io_handle_t {
+ std::unique_ptr<struct aio_context> _aio_ctxt;
+ const bool _single_submit;
+ const bool _overlap_events;
+ const int _num_threads;
+ deepspeed_aio_config_t _aio_config;
+
+ std::vector<std::shared_ptr<struct deepspeed_aio_thread_t>> _thread_contexts;
+ std::vector<std::thread> _threads;
+ int _num_pending_ops;
+ std::unique_ptr<struct deepspeed_pin_tensor_t> _pinned_tensor_mgr;
+
+ deepspeed_io_handle_t(const int block_size,
+ const int queue_depth,
+ const bool single_submit,
+ const bool overlap_events,
+ const int num_threads);
+
+ virtual ~deepspeed_io_handle_t() = 0;
+
+ const int get_block_size() const;
+ const int get_queue_depth() const;
+ const bool get_single_submit() const;
+ const bool get_overlap_events() const;
+ const int get_thread_count() const;
+
+ int read(torch::Tensor& buffer, const char* filename, const bool validate);
+
+ int write(const torch::Tensor& buffer, const char* filename, const bool validate);
+
+ int pread(const torch::Tensor& buffer,
+ const char* filename,
+ const bool validate,
+ const bool async);
+
+ int pwrite(const torch::Tensor& buffer,
+ const char* filename,
+ const bool validate,
+ const bool async);
+
+ int sync_pread(torch::Tensor& buffer, const char* filename);
+
+ int sync_pwrite(const torch::Tensor& buffer, const char* filename);
+
+ int async_pread(torch::Tensor& buffer, const char* filename);
+
+ int async_pwrite(const torch::Tensor& buffer, const char* filename);
+
+ // TODO: Make API's args to be shape and dtype.
+ torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor);
+
+ bool free_cpu_locked_tensor(torch::Tensor&);
+
+ int wait();
+
+ void _stop_threads();
+
+ void _schedule_aio_work(std::shared_ptr<struct io_op_desc_t> scheduled_op);
+
+ std::shared_ptr<struct io_op_desc_t> _wait_for_aio_work();
+
+ bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes);
+
+ virtual std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(
+ const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const bool validate);
+};
diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp
old mode 100755
new mode 100644
index 9033549bc0d2e..3171d0c6bf3c2
--- a/csrc/aio/py_lib/py_ds_aio.cpp
+++ b/csrc/aio/py_lib/py_ds_aio.cpp
@@ -10,6 +10,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
#include <torch/extension.h>
#include "deepspeed_py_aio_handle.h"
#include "deepspeed_py_copy.h"
+using namespace pybind11::literals;
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
@@ -20,7 +21,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy");
py::class_<deepspeed_aio_handle_t>(m, "aio_handle")
- .def(py::init<const int, const int, const bool, const bool, const int>())
+ .def(py::init<const int, const int, const bool, const bool, const int>(),
+ "AIO handle constructor",
+ "block_size"_a = 1024 * 1024,
+ "queue_depth"_a = 128,
+ "single_submit"_a = false,
+ "overlap_events"_a = false,
+ "num_threads"_a = 1)
.def("get_block_size", &deepspeed_aio_handle_t::get_block_size)
.def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth)
@@ -28,19 +35,74 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events)
.def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count)
- .def("read", &deepspeed_aio_handle_t::read)
- .def("write", &deepspeed_aio_handle_t::write)
+ .def("read",
+ &deepspeed_aio_handle_t::read,
+ "Synchronous and non-parallel file read. Returns count of completed read ops",
+ "buffer"_a,
+ "filename"_a,
+ "validate"_a)
- .def("pread", &deepspeed_aio_handle_t::pread)
- .def("pwrite", &deepspeed_aio_handle_t::pwrite)
+ .def("write",
+ &deepspeed_aio_handle_t::write,
+ "Synchronous and non-parallel file write. Returns count of completed write ops",
+ "buffer"_a,
+ "filename"_a,
+ "validate"_a)
- .def("sync_pread", &deepspeed_aio_handle_t::sync_pread)
- .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite)
- .def("async_pread", &deepspeed_aio_handle_t::async_pread)
- .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite)
+ .def("pread",
+ &deepspeed_aio_handle_t::pread,
+ "Parallel file read with option of parallelism. Returns count of completed read ops",
+ "buffer"_a,
+ "filename"_a,
+ "validate"_a,
+ "async"_a)
- .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor)
- .def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor)
+ .def("pwrite",
+ &deepspeed_aio_handle_t::pwrite,
+ "Parallel file write with option of parallelism. Returns count of completed write ops",
+ "buffer"_a,
+ "filename"_a,
+ "validate"_a,
+ "async"_a)
- .def("wait", &deepspeed_aio_handle_t::wait);
+ .def("sync_pread",
+ &deepspeed_aio_handle_t::sync_pread,
+ "Synchrononous parallel file read. Returns count of completed read ops",
+ "buffer"_a,
+ "filename"_a)
+
+ .def("sync_pwrite",
+ &deepspeed_aio_handle_t::sync_pwrite,
+ "Synchronous parallel file write. Returns count of completed write ops",
+ "buffer"_a,
+ "filename"_a)
+
+ .def("async_pread",
+ &deepspeed_aio_handle_t::async_pread,
+ "Asynchronous parallel file read. Returns 0 on success. Returns 0 on success, and "
+ "following wait() returns count of completed ops.",
+ "buffer"_a,
+ "filename"_a)
+
+ .def("async_pwrite",
+ &deepspeed_aio_handle_t::async_pwrite,
+ "Asynchronous parallel file write. Returns 0 on success, and following wait() returns "
+ "count of completed ops.",
+ "buffer"_a,
+ "filename"_a)
+
+ .def("new_cpu_locked_tensor",
+ &deepspeed_aio_handle_t::new_cpu_locked_tensor,
+ "Allocate pinned CPU tensor.",
+ "num_elem"_a,
+ "example_tenosr"_a)
+
+ .def("free_cpu_locked_tensor",
+ &deepspeed_aio_handle_t::free_cpu_locked_tensor,
+ "Free pinned CPU tensor.",
+ "tensor"_a)
+
+ .def("wait",
+ &deepspeed_aio_handle_t::wait,
+ "Wait for (ongoing) asynchronous operations to complete");
}
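With the bindings above, the handle is now constructible from Python with keyword arguments and the documented defaults. A hedged usage sketch follows; it assumes the `async_io` op builds on the target machine and that the file lives on a filesystem that supports O_DIRECT (the path below is made up for illustration).

```python
# Usage sketch of the rebound aio_handle API; keyword names and defaults
# come from the pybind11 definitions in this file.
import torch
from deepspeed.ops.op_builder import AsyncIOBuilder

aio = AsyncIOBuilder().load()
handle = aio.aio_handle(block_size=1024 * 1024, queue_depth=128,
                        single_submit=False, overlap_events=False, num_threads=1)

# Pinned CPU buffer of 1 MiB uint8 elements.
buffer = handle.new_cpu_locked_tensor(1024 * 1024, torch.empty(0, dtype=torch.uint8))

# Asynchronous parallel write; path is hypothetical and must support O_DIRECT.
assert handle.async_pwrite(buffer, '/mnt/nvme0/ds_aio_demo.pt') == 0
print('completed ops:', handle.wait())   # wait() returns the count of completed ops
handle.free_cpu_locked_tensor(buffer)
```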
diff --git a/csrc/aio/py_test/aio_bench_generate_param.py b/csrc/aio/py_test/aio_bench_generate_param.py
index 09d0e03c7ef67..7a0ab59ed73d1 100644
--- a/csrc/aio/py_test/aio_bench_generate_param.py
+++ b/csrc/aio/py_test/aio_bench_generate_param.py
@@ -41,9 +41,9 @@ def convert_to_param(key):
return {
"single_submit": "true" if key[0] == "single" else "false",
"overlap_events": "true" if key[1] == "overlap" else "false",
- "thread_count": int(key[3]),
- "queue_depth": int(key[4]),
- "block_size": int(key[5])
+ "thread_count": int(key[5]),
+ "queue_depth": int(key[3]),
+ "block_size": int(key[4])
}
diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py
index 7d55f7ded65c2..ba95150b11e19 100644
--- a/csrc/aio/py_test/aio_bench_perf_sweep.py
+++ b/csrc/aio/py_test/aio_bench_perf_sweep.py
@@ -10,75 +10,47 @@
import argparse
import json
import itertools
-import subprocess
import shutil
-from test_ds_aio_utils import refine_integer_value
+from ds_aio_job import Job, run_job
from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \
- READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR
+ READ_LOG_DIR, WRITE_LOG_DIR
from deepspeed.ops.op_builder import AsyncIOBuilder
OTHER_OPTIONS = '--handle'
PERF_SCRIPT = 'test_ds_aio.py'
DEFAULT_SWEEP_CONFIG = {
- "block_size": ["128K", "256K"],
- "queue_depth": [4, 16, 32],
- "overlap_events": [True, False],
- "io_parallel": [2, 8],
- "single_submit": [False]
+ "block_size": ["128K", "1M"],
+ "queue_depth": [32, 64, 128],
+ "sequential_requests": [True, False],
+ "single_submit": [False],
+ "io_parallel": [1, 2, 8],
}
-class Job(object):
-
- def __init__(self, cmd_line, output_file=None, work_dir=None):
- self.cmd_line = cmd_line
- self.output_file = output_file
- self.work_dir = work_dir
- self.output_fd = None
-
- def cmd(self):
- return self.cmd_line
-
- def get_stdout(self):
- return self.output_fd
-
- def get_stderr(self):
- return self.output_fd
-
- def get_cwd(self):
- return self.work_dir
-
- def open_output_file(self):
- if self.output_file is not None:
- self.output_fd = open(self.output_file, 'w')
-
- def close_output_file(self):
- if self.output_fd is not None:
- self.output_fd.close()
- self.output_fd = None
-
-
class SweepConfig(object):
def __init__(self, args):
- self.nvme_dir = args.nvme_dir
- self.io_size = args.io_size
+ self.folder_to_device_mapping = get_ftd_map(args.nvme_dir)
self.search_space = get_sweep_config_dict(args.sweep_config)
+ self.search_space.update(self.folder_to_device_mapping)
self.read = not args.no_read
self.write = not args.no_write
self.flush_cache = not args.no_sudo
self.log_dir = args.log_dir
- self.loops = args.loops
- self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}'
+ self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}'
+ if args.gpu:
+ self.other_options += ' --gpu'
+ if args.gds:
+ self.other_options += ' --use_gds'
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--nvme_dir',
+ nargs='+',
required=True,
- type=str,
help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.')
parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.')
@@ -92,6 +64,10 @@ def parse_arguments():
default="400M",
help='Number of I/O bytes to read/write for performance measurements.')
+ parser.add_argument('--gpu', action='store_true', help='Test tensor transfers between GPU device and NVME device.')
+
+ parser.add_argument('--gds', action='store_true', help='Run the sweep over NVIDIA GPUDirectStorage operator')
+
parser.add_argument(
'--no_sudo',
action='store_true',
@@ -118,6 +94,12 @@ def dump_cmd_lines(cmd_lines):
print(f'{i}: {cmd}')
+def get_ftd_map(nvme_dir_list):
+ ftd_list = [f'{dir}:{dev}' for dev, dir in enumerate(nvme_dir_list)]
+ ftd_arg = [' '.join(ftd for ftd in ftd_list)]
+ return {'folder_to_device_mapping': ftd_arg}
+
+
def get_sweep_config_dict(sweep_config_json):
if sweep_config_json is None:
return DEFAULT_SWEEP_CONFIG
@@ -148,16 +130,6 @@ def flatten_options(key, value_list):
return cmd_list
-def run_job(job):
- args = ' '.join(job.cmd())
- print(f'args = {args}')
- job.open_output_file()
- proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
- job.close_output_file()
- assert proc.returncode == 0, \
- f"This command failed: {job.cmd()}"
-
-
def launch_sweep(sweep_jobs, sync_job, flush_cache_job):
for perf_job in sweep_jobs:
if flush_cache_job is not None:
@@ -176,7 +148,12 @@ def create_cmd_tags(cmd_line):
if len(fields) == 1:
tags[fields[0]] = None
elif len(fields) == 2:
- tags[fields[0]] = fields[1]
+ if fields[0] == '--folder_to_device_mapping':
+ tags[fields[0]] = len(fields[1:])
+ else:
+ tags[fields[0]] = fields[1]
+ elif len(fields) > 2:
+ tags[fields[0]] = len(fields[1:])
return tags
@@ -184,16 +161,16 @@ def get_log_file(io_op_desc, cmd_line):
QUEUE_DEPTH = "--queue_depth"
BLOCK_SIZE = "--block_size"
SINGLE_SUBMIT = "--single_submit"
- OVERLAP_EVENTS = "--overlap_events"
- THREAD_COUNT = "--threads"
+ SEQUENTIAL_REQUESTS = "--sequential_requests"
+ FTD_MAP = "--folder_to_device_mapping"
IO_PARALLEL = "--io_parallel"
tag_map = {
QUEUE_DEPTH: "d",
BLOCK_SIZE: "bs",
SINGLE_SUBMIT: "single",
- OVERLAP_EVENTS: "overlap",
- THREAD_COUNT: "t",
+ SEQUENTIAL_REQUESTS: "sequential",
+ FTD_MAP: "ftd",
IO_PARALLEL: "p"
}
@@ -201,14 +178,14 @@ def get_log_file(io_op_desc, cmd_line):
QUEUE_DEPTH: 1,
BLOCK_SIZE: "1M",
SINGLE_SUBMIT: "block",
- OVERLAP_EVENTS: "sequential",
- THREAD_COUNT: 1,
+ SEQUENTIAL_REQUESTS: "overlap",
+ FTD_MAP: 1,
IO_PARALLEL: 1
}
def get_default_value(tag):
value = tag_default[tag]
- if tag in [SINGLE_SUBMIT, OVERLAP_EVENTS]:
+ if tag in [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS]:
return value
return f'{tag_map[tag]}{value}'
@@ -218,7 +195,7 @@ def get_config_value(tag, value):
return tag_key
return f'{tag_key}{value}'
- tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE]
+ tag_list = [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS, FTD_MAP, QUEUE_DEPTH, BLOCK_SIZE, IO_PARALLEL]
log_tags = [io_op_desc]
cmd_tags = create_cmd_tags(cmd_line)
for tag in tag_list:
@@ -252,40 +229,14 @@ def async_io_setup():
return AsyncIOBuilder().is_compatible()
-def get_block_size_and_count(io_bytes):
- block_size = 1
- block_count = io_bytes
- bytes_in_KB = 1024
-
- while block_count % bytes_in_KB == 0:
- block_size *= bytes_in_KB
- block_count /= bytes_in_KB
-
- return int(block_size), int(block_count)
-
-
-def create_read_file(sweep_config):
- read_folder = os.path.join(sweep_config.nvme_dir, f'{READ_IO_DIR}')
- os.makedirs(read_folder, exist_ok=True)
- read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt')
- block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size))
- dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}'])
- print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
- run_job(dd_job)
- print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....')
- return read_folder, read_file_name
-
-
def remove_folder(folder):
assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found"
shutil.rmtree(folder)
def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
- read_folder, read_file_name = create_read_file(sweep_config)
- read_option = f'--read_file {read_file_name}'
- read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
- #dump_cmd_lines(read_cmd_lines)
+ read_cmd_lines = [[f'--read {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
+ #dump_cmd_lines(cmd_lines)
log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
os.makedirs(log_folder, exist_ok=True)
@@ -294,15 +245,9 @@ def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)
- remove_folder(read_folder)
-
def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
- write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}')
- os.makedirs(write_folder, exist_ok=True)
- write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt')
- write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}'
- write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
+ write_cmd_lines = [[f'{sweep_config.other_options}'] + cmd for cmd in cmd_lines]
#dump_cmd_lines(write_cmd_lines)
log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
@@ -312,8 +257,6 @@ def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job)
- remove_folder(write_folder)
-
def main():
print("Running performance sweep of deepspeed nvme library")
diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py
new file mode 100644
index 0000000000000..346feabe48109
--- /dev/null
+++ b/csrc/aio/py_test/ds_aio_args.py
@@ -0,0 +1,175 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
+"""
+
+import argparse
+import os
+from test_ds_aio_utils import refine_integer_value
+from deepspeed.accelerator import get_accelerator
+
+MAPPING_DELIMITER = ':'
+
+
+def refine_args(args):
+ if args.io_size and type(args.io_size) == str:
+ args.io_size = refine_integer_value(args.io_size)
+
+ if args.block_size and type(args.block_size) == str:
+ args.block_size = refine_integer_value(args.block_size)
+
+ return args
+
+
+def _get_mapping_dict(args):
+ if args.folder is not None:
+ d = {i: args.folder for i in range(args.multi_process)}
+ else:
+ d = {}
+ for m in args.folder_to_device_mapping:
+ fields = m.split(MAPPING_DELIMITER)
+ d[fields[1]] = fields[0]
+
+ return d
+
+
+def _validate_folder_mapping(args):
+ no_error = True
+ error_messages = []
+ invalid_mappings = [m for m in args.folder_to_device_mapping if MAPPING_DELIMITER not in m]
+ if len(invalid_mappings) > 0:
+ error_messages.append(
+ f'Missing delimiter ({MAPPING_DELIMITER}) in folder_to_device_mapping {invalid_mappings}')
+ no_error = False
+
+ folder_list = [m.split(MAPPING_DELIMITER)[0] for m in args.folder_to_device_mapping]
+ invalid_folders = [d for d in folder_list if not os.path.exists(d)]
+ if len(invalid_folders) > 0:
+ error_messages.append(f'Invalid folders in folder_to_device_mapping: {invalid_folders}')
+ no_error = False
+
+ if args.gpu:
+ device_list = [int(m.split(MAPPING_DELIMITER)[1]) for m in args.folder_to_device_mapping]
+ invalid_device_list = [dev_id for dev_id in device_list if not dev_id < get_accelerator().device_count()]
+ if len(invalid_device_list) > 0:
+ error_messages.append(f'Invalid device ids in folder_to_device_mapping: {invalid_device_list}')
+ no_error = False
+
+ return no_error, error_messages
+
+
+def validate_args(args):
+ no_error = True
+ error_messages = []
+
+ if args.folder is not None and len(args.folder_to_device_mapping) > 0:
+ error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.')
+ no_error = False
+ elif args.folder is None and len(args.folder_to_device_mapping) == 0:
+ error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.')
+ no_error = False
+
+ # Validate --folder
+ if args.folder is not None and not os.path.exists(args.folder):
+ no_error = False
+ error_messages.append(f'Invalid folder in --folder: {args.folder} ')
+
+ # Validate --folder_mapping_to_device
+ if len(args.folder_to_device_mapping) > 0:
+ no_mapping_error, mapping_error_messages = _validate_folder_mapping(args)
+ no_error = no_error and no_mapping_error
+ error_messages += mapping_error_messages
+
+ # Validate --gpu, --use_gds
+ if args.use_gds and not args.gpu:
+ error_messages.append(f'--gpu must be set to transfer with --use_gds')
+ no_error = False
+
+ if not no_error:
+ print(f'Found {len(error_messages)} validation errors')
+ for i, msg in enumerate(error_messages):
+ print(f'{i+1}: {msg}')
+
+ return no_error
+
+
+def parse_arguments():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('--folder', default=None, type=str, help='Folder to use for I/O.')
+
+ parser.add_argument('--folder_to_device_mapping',
+ default=[],
+ nargs='+',
+ help='Specification of mapping of folder to (gpu) device id (ignored for cpu accesses). '
+ 'Can be specified multiple times for multi-process runs, '
+ 'e.g. --folder_to_device_mapping /mnt/nvme0:0 --folder_to_device_mapping /mnt/nvme1:15 --gpu '
+ 'means access /mnt/nvme0 with gpu 0 and /mnt/nvme1 with gpu 15')
+
+ parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.')
+
+ parser.add_argument('--read', action='store_true', help='Perform read I/O (default is write)')
+
+ parser.add_argument('--multi_process',
+ type=int,
+ default=1,
+ help='Number of parallel processes doing I/O (default 1).')
+
+ parser.add_argument('--block_size',
+ type=str,
+ default='1M',
+ help='I/O block size. Can use K, M, or G suffix (default 1M for 1 megabyte).')
+
+ parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth (default 32).')
+
+ parser.add_argument('--single_submit',
+ action='store_true',
+ help='Submit I/O requests one at a time (default is to submit queue_depth requests at once).')
+
+ parser.add_argument(
+ '--sequential_requests',
+ action='store_true',
+ help=
+ 'Delay I/O request submission until prior requests complete (default is to overlap I/O submission and completion).'
+ )
+
+ parser.add_argument('--validate', action='store_true', help='Perform validation of I/O transfer in library.')
+
+ parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
+
+ parser.add_argument('--loops', type=int, default=3, help='Count of operation repetitions')
+
+ parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')
+
+ parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
+
+ parser.add_argument('--use_gds', action='store_true', help='Enable GDS AIO')
+
+ parser.add_argument('--slow_bounce_buffer',
+ action='store_true',
+ help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.')
+
+ args = parser.parse_args()
+ print(f'args = {args}')
+ return args
+
+
+def get_validated_args():
+ args = parse_arguments()
+ args = refine_args(args)
+ if not validate_args(args):
+ quit()
+ print(f'Successful validation of command line arguments')
+
+ peer_tag = 'gpu' if args.gpu else 'process'
+ args.mapping_dict = _get_mapping_dict(args)
+ args.mapping_list = [(device_id, folder) for device_id, folder in args.mapping_dict.items()]
+ assert len(args.mapping_dict) == len(args.mapping_list)
+ print(f'Configuring {len(args.mapping_list)} {peer_tag} to folder mapping')
+ for i, (device_id, folder) in enumerate(args.mapping_list):
+ print(f'[{i}]: {peer_tag} {device_id} <----> {folder}')
+
+ return args
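To make the two mapping modes concrete, here is a self-contained sketch of `_get_mapping_dict` driven by fake argument objects (the mount points are assumptions): `--folder` maps every process id to the same folder, while `--folder_to_device_mapping` turns each `folder:device` pair into one entry keyed by the device field.

```python
# Sketch of how get_validated_args() derives the process/device-to-folder mapping.
from types import SimpleNamespace

MAPPING_DELIMITER = ':'

def _get_mapping_dict(args):                     # same logic as in ds_aio_args.py above
    if args.folder is not None:
        return {i: args.folder for i in range(args.multi_process)}
    d = {}
    for m in args.folder_to_device_mapping:
        fields = m.split(MAPPING_DELIMITER)
        d[fields[1]] = fields[0]
    return d

gpu_args = SimpleNamespace(folder=None, multi_process=1,
                           folder_to_device_mapping=['/mnt/nvme0:0', '/mnt/nvme1:1'])
print(_get_mapping_dict(gpu_args))               # {'0': '/mnt/nvme0', '1': '/mnt/nvme1'}

cpu_args = SimpleNamespace(folder='/mnt/nvme0', multi_process=2,
                           folder_to_device_mapping=[])
print(_get_mapping_dict(cpu_args))               # {0: '/mnt/nvme0', 1: '/mnt/nvme0'}
```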
diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py
index ad2a4349cd0c0..9b3c7cbfc49f9 100755
--- a/csrc/aio/py_test/ds_aio_basic.py
+++ b/csrc/aio/py_test/ds_aio_basic.py
@@ -9,10 +9,9 @@
import torch
import os
import time
+from deepspeed.ops.aio import AsyncIOBuilder
from multiprocessing import Pool, Barrier
from test_ds_aio_utils import report_results, task_log, task_barrier
-from deepspeed.accelerator import get_accelerator
-from deepspeed.ops.op_builder import AsyncIOBuilder
def pre_basic(args, tid, read_op):
@@ -21,7 +20,7 @@ def pre_basic(args, tid, read_op):
file = args.read_file if read_op else f'{args.write_file}.{tid}'
task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
- buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu'))
+ buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()
task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
ctxt = {}
@@ -56,7 +55,7 @@ def main_basic_read(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
- args.single_submit, args.overlap_events, args.validate)
+ args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
@@ -67,7 +66,7 @@ def main_basic_write(pool_params):
args, tid, ctxt = pool_params
start_time = time.time()
AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth,
- args.single_submit, args.overlap_events, args.validate)
+ args.single_submit, not args.sequential_requests, args.validate)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
@@ -90,16 +89,17 @@ def get_schedule(args, read_op):
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
+ num_processes = len(args.mapping_dict)
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
- task_barrier(aio_barrier, args.threads)
+ task_barrier(aio_barrier, num_processes)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
- task_barrier(aio_barrier, args.threads)
+ task_barrier(aio_barrier, num_processes)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
@@ -107,14 +107,14 @@ def _aio_handle_tasklet(pool_params):
task_log(tid, f'running main task {i}')
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
- task_barrier(aio_barrier, args.threads)
+ task_barrier(aio_barrier, num_processes)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
- task_barrier(aio_barrier, args.threads)
+ task_barrier(aio_barrier, num_processes)
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
@@ -125,9 +125,10 @@ def _init_tasklet(b):
def aio_basic_multiprocessing(args, read_op):
- b = Barrier(args.threads)
- pool_params = [(args, p, read_op) for p in range(args.threads)]
- with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
+ num_processes = len(args.mapping_dict)
+ b = Barrier(num_processes)
+ pool_params = [(args, p, read_op) for p in range(num_processes)]
+ with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)
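The basic (non-handle) path now passes `not args.sequential_requests` where it used to pass `args.overlap_events`. A minimal sketch of that call, with an assumed file path and sizes, using the same `deepspeed.ops.aio.AsyncIOBuilder` import as above:

```python
# Sketch of the basic blocking read path; the file path is an assumption and the
# file must already exist with a size equal to the buffer size.
import torch
from deepspeed.ops.aio import AsyncIOBuilder

aio = AsyncIOBuilder().load()
buffer = torch.empty(1024 * 1024, dtype=torch.uint8, device='cpu').pin_memory()
sequential_requests = False
aio.aio_read(buffer, '/mnt/nvme0/random_1M.pt',
             1024 * 1024,               # block_size
             32,                        # queue_depth
             False,                     # single_submit
             not sequential_requests,   # overlap_events (note the flag inversion)
             False)                     # validate
```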
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py
index d35b2713edae8..f4a179deb9ec6 100755
--- a/csrc/aio/py_test/ds_aio_handle.py
+++ b/csrc/aio/py_test/ds_aio_handle.py
@@ -10,40 +10,56 @@
import os
import time
from multiprocessing import Pool, Barrier
-from test_ds_aio_utils import report_results, task_log, task_barrier
+from deepspeed.ops.aio import AsyncIOBuilder
+from deepspeed.ops.op_builder import GDSBuilder
+from test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file
from deepspeed.accelerator import get_accelerator
-from deepspeed.ops.op_builder import AsyncIOBuilder
+
+BUFFER = 'buffer'
+BOUNCE_BUFFER = 'bounce_buffer'
def pre_handle(args, tid, read_op):
io_string = "Read" if read_op else "Write"
- num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
- file = args.read_file if read_op else f'{args.write_file}.{tid}'
-
- io_parallel = args.io_parallel if args.io_parallel else 1
- handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
- args.overlap_events, io_parallel)
- task_log(tid, f'Created deepspeed aio handle')
-
+ gds = True if args.use_gds else False
+ device_id, folder = args.mapping_list[tid]
+ filename = create_filename(folder, args.read, args.io_size, tid)
+ if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size):
+ create_file(filename, args.io_size)
+
+ task_log(tid, f'Allocate tensor of size {args.io_size} bytes')
+ bounce_buffer = None
if args.gpu:
- buffer = torch.empty(num_bytes, dtype=torch.uint8, device=get_accelerator().device_name())
+ device_name = get_accelerator().device_name(device_id)
+ buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name)
+ if not (args.slow_bounce_buffer or gds):
+ bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8,
+ device='cpu').pin_memory()
else:
- if args.use_accelerator_pin_memory:
- buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu'))
- else:
- buffer = handle.new_cpu_locked_tensor(num_bytes, torch.empty(0, dtype=torch.uint8))
+ buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory()
+ task_log(tid,
+ f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}',
+ force=True)
- task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
+ io_parallel = args.io_parallel if args.io_parallel else 1
+ if gds:
+ handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit,
+ not args.sequential_requests, io_parallel)
+ handle.pin_device_tensor(buffer)
+ else:
+ handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit,
+ not args.sequential_requests, io_parallel)
+ task_log(tid, f'created deepspeed aio handle')
ctxt = {}
- ctxt['file'] = file
- ctxt['num_bytes'] = num_bytes
+ ctxt['file'] = filename
+ ctxt['num_bytes'] = args.io_size
ctxt['handle'] = handle
- ctxt['buffer'] = buffer
+ ctxt['gds'] = gds
+ ctxt[BUFFER] = buffer
+ ctxt[BOUNCE_BUFFER] = bounce_buffer
ctxt['elapsed_sec'] = 0
- task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}')
-
return ctxt
@@ -61,8 +77,12 @@ def pre_handle_write(pool_params):
def post_handle(pool_params):
_, _, ctxt = pool_params
- ctxt["buffer"].detach()
- ctxt["buffer"] = None
+ for buf in [BUFFER, BOUNCE_BUFFER]:
+ if ctxt[buf] is not None:
+ if ctxt['gds']:
+ ctxt['handle'].unpin_device_tensor(ctxt[buf])
+ ctxt[buf].detach()
+ ctxt[buf] = None
return ctxt
@@ -71,20 +91,31 @@ def main_parallel_read(pool_params):
handle = ctxt['handle']
start_time = time.time()
- ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True)
+ dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER
+ ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True)
assert ret != -1
handle.wait()
+ if dest_buffer == BOUNCE_BUFFER:
+ ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
-
return ctxt
def main_parallel_write(pool_params):
args, tid, ctxt = pool_params
+ # Avoid overwriting existing files as it could be artificially faster
+ if os.path.isfile(ctxt['file']):
+ os.remove(ctxt['file'])
+
handle = ctxt['handle']
start_time = time.time()
- ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True)
+ if ctxt[BOUNCE_BUFFER] is not None:
+ source_buffer = BOUNCE_BUFFER
+ ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data)
+ else:
+ source_buffer = BUFFER
+ ret = handle.pwrite(ctxt[source_buffer], ctxt['file'], args.validate, True)
assert ret != -1
handle.wait()
end_time = time.time()
@@ -98,8 +129,11 @@ def main_handle_read(pool_parms):
handle = ctxt['handle']
start_time = time.time()
- ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate)
+ dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER
+ ret = handle.read(ctxt[dest_buffer], ctxt['file'], args.validate)
assert ret != -1
+ if dest_buffer == BOUNCE_BUFFER:
+ ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data)
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
@@ -108,9 +142,18 @@ def main_handle_read(pool_parms):
def main_handle_write(pool_parms):
args, tid, ctxt = pool_parms
+ # Avoid overwriting existing files as it could be artificially faster
+ if os.path.isfile(ctxt['file']):
+ os.remove(ctxt['file'])
+
handle = ctxt['handle']
start_time = time.time()
- ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate)
+ if ctxt[BOUNCE_BUFFER] is not None:
+ source_buffer = BOUNCE_BUFFER
+ ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data)
+ else:
+ source_buffer = BUFFER
+ ret = handle.write(ctxt[source_buffer], ctxt['file'], args.validate)
assert ret != -1
end_time = time.time()
ctxt['elapsed_sec'] += end_time - start_time
@@ -123,27 +166,28 @@ def get_schedule(args, read_op):
if read_op:
schedule['pre'] = pre_handle_read
schedule['post'] = post_handle
- schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read
+ schedule['main'] = main_parallel_read
else:
schedule['pre'] = pre_handle_write
schedule['post'] = post_handle
- schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write
+ schedule['main'] = main_parallel_write
return schedule
def _aio_handle_tasklet(pool_params):
args, tid, read_op = pool_params
+ num_processes = len(args.mapping_dict)
# Create schedule
schedule = get_schedule(args, read_op)
task_log(tid, f'schedule = {schedule}')
- task_barrier(aio_barrier, args.threads)
+ task_barrier(aio_barrier, num_processes)
# Run pre task
task_log(tid, f'running pre-task')
ctxt = schedule["pre"]((args, tid))
- task_barrier(aio_barrier, args.threads)
+ task_barrier(aio_barrier, num_processes)
# Run main tasks in a loop
ctxt["main_task_sec"] = 0
@@ -151,14 +195,14 @@ def _aio_handle_tasklet(pool_params):
task_log(tid, f'running main task {i}')
start_time = time.time()
ctxt = schedule["main"]((args, tid, ctxt))
- task_barrier(aio_barrier, args.threads)
+ task_barrier(aio_barrier, num_processes)
stop_time = time.time()
ctxt["main_task_sec"] += stop_time - start_time
# Run post task
task_log(tid, f'running post-task')
ctxt = schedule["post"]((args, tid, ctxt))
- task_barrier(aio_barrier, args.threads)
+ task_barrier(aio_barrier, num_processes)
return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops
@@ -169,9 +213,10 @@ def _init_tasklet(b):
def aio_handle_multiprocessing(args, read_op):
- b = Barrier(args.threads)
- pool_params = [(args, p, read_op) for p in range(args.threads)]
- with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p:
+ num_processes = len(args.mapping_dict)
+ b = Barrier(num_processes)
+ pool_params = [(args, p, read_op) for p in range(num_processes)]
+ with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p:
pool_results = p.map(_aio_handle_tasklet, pool_params)
report_results(args, read_op, pool_results)
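For the GPU path above, data is staged through a pinned CPU bounce buffer unless GDS is used. A condensed, hedged sketch of that write flow follows; the sizes and NVMe path are assumptions, and the positional handle arguments mirror the call in `pre_handle`.

```python
# Sketch of the bounce-buffer write flow: accelerator tensor -> pinned CPU
# staging buffer -> asynchronous pwrite -> wait.
import torch
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.aio import AsyncIOBuilder

io_size = 1024 * 1024
device = get_accelerator().device_name(0)
gpu_buffer = torch.randint(high=128, size=(io_size, ), dtype=torch.uint8, device=device)
bounce_buffer = torch.randint(high=128, size=(io_size, ), dtype=torch.uint8,
                              device='cpu').pin_memory()

# (block_size, queue_depth, single_submit, overlap_events, io_parallel)
handle = AsyncIOBuilder().load().aio_handle(1024 * 1024, 32, False, True, 1)

bounce_buffer.data.copy_(gpu_buffer.data)        # GPU -> pinned CPU staging
ret = handle.pwrite(bounce_buffer, '/mnt/nvme0/_aio_write_1048576.pt.0',
                    False, True)                 # validate=False, async=True
assert ret != -1
handle.wait()
```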
diff --git a/csrc/aio/py_test/ds_aio_job.py b/csrc/aio/py_test/ds_aio_job.py
new file mode 100644
index 0000000000000..bbddee1bf26da
--- /dev/null
+++ b/csrc/aio/py_test/ds_aio_job.py
@@ -0,0 +1,48 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+Functionality of swapping tensors to/from (NVMe) storage devices.
+"""
+import subprocess
+
+
+class Job(object):
+
+ def __init__(self, cmd_line, output_file=None, work_dir=None):
+ self.cmd_line = cmd_line
+ self.output_file = output_file
+ self.work_dir = work_dir
+ self.output_fd = None
+
+ def cmd(self):
+ return self.cmd_line
+
+ def get_stdout(self):
+ return self.output_fd
+
+ def get_stderr(self):
+ return self.output_fd
+
+ def get_cwd(self):
+ return self.work_dir
+
+ def open_output_file(self):
+ if self.output_file is not None:
+ self.output_fd = open(self.output_file, 'w')
+
+ def close_output_file(self):
+ if self.output_fd is not None:
+ self.output_fd.close()
+ self.output_fd = None
+
+
+def run_job(job):
+ args = ' '.join(job.cmd())
+ print(f'args = {args}')
+ job.open_output_file()
+ proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd())
+ job.close_output_file()
+ assert proc.returncode == 0, \
+ f"This command failed: {job.cmd()}"
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh
index b9d7e050454a8..59d82996a0e24 100755
--- a/csrc/aio/py_test/run_read_sweep.sh
+++ b/csrc/aio/py_test/run_read_sweep.sh
@@ -1,13 +1,22 @@
#!/bin/bash
-if [[ $# -ne 2 ]]; then
- echo "Usage: $0 "
+if [[ $# -lt 2 ]]; then
+ echo "Usage: $0 "
exit 1
fi
+function prep_folder()
+{
+ folder=$1
+ if [[ -d ${folder} ]]; then
+ rm -f ${folder}/*
+ else
+ mkdir -p ${folder}
+ fi
+}
function validate_environment()
{
- validate_cmd="python ./validate_async_io.py"
+ validate_cmd="TORCH_EXTENSIONS_DIR=./torch_extentions python3 ./validate_async_io.py"
eval ${validate_cmd}
res=$?
if [[ $res != 0 ]]; then
@@ -17,18 +26,27 @@ function validate_environment()
fi
}
+function fileExists() {
+ local file="$1"
+ if [[ -f "$file" ]]; then
+ return 0
+ else
+ return 1
+ fi
+}
validate_environment
-INPUT_FILE=$1
-if [[ ! -f ${INPUT_FILE} ]]; then
- echo "Input file not found: ${INPUT_FILE}"
- exit 1
-fi
-
-LOG_DIR=$2/aio_perf_sweep
+IO_SIZE=$1
+LOG_DIR=./aio_perf_sweep
+MAP_DIR=$2/aio
+GPU_MEM=$3
+USE_GDS=$4
RUN_SCRIPT=./test_ds_aio.py
-READ_OPT="--read_file ${INPUT_FILE}"
+READ_OPT="--read"
+
+prep_folder ${MAP_DIR}
+prep_folder ${LOG_DIR}
if [[ -d ${LOG_DIR} ]]; then
rm -f ${LOG_DIR}/*
@@ -36,37 +54,60 @@ else
mkdir -p ${LOG_DIR}
fi
-DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
-SYNC="sync"
+if [[ ${GPU_MEM} == "gpu" ]]; then
+ gpu_opt="--gpu"
+else
+ gpu_opt=""
+fi
+if [[ ${USE_GDS} == "gds" ]]; then
+ gds_opt="--use_gds"
+else
+ gds_opt=""
+fi
+
+DISABLE_CACHE="sudo sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
+SYNC="sudo sync"
-for sub in single block; do
- if [[ $sub == "single" ]]; then
- sub_opt="--single_submit"
+for xtype in cpu gpu gds; do
+ if [[ $xtype == "cpu" ]]; then
+ gpu_opt=""
+ gds_opt=""
+ elif [[ $xtype == "gpu" ]]; then
+ gpu_opt="--gpu"
+ gds_opt=""
else
- sub_opt=""
+ gpu_opt="--gpu"
+ gds_opt="--use_gds"
fi
- for ov in overlap sequential; do
- if [[ $ov == "overlap" ]]; then
- ov_opt="--overlap_events"
+ for sub in single block; do
+ if [[ $sub == "single" ]]; then
+ sub_opt="--single_submit"
else
- ov_opt=""
+ sub_opt=""
fi
- for t in 1 2 4 8; do
- for p in 1 ; do
- for d in 1 2 4 8 16 32; do
- for bs in 128K 256K 512K 1M; do
- SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
- OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
- LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
- cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
- echo ${DISABLE_CACHE}
- echo ${cmd}
- echo ${SYNC}
+ for ov in overlap sequential; do
+ if [[ $ov == "sequential" ]]; then
+ ov_opt="--sequential_requests"
+ else
+ ov_opt=""
+ fi
+ for p in 1 2 4 8; do
+ for t in 1 2 4 8; do
+ for d in 8 16 32 64 128; do
+ for bs in 128K 256K 512K 1M 2M 4M 8M 16M; do
+ SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvme01:0"
+ OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}"
+ LOG="${LOG_DIR}/read_${xtype}_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
+ cmd="/usr/bin/time python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
- eval ${DISABLE_CACHE}
- eval ${cmd}
- eval ${SYNC}
- sleep 2
+ echo ${DISABLE_CACHE}
+ echo ${cmd}
+ echo ${SYNC}
+ eval ${DISABLE_CACHE}
+ eval ${cmd}
+ eval ${SYNC}
+ sleep 2
+ done
done
done
done
diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh
index 99f2113dda6fe..a54d1c8d7bed9 100755
--- a/csrc/aio/py_test/run_write_sweep.sh
+++ b/csrc/aio/py_test/run_write_sweep.sh
@@ -25,25 +25,33 @@ function validate_environment()
validate_environment
-if [[ $# -ne 3 ]]; then
- echo "Usage: $0 "
- exit 1
-fi
-
-SIZE="$1M"
-WRITE_DIR=$2
-LOG_DIR=$3/aio_perf_sweep
+IO_SIZE=$1
+LOG_DIR=$2/aio_perf_sweep
+MAP_DIR=$2/aio
+GPU_MEM=$3
+USE_GDS=$4
+RUN_SCRIPT=./test_ds_aio.py
-OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt
-WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}"
+OUTPUT_FILE=${MAP_DIR}/ds_aio_write_${IO_SIZE}B.pt
+WRITE_OPT=""
-prep_folder ${WRITE_DIR}
+prep_folder ${MAP_DIR}
prep_folder ${LOG_DIR}
-RUN_SCRIPT=./test_ds_aio.py
-DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
+if [[ ${GPU_MEM} == "gpu" ]]; then
+ gpu_opt="--gpu"
+else
+ gpu_opt=""
+fi
+if [[ ${USE_GDS} == "gds" ]]; then
+ gds_opt="--use_gds"
+else
+ gds_opt=""
+fi
+
+DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
SYNC="sync"
for sub in single block; do
@@ -53,19 +61,19 @@ for sub in single block; do
sub_opt=""
fi
for ov in overlap sequential; do
- if [[ $ov == "overlap" ]]; then
- ov_opt="--overlap_events"
+ if [[ $ov == "sequential" ]]; then
+ ov_opt="--sequential_requests"
else
ov_opt=""
fi
- for t in 1 2 4 8; do
- for p in 1; do
- for d in 1 2 4 8 16 32; do
- for bs in 128K 256K 512K 1M; do
- SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
- OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
+ for p in 1 2 4 8; do
+ for t in 1 2 4 8; do
+ for d in 32 64 128; do
+ for bs in 256K 512K 1M; do
+ SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}"
+ OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}"
LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
- cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+ cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
echo ${DISABLE_CACHE}
echo ${cmd}
echo ${SYNC}
diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py
index e6242cb357892..6de72755e9e5d 100755
--- a/csrc/aio/py_test/test_ds_aio.py
+++ b/csrc/aio/py_test/test_ds_aio.py
@@ -6,79 +6,19 @@
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
-import os
-import argparse
import multiprocessing as mp
from ds_aio_basic import aio_basic_multiprocessing
from ds_aio_handle import aio_handle_multiprocessing
-from test_ds_aio_utils import refine_args
-
-
-def parse_arguments():
- parser = argparse.ArgumentParser()
-
- parser.add_argument('--read_file', type=str, default=None, help='Read file.')
-
- parser.add_argument('--write_file', type=str, default=None, help='Write file.')
-
- parser.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.')
-
- parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.')
-
- parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.')
-
- parser.add_argument('--threads', type=int, default=1, help='Thread parallelism count.')
-
- parser.add_argument('--single_submit',
- action='store_true',
- help='Submit I/O requests in singles (default is submit queue_depth amount at once.).')
-
- parser.add_argument('--overlap_events',
- action='store_true',
- help='Overlap I/O submission and completion requests.')
-
- parser.add_argument('--validate', action='store_true', help='Perform validation in library.')
-
- parser.add_argument('--handle', action='store_true', help='Use AIO handle.')
-
- parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions')
-
- parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism')
-
- parser.add_argument('--gpu', action='store_true', help='Use GPU memory')
-
- parser.add_argument('--use_accelerator_pin_memory',
- action='store_true',
- help='Obtain pinned (CPU page-locked) tensors from accelerator')
-
- args = parser.parse_args()
- print(f'args = {args}')
- return args
-
-
-def validate_args(args):
- if args.read_file and not os.path.isfile(args.read_file):
- print(f'args validation error: {args.read_file} not found')
- return False
-
- return True
+from ds_aio_args import get_validated_args
def main():
print(f'Testing deepspeed_aio python frontend')
- args = parse_arguments()
- refine_args(args)
- if not validate_args(args):
- quit()
-
+ args = get_validated_args()
mp.set_start_method('spawn')
multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing
- if args.read_file:
- multiprocess_function(args, True)
-
- if args.write_file:
- multiprocess_function(args, False)
+ multiprocess_function(args, args.read)
if __name__ == "__main__":
diff --git a/csrc/aio/py_test/test_ds_aio_utils.py b/csrc/aio/py_test/test_ds_aio_utils.py
index 6aad114c0bdc3..968ff4a60ef94 100755
--- a/csrc/aio/py_test/test_ds_aio_utils.py
+++ b/csrc/aio/py_test/test_ds_aio_utils.py
@@ -6,12 +6,17 @@
Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
"""
+import os
+from ds_aio_job import Job, run_job
+
BYTES_PER_GB = 1024**3
+BYTES_PER_MB = 1024**2
+BYTES_PER_KB = 1024
LOG_TIDS = [0]
-def task_log(tid, msg):
- if tid in LOG_TIDS:
+def task_log(tid, msg, force=False):
+ if force or tid in LOG_TIDS:
print(f'tid {tid}: {msg}')
@@ -31,16 +36,29 @@ def report_results(args, read_op, pool_results):
total_bytes = sum([num_bytes for _, _, num_bytes in pool_results])
task_latency_sec = max([sec for _, sec, _ in pool_results])
- task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB
+ task_speed_GB = 0 if task_latency_sec == 0 else total_bytes / task_latency_sec / BYTES_PER_GB
print(f'Task {io_string} Latency = {task_latency_sec} sec')
print(f'Task {io_string} Speed = {task_speed_GB} GB/sec')
e2e_latency_sec = max([sec for sec, _, _ in pool_results])
- e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB
+ e2e_speed_GB = 0 if e2e_latency_sec == 0 else total_bytes / e2e_latency_sec / BYTES_PER_GB
print(f'E2E {io_string} Latency = {e2e_latency_sec} sec')
print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec')
+def get_block_size_and_count(io_bytes):
+ if io_bytes > BYTES_PER_MB and io_bytes % BYTES_PER_MB == 0:
+ block_size = BYTES_PER_MB
+ block_size_string = '1M'
+ else:
+ assert io_bytes % BYTES_PER_KB == 0
+ block_size = BYTES_PER_KB
+ block_size_string = '1K'
+ block_count = io_bytes / block_size
+
+ return block_size_string, int(block_count)
+
+
def refine_integer_value(value):
unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3}
@@ -50,9 +68,14 @@ def refine_integer_value(value):
return int(value)
-def refine_args(args):
- if args.write_size and type(args.write_size) == str:
- args.write_size = refine_integer_value(args.write_size)
+def create_filename(folder, read_op, size, tid):
+ io_string = "read" if read_op else "write"
+ return os.path.join(folder, f'_aio_{io_string}_{size}.pt.{tid}')
+
- if args.block_size and type(args.block_size) == str:
- args.block_size = refine_integer_value(args.block_size)
+def create_file(filename, num_bytes):
+ block_size, block_count = get_block_size_and_count(num_bytes)
+ dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={filename} bs={block_size} count={block_count}'])
+ print(f'[Start] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....')
+ run_job(dd_job)
+    print(f'[Done] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....')
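A quick, standalone illustration of the helpers added above (reimplemented here so the snippet runs without the repo on the path): get_block_size_and_count picks the largest dd block size that evenly divides the requested byte count, and create_filename derives the per-thread file path.

BYTES_PER_MB, BYTES_PER_KB = 1024**2, 1024

def get_block_size_and_count(io_bytes):
    # Same logic as the patch: prefer 1M blocks, fall back to 1K blocks.
    if io_bytes > BYTES_PER_MB and io_bytes % BYTES_PER_MB == 0:
        return '1M', io_bytes // BYTES_PER_MB
    assert io_bytes % BYTES_PER_KB == 0
    return '1K', io_bytes // BYTES_PER_KB

print(get_block_size_and_count(400 * 1024**2))  # ('1M', 400)
print(get_block_size_and_count(256 * 1024))     # ('1K', 256)
# create_filename('/tmp/aio', read_op=True, size='400M', tid=0) -> '/tmp/aio/_aio_read_400M.pt.0'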
diff --git a/csrc/aio/py_test/validate_async_io.py b/csrc/aio/py_test/validate_async_io.py
index 019ec05d49d31..10fb638347bcd 100644
--- a/csrc/aio/py_test/validate_async_io.py
+++ b/csrc/aio/py_test/validate_async_io.py
@@ -7,3 +7,4 @@
"""
from deepspeed.ops.op_builder import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible()
+assert AsyncIOBuilder().load()
diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
index 2a4300c5cac13..c102234a4dfb0 100644
--- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
+++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h
@@ -125,11 +125,10 @@ struct CheckArch {
std::cerr << #PTR " is not correctly aligned\n"; \
return false; \
}
-#define EVOFORMER_CHECK(COND, ERR) \
- if (!(COND)) { \
- std::cerr << "[Evoformer Attention]" \
- << "'" #COND "' failed: " << ERR << "\n"; \
- return false; \
+#define EVOFORMER_CHECK(COND, ERR) \
+ if (!(COND)) { \
+ std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \
+ return false; \
}
#endif
diff --git a/csrc/fp_quantizer/fp_quantize.cpp b/csrc/fp_quantizer/fp_quantize.cpp
index 6962b8050f517..1a887b50e1a3d 100644
--- a/csrc/fp_quantizer/fp_quantize.cpp
+++ b/csrc/fp_quantizer/fp_quantize.cpp
@@ -22,25 +22,20 @@
stochastic_rounding); \
}
-at::Tensor quantize(torch::Tensor& val,
+at::Tensor quantize(torch::Tensor& out,
+ torch::Tensor& val,
int group_size,
int stochastic_rounding,
int q_bits,
int q_mantisa_bits)
{
int total_elems = at::numel(val);
- auto options = at::TensorOptions()
- .dtype(torch::kInt8)
- .layout(val.layout())
- .device(val.device())
- .requires_grad(false);
float q_range = q_bits == 8 ? (q_mantisa_bits == 3 ? 480.0 : 114688.0) : // fp8 ranges
(q_bits == 12 ? 510.0 : // fp12 range
(q_bits == 6 ? 28.0 : // fp6 range
6.0)); // fp4 range (using power 2); TODO (Reza): add the power-4
// in case accuracy is not matching!
int num_groups = total_elems / group_size;
- auto out = torch::empty({num_groups, group_size * q_bits / 8 + 4}, options);
DISPATCH_QUANTIZE(kHalf, __half, 23, 8);
#ifdef BF16_AVAILABLE
@@ -108,9 +103,22 @@ void selective_dequantize(torch::Tensor& val,
#endif
}
+at::Tensor get_scales(torch::Tensor& out, int num_groups)
+{
+ auto options = at::TensorOptions()
+ .dtype(torch::kFloat)
+ .layout(at::kStrided)
+ .device(at::kCUDA)
+ .requires_grad(false);
+ auto scales =
+ torch::from_blob(out.data_ptr(), {num_groups, 1}, {out.stride(0) / 4, 1}, options);
+ return scales;
+}
+
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("quantize", &quantize, "quantize function");
m.def("dequantize", &dequantize, "dequantize function");
+ m.def("get_scales", &get_scales, "get scales function");
m.def("selective_dequantize", &selective_dequantize, "selective dequantize function");
}
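The quantize signature change above moves output allocation to the caller: the int8 out tensor must be pre-allocated with shape [num_groups, group_size * q_bits / 8 + 4], and the new get_scales binding exposes the per-group fp32 scales stored in the trailing 4 bytes of each row. A minimal sketch of the raw extension calls under those assumptions (the Python wrapper in deepspeed.ops.fp_quantizer remains the intended entry point):

import torch
from deepspeed.ops.op_builder import FPQuantizerBuilder

fp_quant = FPQuantizerBuilder().load()          # JIT-builds the extension if needed (CUDA device assumed)

group_size, q_bits, q_mantisa_bits = 128, 8, 3  # fp8-style configuration
val = torch.randn(16, group_size, dtype=torch.float16, device='cuda')
num_groups = val.numel() // group_size

# Caller-allocated output: quantized payload plus 4 bytes of fp32 scale per group.
out = torch.empty((num_groups, group_size * q_bits // 8 + 4), dtype=torch.int8, device='cuda')

fp_quant.quantize(out, val, group_size, 0, q_bits, q_mantisa_bits)  # 0 = no stochastic rounding
scales = fp_quant.get_scales(out, num_groups)                       # fp32 view of shape [num_groups, 1]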
diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp
new file mode 100644
index 0000000000000..c370a448e5a21
--- /dev/null
+++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp
@@ -0,0 +1,154 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include "deepspeed_gds_op.h"
+
+using namespace std;
+
+// For when there is more than 1 device
+static std::map<const int64_t, std::set<void*>> base_ptr_registry;
+
+static void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle)
+{
+ memset((void*)&cf_descr, 0, sizeof(CUfileDescr_t));
+ cf_descr.handle.fd = fd;
+ cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
+ CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr);
+ if (status.err != CU_FILE_SUCCESS) {
+ std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl;
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+}
+
+static void* _find_base_ptr(const int64_t device, char* buf_ptr)
+{
+ void* base_ptr = nullptr;
+ int64_t last = -1;
+ int64_t ptr_diff;
+ for (const auto& value : base_ptr_registry[device]) {
+ ptr_diff = buf_ptr - (char*)value;
+ if (last == -1 && ptr_diff >= 0) {
+ last = ptr_diff;
+ base_ptr = value;
+ } else if (ptr_diff < last && ptr_diff >= 0) {
+ last = ptr_diff;
+ base_ptr = value;
+ }
+ }
+ if (!base_ptr || buf_ptr < base_ptr) {
+ std::cerr << "BASE PTR ERROR :" << base_ptr << " BUF PTR " << (void*)buf_ptr << std::endl;
+ for (const auto& value : base_ptr_registry[device]) {
+ std::cerr << "BASE PTR AVAIL :" << value << std::endl;
+ }
+ exit(EXIT_FAILURE);
+ }
+
+ return base_ptr;
+}
+
+void gds_op_desc_t::add_buffer_to_registry(const torch::Tensor& buffer)
+{
+ const int64_t device = buffer.get_device();
+ void* reg_ptr = buffer.data_ptr();
+
+ // std::cout << "REG PTR " << reg_ptr << std::endl;
+ // TODO: add checking to make sure pointer isn't already in set
+ const auto it = base_ptr_registry.find(device);
+ if (it == base_ptr_registry.end()) {
+        std::set<void*> new_ptr_set;
+        new_ptr_set.insert(reg_ptr);
+        base_ptr_registry.insert(std::pair<const int64_t, std::set<void*>>(device, new_ptr_set));
+ } else {
+ base_ptr_registry[device].insert(reg_ptr);
+ }
+
+ check_cudaruntimecall(cudaSetDevice(device));
+ CUfileError_t status = cuFileBufRegister(reg_ptr, buffer.nbytes(), 0);
+ if (status.err != CU_FILE_SUCCESS) {
+ std::cerr << "buffer register failed:" << cuFileGetErrorString(status) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+}
+
+void gds_op_desc_t::remove_buffer_from_registry(const torch::Tensor& buffer)
+{
+ const int64_t device = buffer.get_device();
+ void* reg_ptr = buffer.data_ptr();
+
+ // std::cout << "DEREG PTR " << reg_ptr << std::endl;
+ check_cudaruntimecall(cudaSetDevice(device));
+ cuFileBufDeregister(reg_ptr);
+
+ // Remove from tracked registry
+ base_ptr_registry[device].erase(reg_ptr);
+}
+
+gds_op_desc_t::gds_op_desc_t(const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const int num_threads,
+ const bool validate)
+ : io_op_desc_t(read_op, buffer, fd, filename, file_num_bytes, num_threads, validate)
+{
+ _contiguous_buffer = _buffer.contiguous();
+ const int64_t device = _buffer.get_device();
+ check_cudaruntimecall(cudaSetDevice(device));
+ _base_ptr = _find_base_ptr(device, (char*)_contiguous_buffer.data_ptr());
+
+ _safe_handle_register(fd, _cf_descr, _cf_handle);
+}
+
+char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); }
+
+void gds_op_desc_t::finish() { cuFileHandleDeregister(_cf_handle); }
+
+void gds_op_desc_t::validate()
+{
+ check_cudaruntimecall(cudaSetDevice(_buffer.get_device()));
+ const auto cpu_buffer = _buffer.to(torch::kCPU);
+ validate_aio_operation(
+ _read_op, _filename.c_str(), (char*)(cpu_buffer.data_ptr()), _file_num_bytes);
+}
+
+void gds_op_desc_t::run(const int tid,
+                        std::unique_ptr<aio_context>& aio_ctxt,
+ deepspeed_aio_config_t* aio_config)
+{
+ assert(tid < _num_threads);
+ check_cudaruntimecall(cudaSetDevice(_buffer.get_device()));
+ int64_t buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr;
+ const auto file_offset = _num_bytes_per_thread * tid;
+
+ if (_read_op) {
+ auto ret =
+ cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
+ if (ret < 0) { _report_error(ret, errno, buf_offset); }
+ } else {
+ auto ret =
+ cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, file_offset, buf_offset);
+ if (ret < 0) { _report_error(ret, errno, buf_offset); }
+ }
+}
+
+void gds_op_desc_t::_report_error(const ssize_t return_code,
+ const int error_num,
+ const off_t offset)
+{
+ const auto op_string = _read_op ? "read failed with " : "write failed with ";
+ const auto error_string = IS_CUFILE_ERR(return_code) ? "cuFile error: " : "posix error: ";
+ const auto error_code = IS_CUFILE_ERR(return_code) ? cuFileGetErrorString(return_code)
+ : cuFileGetErrorString(error_num);
+ std::cerr << op_string << error_string << error_code << " return code = " << return_code
+ << " filename = " << _filename.c_str() << " num bytes = " << _num_bytes_per_thread
+ << " offset = " << offset << std::endl;
+ exit(EXIT_FAILURE);
+}
diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h
new file mode 100644
index 0000000000000..b7fab64d40549
--- /dev/null
+++ b/csrc/gds/py_lib/deepspeed_gds_op.h
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "deepspeed_aio_op_desc.h"
+#include "deepspeed_gds_utils.h"
+
+struct gds_op_desc_t : io_op_desc_t {
+ CUfileDescr_t _cf_descr;
+ CUfileHandle_t _cf_handle;
+ void* _base_ptr;
+
+ gds_op_desc_t(const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const int num_threads,
+ const bool validate);
+
+ void run(const int tid,
+             std::unique_ptr<aio_context>& aio_ctxt,
+ deepspeed_aio_config_t* aio_config);
+
+ char* data_ptr() const;
+
+ void validate();
+
+ void finish();
+
+ void _report_error(const ssize_t return_code, const int error_num, const off_t offset);
+
+ static void add_buffer_to_registry(const torch::Tensor& buffer);
+
+ static void remove_buffer_from_registry(const torch::Tensor& buffer);
+};
diff --git a/csrc/gds/py_lib/deepspeed_gds_utils.h b/csrc/gds/py_lib/deepspeed_gds_utils.h
new file mode 100644
index 0000000000000..12b014d909880
--- /dev/null
+++ b/csrc/gds/py_lib/deepspeed_gds_utils.h
@@ -0,0 +1,91 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+#include <iostream>
+
+// CUDA/cuFile includes
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "cufile.h"
+
+// Macro for checking cuda errors following a cuda launch or api call
+#define cudaCheckError() \
+ { \
+ cudaError_t e = cudaGetLastError(); \
+ if (e != cudaSuccess) { \
+ printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
+ exit(EXIT_FAILURE); \
+ } \
+ }
+
+#define check_cudadrivercall(fn) \
+ do { \
+ CUresult res = fn; \
+ if (res != CUDA_SUCCESS) { \
+ const char* str = nullptr; \
+ cuGetErrorName(res, &str); \
+ std::cerr << "cuda driver api call failed " << #fn << " res : " << res << ", " \
+ << __LINE__ << ":" << str << std::endl; \
+ std::cerr << "EXITING program!!!" << std::endl; \
+ exit(1); \
+ } \
+ } while (0)
+
+#define check_cudaruntimecall(fn) \
+ do { \
+ cudaError_t res = fn; \
+ if (res != cudaSuccess) { \
+ const char* str = cudaGetErrorName(res); \
+ std::cerr << "cuda runtime api call failed " << #fn << __LINE__ << ":" << str \
+ << std::endl; \
+ std::cerr << "EXITING program!!!" << std::endl; \
+ exit(1); \
+ } \
+ } while (0)
+
+#define check_cuFileCall(fn, api_msg) \
+ do { \
+ CUfileError_t status = fn; \
+ if (status.err != CU_FILE_SUCCESS) { \
+ std::cout << api_msg << " failed with error " << CUFILE_ERRSTR(status.err) \
+ << std::endl; \
+ exit(EXIT_FAILURE); \
+ } \
+ } while (0)
+
+//
+// cuda driver error description
+//
+static inline const char* GetCuErrorString(CUresult curesult)
+{
+ const char* descp;
+ if (cuGetErrorName(curesult, &descp) != CUDA_SUCCESS) descp = "unknown cuda error";
+ return descp;
+}
+
+//
+// cuFile APIs return both cuFile specific error codes as well as POSIX error codes
+// for ease, the below template can be used for getting the error description depending
+// on its type.
+
+// POSIX
+template <class T, typename std::enable_if<std::is_integral<T>::value, std::nullptr_t>::type = nullptr>
+std::string cuFileGetErrorString(T status)
+{
+ status = std::abs(status);
+ return IS_CUFILE_ERR(status) ? std::string(CUFILE_ERRSTR(status))
+ : std::string(std::strerror(status));
+}
+
+// CUfileError_t
+template <class T, typename std::enable_if<!std::is_integral<T>::value, std::nullptr_t>::type = nullptr>
+std::string cuFileGetErrorString(T status)
+{
+    std::string errStr = cuFileGetErrorString(static_cast<int>(status.err));
+ if (IS_CUDA_ERR(status)) errStr.append(".").append(GetCuErrorString(status.cu_err));
+ return errStr;
+}
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
new file mode 100644
index 0000000000000..3a35ad3145a0d
--- /dev/null
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp
@@ -0,0 +1,114 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+ GPUDirect Storage functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include "deepspeed_py_gds_handle.h"
+#include
+#include "deepspeed_gds_op.h"
+
+using namespace std;
+
+int deepspeed_gds_handle_t::s_cuFile_init = 0;
+
+deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size,
+ const int queue_depth,
+ const bool single_submit,
+ const bool overlap_events,
+ const int num_threads)
+ : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, num_threads)
+{
+ _init_cuFile(block_size, queue_depth, num_threads);
+}
+
+deepspeed_gds_handle_t::~deepspeed_gds_handle_t() { _close_cuFile(); }
+
+void deepspeed_gds_handle_t::_init_cuFile(const int block_size,
+ const int queue_depth,
+ const int num_threads)
+{
+ if (deepspeed_gds_handle_t::s_cuFile_init == 0) {
+ std::string depthStr = std::to_string(queue_depth);
+ std::string threadsStr = std::to_string(num_threads);
+ std::string json1 = R"({"execution": {"max_io_queue_depth": )" + depthStr + ", ";
+ std::string json2 = R"("max_request_parallelism": )" + threadsStr + ", ";
+ std::string json3 = R"("max_io_threads": )" + threadsStr + ", ";
+ std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})";
+ std::ofstream outFile("local_cufile.json");
+ if (outFile.is_open()) {
+ outFile << json1 + json2 + json3 + json4;
+ outFile.close();
+ } else {
+ std::cerr << "Can't open local cufile" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ // TODO: Address the following issues with this code
+ // (1) Fix C++14 warning
+ // (2) Create file in a different location than PWD
+ // (3) Handle multi-GPU/multi-rank scenarios: should cufile be shared, is per-rank cufile
+ // safe?
+ putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json");
+ cuFileDriverOpen();
+ cudaCheckError();
+ size_t direct_io_size = (size_t)block_size / 1024;
+ CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size);
+ if (status.err != CU_FILE_SUCCESS) {
+ std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ }
+ deepspeed_gds_handle_t::s_cuFile_init++;
+}
+
+void deepspeed_gds_handle_t::_close_cuFile()
+{
+ deepspeed_gds_handle_t::s_cuFile_init--;
+ if (deepspeed_gds_handle_t::s_cuFile_init == 0) { cuFileDriverClose(); }
+}
+
+torch::Tensor deepspeed_gds_handle_t::new_pinned_device_tensor(const size_t num_elem,
+ const torch::Tensor& example_tensor)
+{
+ auto options = torch::TensorOptions().dtype(example_tensor.scalar_type()).device(torch::kCUDA);
+ auto dev_tensor = torch::empty(num_elem, options);
+ pin_device_tensor(dev_tensor);
+ return dev_tensor;
+}
+
+bool deepspeed_gds_handle_t::free_pinned_device_tensor(torch::Tensor& buffer)
+{
+ unpin_device_tensor(buffer);
+ return true;
+}
+
+bool deepspeed_gds_handle_t::pin_device_tensor(const torch::Tensor& buffer)
+{
+ gds_op_desc_t::add_buffer_to_registry(buffer);
+ return true;
+}
+
+bool deepspeed_gds_handle_t::unpin_device_tensor(const torch::Tensor& buffer)
+{
+ gds_op_desc_t::remove_buffer_from_registry(buffer);
+ return true;
+}
+
+std::shared_ptr<struct io_op_desc_t> deepspeed_gds_handle_t::_create_io_op_desc(
+ const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const bool validate)
+{
+ if (buffer.is_cuda()) {
+        return std::make_shared<gds_op_desc_t>(
+ read_op, buffer, fd, filename, file_num_bytes, _num_threads, validate);
+ }
+ return deepspeed_io_handle_t::_create_io_op_desc(
+ read_op, buffer, fd, filename, file_num_bytes, validate);
+}
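For reference, _init_cuFile above writes a small cuFile configuration file (local_cufile.json) into the working directory before opening the driver. A sketch of the equivalent JSON, assembled in Python purely to illustrate the keys being set (queue_depth=128 and num_threads=8 are hypothetical handle settings):

import json

queue_depth, num_threads = 128, 8  # hypothetical values taken from the handle constructor
cufile_cfg = {
    "execution": {
        "max_io_queue_depth": queue_depth,
        "max_request_parallelism": num_threads,
        "max_io_threads": num_threads,
        "parallel_io": True,
        "min_io_threshold_size_kb": 8192,
    }
}
print(json.dumps(cufile_cfg, indent=2))  # matches the contents written by _init_cuFile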
diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
new file mode 100644
index 0000000000000..f324e6b65e80b
--- /dev/null
+++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include <condition_variable>
+#include <memory>
+#include "deepspeed_py_io_handle.h"
+
+struct deepspeed_gds_handle_t : deepspeed_io_handle_t {
+ deepspeed_gds_handle_t(const int block_size,
+ const int queue_depth,
+ const bool single_submit,
+ const bool overlap_events,
+ const int num_threads);
+
+ ~deepspeed_gds_handle_t();
+
+ torch::Tensor new_pinned_device_tensor(const size_t num_elem,
+ const torch::Tensor& example_tensor);
+
+ bool free_pinned_device_tensor(torch::Tensor&);
+
+ bool pin_device_tensor(const torch::Tensor& buffer);
+
+ bool unpin_device_tensor(const torch::Tensor& buffer);
+
+ void _init_cuFile(const int block_size, const int queue_length, const int num_threads);
+
+ void _close_cuFile();
+
+    std::shared_ptr<struct io_op_desc_t> _create_io_op_desc(const bool read_op,
+ const torch::Tensor& buffer,
+ const int fd,
+ const char* filename,
+ const long long int file_num_bytes,
+ const bool validate);
+
+ static int s_cuFile_init;
+};
diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp
new file mode 100644
index 0000000000000..66eb34d4ea8cf
--- /dev/null
+++ b/csrc/gds/py_lib/py_ds_gds.cpp
@@ -0,0 +1,122 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include <torch/extension.h>
+#include "deepspeed_py_gds_handle.h"
+using namespace pybind11::literals;
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    py::class_<deepspeed_gds_handle_t>(m, "gds_handle")
+        .def(py::init<const int, const int, const bool, const bool, const int>(),
+ "GDS handle constructor",
+ "block_size"_a = 1024 * 1024,
+ "queue_depth"_a = 128,
+ "single_submit"_a = false,
+ "overlap_events"_a = false,
+ "num_threads"_a = 1)
+
+ .def("get_block_size", &deepspeed_gds_handle_t::get_block_size)
+ .def("get_queue_depth", &deepspeed_gds_handle_t::get_queue_depth)
+ .def("get_single_submit", &deepspeed_gds_handle_t::get_single_submit)
+ .def("get_overlap_events", &deepspeed_gds_handle_t::get_overlap_events)
+ .def("get_thread_count", &deepspeed_gds_handle_t::get_thread_count)
+
+ .def("read",
+ &deepspeed_gds_handle_t::read,
+ "Synchronous and non-parallel file read. Returns count of completed read ops",
+ "buffer"_a,
+ "filename"_a,
+ "validate"_a)
+
+ .def("write",
+ &deepspeed_gds_handle_t::write,
+ "Synchronous and non-parallel file write. Returns count of completed write ops",
+ "buffer"_a,
+ "filename"_a,
+ "validate"_a)
+
+ .def("pread",
+ &deepspeed_gds_handle_t::pread,
+         "Parallel file read with optional asynchronous completion. Returns count of completed read ops",
+ "buffer"_a,
+ "filename"_a,
+ "validate"_a,
+ "async"_a)
+
+ .def("pwrite",
+ &deepspeed_gds_handle_t::pwrite,
+         "Parallel file write with optional asynchronous completion. Returns count of completed write ops",
+ "buffer"_a,
+ "filename"_a,
+ "validate"_a,
+ "async"_a)
+
+ .def("sync_pread",
+ &deepspeed_gds_handle_t::sync_pread,
+         "Synchronous parallel file read. Returns count of completed read ops",
+ "buffer"_a,
+ "filename"_a)
+
+ .def("sync_pwrite",
+ &deepspeed_gds_handle_t::sync_pwrite,
+ "Synchronous parallel file write. Returns count of completed write ops",
+ "buffer"_a,
+ "filename"_a)
+
+ .def("async_pread",
+ &deepspeed_gds_handle_t::async_pread,
+         "Asynchronous parallel file read. Returns 0 on success, and following wait() returns "
+         "count of completed ops.",
+ "buffer"_a,
+ "filename"_a)
+
+ .def("async_pwrite",
+ &deepspeed_gds_handle_t::async_pwrite,
+ "Asynchronous parallel file write. Returns 0 on success, and following wait() returns "
+ "count of completed ops.",
+ "buffer"_a,
+ "filename"_a)
+
+ .def("new_cpu_locked_tensor",
+ &deepspeed_gds_handle_t::new_cpu_locked_tensor,
+ "Allocate pinned CPU tensor.",
+ "num_elem"_a,
+         "example_tensor"_a)
+
+ .def("free_cpu_locked_tensor",
+ &deepspeed_gds_handle_t::free_cpu_locked_tensor,
+ "Free pinned CPU tensor.",
+ "tensor"_a)
+
+ .def("new_pinned_device_tensor",
+ &deepspeed_gds_handle_t::new_pinned_device_tensor,
+ "Allocate pinned device tensor.",
+ "num_elem"_a,
+         "example_tensor"_a)
+
+ .def("free_pinned_device_tensor",
+ &deepspeed_gds_handle_t::free_pinned_device_tensor,
+ "Free pinned device tensor.",
+ "tensor"_a)
+
+ .def("pin_device_tensor",
+ &deepspeed_gds_handle_t::pin_device_tensor,
+ "Pin device tensor.",
+ "tensor"_a)
+
+ .def("unpin_device_tensor",
+ &deepspeed_gds_handle_t::unpin_device_tensor,
+ "Unpin device tensor.",
+ "tensor"_a)
+
+ .def("wait",
+ &deepspeed_gds_handle_t::wait,
+ "Wait for (ongoing) asynchronous operations to complete");
+}
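A minimal end-to-end sketch of the new gds_handle binding, assuming a CUDA device with GPUDirect Storage configured and a hypothetical NVMe path /mnt/nvme; constructor arguments follow the defaults declared in the binding above.

import torch
from deepspeed.ops.op_builder import GDSBuilder

gds = GDSBuilder().load()  # JIT-builds the GDS extension if needed

# block_size, queue_depth, single_submit, overlap_events, num_threads
handle = gds.gds_handle(1024 * 1024, 128, False, False, 1)

# Device buffers must be registered with cuFile before GDS I/O.
buf = torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device='cuda')
handle.pin_device_tensor(buf)

handle.async_pwrite(buf, '/mnt/nvme/gds_test.bin')  # returns 0; the write runs in the background
completed = handle.wait()                           # blocks until outstanding ops complete

handle.unpin_device_tensor(buf)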
diff --git a/csrc/gds/py_test/validate_gds.py b/csrc/gds/py_test/validate_gds.py
new file mode 100644
index 0000000000000..b34b1194f5821
--- /dev/null
+++ b/csrc/gds/py_test/validate_gds.py
@@ -0,0 +1,10 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
+"""
+from deepspeed.ops.op_builder import GDSBuilder
+assert GDSBuilder().is_compatible(True)
+assert GDSBuilder().load(True)
diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h
index f5bfb45dd2e20..a205026ec7c11 100644
--- a/csrc/includes/simd.h
+++ b/csrc/includes/simd.h
@@ -27,7 +27,7 @@ inline void writeAs(void* dst, const T& val)
std::memcpy(dst, &val, sizeof(T));
}
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h
old mode 100755
new mode 100644
index f77568be7835e..097e2d8585ccb
--- a/csrc/xpu/includes/simd.h
+++ b/csrc/xpu/includes/simd.h
@@ -13,7 +13,7 @@
#define TILE (128 * 1024 * 1024)
#if defined(__AVX512__) or defined(__AVX256__)
-#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
+#define ROUND_DOWN(size, step) ((size) & ~((step) - 1))
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h
index fa41757c895b5..1897afd1fea24 100644
--- a/csrc/xpu/includes/type_shim.h
+++ b/csrc/xpu/includes/type_shim.h
@@ -82,11 +82,11 @@
}
template <typename T>
-__inline__ __attribute__((always_inline)) T reduce_block_into_lanes(
- T* x,
- T val,
- int lanes = 1,
- bool share_result = false) // lanes is intended to be <= 32.
+__inline__ __attribute__((always_inline)) T
+reduce_block_into_lanes(T* x,
+ T val,
+ int lanes = 1,
+ bool share_result = false) // lanes is intended to be <= 32.
{
auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>();
int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2);
diff --git a/deepspeed/inference/v2/checkpoint/huggingface_engine.py b/deepspeed/inference/v2/checkpoint/huggingface_engine.py
index 46a84c61f884b..77dfa7a23b1e0 100644
--- a/deepspeed/inference/v2/checkpoint/huggingface_engine.py
+++ b/deepspeed/inference/v2/checkpoint/huggingface_engine.py
@@ -15,13 +15,13 @@
class HuggingFaceCheckpointEngine(CheckpointEngineBase):
- def __init__(self, model_name_or_path: str, auth_token: str = None) -> None:
+ def __init__(self, model_name_or_path: str, auth_token: str = None, **hf_kwargs) -> None:
super().__init__()
from transformers import AutoConfig, GenerationConfig
self.model_name_or_path = model_name_or_path
self.auth_token = auth_token
- self.model_config = AutoConfig.from_pretrained(self.model_name_or_path)
+ self.model_config = AutoConfig.from_pretrained(self.model_name_or_path, **hf_kwargs)
# Define this property here so we can use it in the model implementation
if not hasattr(self.model_config, "max_seq_length"):
if hasattr(self.model_config, "max_position_embeddings"):
diff --git a/deepspeed/inference/v2/engine_factory.py b/deepspeed/inference/v2/engine_factory.py
index c320108f55e50..314f7f2f0485f 100644
--- a/deepspeed/inference/v2/engine_factory.py
+++ b/deepspeed/inference/v2/engine_factory.py
@@ -20,8 +20,10 @@
MixtralPolicy,
FalconPolicy,
PhiPolicy,
+ Phi3Policy,
QwenPolicy,
Qwen2Policy,
+ Qwen2MoePolicy,
)
from .model_implementations.inference_policy_base import POLICIES, InferenceV2Policy
from .model_implementations.flat_model_helpers import make_metadata_filename, ModelMetadata
@@ -119,10 +121,14 @@ def build_hf_engine(path: str,
policy = FalconPolicy(model_config, checkpoint_engine=checkpoint_engine)
elif model_config.model_type == "phi":
policy = PhiPolicy(model_config, checkpoint_engine=checkpoint_engine)
+ elif model_config.model_type == "phi3":
+ policy = Phi3Policy(model_config, checkpoint_engine=checkpoint_engine)
elif model_config.model_type == "qwen":
policy = QwenPolicy(model_config, checkpoint_engine=checkpoint_engine)
elif model_config.model_type == "qwen2":
policy = Qwen2Policy(model_config, checkpoint_engine=checkpoint_engine)
+ elif model_config.model_type == "qwen2_moe":
+ policy = Qwen2MoePolicy(model_config, checkpoint_engine=checkpoint_engine)
else:
raise ValueError(f"Unsupported model type {model_config.model_type}")
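With the two new branches above, Phi-3 and Qwen2-MoE checkpoints dispatch to their policies through the same factory. A sketch under the assumption that build_hf_engine accepts a local checkpoint path plus a RaggedInferenceEngineConfig (the parameter name and config defaults are assumptions, not verified against the full signature):

from deepspeed.inference.v2.config_v2 import RaggedInferenceEngineConfig
from deepspeed.inference.v2.engine_factory import build_hf_engine

# model_config.model_type == "phi3" now resolves to Phi3Policy;
# "qwen2_moe" resolves to Qwen2MoePolicy.
engine = build_hf_engine(
    path="/path/to/phi3/checkpoint",            # hypothetical local HF checkpoint
    engine_config=RaggedInferenceEngineConfig(),
)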
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/includes/top_k_utils.h b/deepspeed/inference/v2/kernels/ragged_ops/includes/top_k_utils.h
index abb9e15f8f6f9..2cc430ccfe349 100644
--- a/deepspeed/inference/v2/kernels/ragged_ops/includes/top_k_utils.h
+++ b/deepspeed/inference/v2/kernels/ragged_ops/includes/top_k_utils.h
@@ -11,5 +11,8 @@
} else if (2 == N_TOP_K) { \
constexpr int CONST_TOP_K = 2; \
__VA_ARGS__(); \
+ } else if (4 == N_TOP_K) { \
+ constexpr int CONST_TOP_K = 4; \
+ __VA_ARGS__(); \
} \
}()
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.py b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.py
index 7fe38d258e6c2..7e1ec1a13cb95 100644
--- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.py
+++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.py
@@ -18,7 +18,7 @@ class BlockedRotaryEmbeddings(DSKernelBase):
"""
supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16]
- supported_head_sizes = [64, 80, 128]
+ supported_head_sizes = [64, 80, 96, 128]
supported_q_ratios = [1, 2, 4, 5, 8, 16, 29, 35, 36, 71]
def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype, rotary_dim: int,
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu
index 6f94838f55e9c..fbafece5ccf23 100644
--- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu
+++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu
@@ -223,6 +223,8 @@ __global__ void kv_rotary_pos_kernel(T* kv_cache,
LAUNCH_KV_ROTARY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 64); \
} else if (head_size == 80) { \
LAUNCH_KV_ROTARY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 80); \
+ } else if (head_size == 96) { \
+ LAUNCH_KV_ROTARY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 96); \
} else if (head_size == 128) { \
LAUNCH_KV_ROTARY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 128); \
} else { \
@@ -326,6 +328,8 @@ INSTANTIATE_KV_ROTARY_KERNEL(__nv_bfloat16)
LAUNCH_KV_COPY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 64); \
} else if (head_size == 80) { \
LAUNCH_KV_COPY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 80); \
+ } else if (head_size == 96) { \
+ LAUNCH_KV_COPY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 96); \
} else if (head_size == 128) { \
LAUNCH_KV_COPY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 128); \
} else { \
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_trained_kv_rotary.py b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_trained_kv_rotary.py
index f8c5b2b138048..f527be227ce1d 100644
--- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_trained_kv_rotary.py
+++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_trained_kv_rotary.py
@@ -23,7 +23,7 @@ class BlockedTrainedRotaryEmbeddings(DSKernelBase):
"""
supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16]
- supported_head_sizes = [64, 80, 128]
+ supported_head_sizes = [64, 80, 96, 128]
supported_q_ratios = [1, 2, 4, 5, 8]
def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype) -> None:
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/linear_blocked_kv_copy.py b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/linear_blocked_kv_copy.py
index a885eadd78a19..4b2ad858a1bf0 100644
--- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/linear_blocked_kv_copy.py
+++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/linear_blocked_kv_copy.py
@@ -23,7 +23,7 @@ class LinearBlockedKVCopy(DSKernelBase):
"""
supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16]
- supported_head_sizes = [64, 80, 128]
+ supported_head_sizes = [64, 80, 96, 128]
supported_q_ratios = [1, 2, 4, 5, 8]
def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype) -> None:
diff --git a/deepspeed/inference/v2/model_implementations/__init__.py b/deepspeed/inference/v2/model_implementations/__init__.py
index 14b0654a8c36c..3483d9348c555 100644
--- a/deepspeed/inference/v2/model_implementations/__init__.py
+++ b/deepspeed/inference/v2/model_implementations/__init__.py
@@ -15,5 +15,7 @@
from .mixtral import *
from .falcon import *
from .phi import *
+from .phi3 import *
from .qwen import *
from .qwen_v2 import *
+from .qwen_v2_moe import *
diff --git a/deepspeed/inference/v2/model_implementations/common_parameters/mlp_parameters.py b/deepspeed/inference/v2/model_implementations/common_parameters/mlp_parameters.py
index ddb8996e03a37..17def1fa021fb 100644
--- a/deepspeed/inference/v2/model_implementations/common_parameters/mlp_parameters.py
+++ b/deepspeed/inference/v2/model_implementations/common_parameters/mlp_parameters.py
@@ -66,6 +66,24 @@ def finalize(self) -> torch.Tensor:
return self.inference_model.transform_mlp_1_param(fused_param)
+class FusedGatedMLPParameter(ParameterBase):
+ """
+ Gated MLP projection container.
+ """
+
+ params: torch.Tensor
+ """
+    Fused weight tensor containing both the gating and the up (non-gating) projection rows.
+ """
+
+ def finalize(self) -> torch.Tensor:
+ gate_params = self.params[:self.params.shape[0] // 2]
+ up_params = self.params[self.params.shape[0] // 2:]
+ total_neurons = gate_params.shape[0] + up_params.shape[0]
+ fused_param = torch.cat([gate_params, up_params], dim=-1).reshape(total_neurons, -1)
+ return self.inference_model.transform_mlp_1_param(fused_param)
+
+
class MLP2Parameter(ParameterBase):
"""
Second MLP projection weight container. This performs a straight pass-through to the
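FusedGatedMLPParameter.finalize above takes a checkpoint tensor that stores all gating rows followed by all up-projection rows and re-orders it so each gate row is immediately followed by its matching up row before handing it to transform_mlp_1_param. A standalone sketch of just the tensor manipulation:

import torch

# Toy layout: first half gating rows g0..g2, second half up rows u0..u2.
params = torch.arange(12.0).reshape(6, 2)
gate, up = params[:3], params[3:]

fused = torch.cat([gate, up], dim=-1).reshape(6, -1)
print(fused)  # rows come out interleaved: g0, u0, g1, u1, g2, u2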
diff --git a/deepspeed/inference/v2/model_implementations/phi3/__init__.py b/deepspeed/inference/v2/model_implementations/phi3/__init__.py
new file mode 100644
index 0000000000000..1a4b756d210c5
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/phi3/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from .policy import Phi3Policy
diff --git a/deepspeed/inference/v2/model_implementations/phi3/containers.py b/deepspeed/inference/v2/model_implementations/phi3/containers.py
new file mode 100644
index 0000000000000..1cb52a75ae0b8
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/phi3/containers.py
@@ -0,0 +1,75 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# Create a container object to save model-specific tensors using the policy file above.
+
+from ..common_parameters import *
+from ..layer_container_base import LayerContainer
+'''
+ # HF Phi-3 model looks like this:
+
+Phi3ForCausalLM(
+ (model): Phi3Model(
+ (embed_tokens): Embedding(32064, 3072)
+ (embed_dropout): Dropout(p=0.0, inplace=False)
+ (layers): ModuleList(
+ (0-31): 32 x Phi3DecoderLayer(
+ (self_attn): Phi3Attention(
+ (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
+ (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
+ (rotary_emb): Phi3RotaryEmbedding()
+ )
+      (mlp): Phi3MLP(
+ (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
+ (down_proj): Linear(in_features=16384, out_features=3072, bias=False)
+ (activation_fn): SiLU()
+ )
+ (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
+ (resid_attn_dropout): Dropout(p=0.0)
+ (resid_mlp_dropout): Dropout(p=0.0)
+ (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
+ )
+ )
+ (final_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
+ )
+ (lm_head): Linear(in_features=3072, out_features=32064, bias=False)
+)
+'''
+
+
+class Phi3TransformerContainer(LayerContainer):
+ """
+    Transformer layer container for the Phi-3 model.
+ """
+ qkv_w: FusedQKVParameter
+ attn_out_w: AttentionOutputParameter
+ mlp_1_w: FusedGatedMLPParameter
+ mlp_2_w: MLP2Parameter
+ attn_norm_gamma: NormParameter
+ mlp_norm_gamma: NormParameter
+
+ PARAM_MAPPING = {
+ "self_attn.qkv_proj.weight": "qkv_w.params",
+ "self_attn.o_proj.weight": "attn_out_w.params",
+ "mlp.gate_up_proj.weight": "mlp_1_w.params",
+ "mlp.down_proj.weight": "mlp_2_w.params",
+ "input_layernorm.weight": "attn_norm_gamma.params",
+ "post_attention_layernorm.weight": "mlp_norm_gamma.params",
+ }
+
+
+class Phi3NonTransformerContainer(LayerContainer):
+ """
+    Non-Transformer layer container for the Phi-3 model.
+ """
+ word_emb: EmbeddingParameter
+ word_unembed_w: UnembedParameter
+ final_norm_gamma: NormParameter
+
+ PARAM_MAPPING = {
+ "model.embed_tokens.weight": "word_emb.params",
+ "model.norm.weight": "final_norm_gamma.params",
+ "lm_head.weight": "word_unembed_w.params",
+ }
diff --git a/deepspeed/inference/v2/model_implementations/phi3/model.py b/deepspeed/inference/v2/model_implementations/phi3/model.py
new file mode 100644
index 0000000000000..507bb4fc9af1a
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/phi3/model.py
@@ -0,0 +1,204 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+
+import deepspeed.comm as dist
+
+from ...allocator import empty_from
+from ...inference_utils import ActivationType, DtypeEnum
+from .. import *
+from ...modules.configs import *
+from ...modules.interfaces import *
+from ...ragged import RaggedBatchWrapper
+
+from .containers import Phi3NonTransformerContainer, Phi3TransformerContainer
+
+
+class Phi3InferenceModel(DSTransformerModelBase):
+ """
+    Inference model implementation for ragged batching for Phi-3 models.
+ """
+
+ _non_transformer: Optional[Phi3NonTransformerContainer]
+ """
+ Embed + unembed container. Specializing the type annotation.
+ """
+
+ _transformer: Optional[Iterable[Phi3TransformerContainer]]
+ """
+ Per-layer transformer container. Specializing the type annotation.
+ """
+ """
+ Properties inherited from `DSInferenceModelBase`
+ """
+
+ @property
+ def max_sequence_length(self) -> int:
+ return self._config.max_seq_length
+
+ """
+ Properties inherited from `DSTransformerModelBase`
+ """
+
+ @property
+ def num_layers(self) -> int:
+ return self._config.num_hidden_layers
+
+ @property
+ def model_dim(self) -> int:
+ return self._config.hidden_size
+
+ @property
+ def vocab_size(self) -> int:
+ return self._config.vocab_size
+
+ @property
+ def head_size(self) -> int:
+ return self.model_dim // self.n_heads
+
+ @property
+ def n_heads(self) -> int:
+ return self._config.num_attention_heads
+
+ @property
+ def intermediate_dim(self) -> int:
+ return self._config.intermediate_size
+
+ @property
+ def n_heads_kv(self) -> int:
+ return self._config.num_key_value_heads
+
+ @property
+ def activation_dtype(self) -> DtypeEnum:
+ if self._config.torch_dtype == torch.float16:
+ return DtypeEnum.fp16
+ elif self._config.torch_dtype == torch.bfloat16:
+ return DtypeEnum.bf16
+ else:
+ raise NotImplementedError("Only fp16 and bf16 are supported")
+
+ @property
+ def mlp_activation_fn(self) -> ActivationType:
+ activation = self._config.hidden_act.lower()
+ if activation == "gelu":
+ return ActivationType.GEGLU
+ elif activation == "relu":
+ return ActivationType.ReGLU
+ elif activation == "gegelu":
+ return ActivationType.GEGLU
+ elif activation == "silu":
+ return ActivationType.SiGLU
+ else:
+ raise NotImplementedError(f"Activation {activation} not supported")
+
+ @property
+ def norm_type(self) -> NormTypeEnum:
+ return NormTypeEnum.RMSNorm
+
+ @property
+ def positional_embedding_type(self) -> PositionalEmbeddingType:
+ return PositionalEmbeddingType.rotate_half
+
+ @property
+ def positional_embedding_config(self) -> Optional[RotateHalfConfig]:
+ return RotateHalfConfig(theta_base=self._config.rope_theta)
+
+ """
+ Forward implementations
+ """
+
+ def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor:
+ """
+ Performs the embedding lookup prior to running the transformer of the model.
+
+ Arguments:
+ ragged_batch (RaggedBatchWrapper): The batch to embed.
+
+ Returns:
+ torch.Tensor: The embedded batch.
+ """
+ embed = self.embed(ragged_batch, self._non_transformer.word_emb)
+
+ if embed.shape[-1] != self.model_dim:
+ raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}")
+
+ return embed
+
+ def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor,
+ ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+        Executes one (slightly offset) layer of the transformer. This implementation does a peek-ahead
+ optimization to fuse the layer norm of the next layer into the current layer.
+
+ Arguments:
+ layer_idx (int): The index of the layer to execute.
+ residual (torch.Tensor): The residual tensor from the previous layer.
+ hidden_states (torch.Tensor): The hidden states from the previous layer. This is the
+ hidden states after pre normalization.
+ ragged_batch_info (RaggedBatchWrapper): The batch metadata.
+ """
+ cur_params = self._transformer[layer_idx]
+ kv_cache = self.state_manager.get_cache(layer_idx)
+
+ hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=None)
+ hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info)
+ hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None)
+
+ if self.tp_size > 1:
+ dist.all_reduce(hidden_states, group=self._base_mp_group)
+
+ residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=None)
+
+ hidden_states = self.mlp_1(hidden_states, cur_params.mlp_1_w, b=None)
+ hidden_states = self.mlp_2(hidden_states, cur_params.mlp_2_w, b=None)
+
+ if self.tp_size > 1:
+ dist.all_reduce(hidden_states, group=self._base_mp_group)
+
+ if layer_idx != self.num_layers - 1:
+ next_params = self._transformer[layer_idx + 1]
+ residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=None)
+ else:
+ # On last layer, we just need to perform the residual add. Adding into the residual
+ # here is safe.
+ residual.add_(hidden_states)
+
+ return residual, hidden_states
+
+ def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor:
+ """
+ Performs unembedding of the hidden states to logits. This will only sample the final
+ token of each sequence.
+ """
+ logits = self.unembed(hidden_states,
+ self._non_transformer.word_unembed_w,
+ ragged_batch_info,
+ gamma=self._non_transformer.final_norm_gamma)
+
+ if self.tp_size > 1:
+ comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1]))
+ full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size))
+
+ dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group)
+
+ full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size))
+
+ return full_logits
+ else:
+ return logits
+
+ def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor:
+ residual = self._forward_embed(wrapped_batch)
+
+ residual, hidden_states = self.norm(residual, None, gamma=self._transformer[0].attn_norm_gamma, beta=None)
+
+ for layer_idx in range(self.num_layers):
+ residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states,
+ wrapped_batch)
+
+ return self._forward_unembed(residual, wrapped_batch)
diff --git a/deepspeed/inference/v2/model_implementations/phi3/policy.py b/deepspeed/inference/v2/model_implementations/phi3/policy.py
new file mode 100644
index 0000000000000..a1b445929053a
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/phi3/policy.py
@@ -0,0 +1,30 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Any
+
+from ...config_v2 import RaggedInferenceEngineConfig
+from ..inference_policy_base import ContainerMap, InferenceV2Policy
+from .containers import Phi3NonTransformerContainer, Phi3TransformerContainer
+from .model import Phi3InferenceModel
+
+
+class Phi3Policy(InferenceV2Policy):
+
+ def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> Phi3InferenceModel:
+ return Phi3InferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group)
+
+ def build_container_map(self) -> ContainerMap:
+ map = ContainerMap()
+
+ transformer_containers = [Phi3TransformerContainer(self.model) for _ in range(self.model.num_layers)]
+
+ map.set_transformer_params(['model.layers'], transformer_containers)
+
+ map.set_non_transformer_params(Phi3NonTransformerContainer(self.model))
+
+ map.set_unmapped_params([])
+
+ return map
diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py
new file mode 100644
index 0000000000000..23e06a7700237
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from .policy import Qwen2MoePolicy
diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/container.py b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/container.py
new file mode 100644
index 0000000000000..b4621257ff825
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/container.py
@@ -0,0 +1,103 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# Create a container object to save model-specific tensors using the policy file above.
+
+from ..common_parameters import *
+from ..layer_container_base import LayerContainer
+'''
+ # HF Qwen1.5-MoE-A2.7B model looks like this:
+
+Qwen2MoeForCausalLM(
+ (model): Qwen2MoeModel(
+ (embed_tokens): Embedding(151936, 2048)
+ (layers): ModuleList(
+ (0-23): 24 x Qwen2MoeDecoderLayer(
+ (self_attn): Qwen2MoeSdpaAttention(
+ (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
+ (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
+ (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+ (rotary_emb): Qwen2MoeRotaryEmbedding()
+ )
+ (mlp): Qwen2MoeSparseMoeBlock(
+ (gate): Linear(in_features=2048, out_features=60, bias=False)
+ (experts): ModuleList(
+ (0-59): 60 x Qwen2MoeMLP(
+ (gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
+ (up_proj): Linear(in_features=2048, out_features=1408, bias=False)
+ (down_proj): Linear(in_features=1408, out_features=2048, bias=False)
+ (act_fn): SiLU()
+ )
+ )
+ (shared_expert): Qwen2MoeMLP(
+ (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
+ (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
+ (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
+ (act_fn): SiLU()
+ )
+ (shared_expert_gate): Linear(in_features=2048, out_features=1, bias=False)
+ )
+ (input_layernorm): Qwen2MoeRMSNorm()
+ (post_attention_layernorm): Qwen2MoeRMSNorm()
+ )
+ )
+ (norm): Qwen2MoeRMSNorm()
+ )
+ (lm_head): Linear(in_features=2048, out_features=151936, bias=False)
+)
+'''
+
+
+class Qwen2MoeTransformerContainer(LayerContainer):
+ """
+ Transformer layer container for the Qwen2Moe model.
+ """
+ qkv_w: UnfusedQKVParameter
+ qkv_b: UnfusedQKVParameter
+ attn_out_w: AttentionOutputParameter
+ moe_gate: MoEGatingWeightParameter
+ moe_mlp_1: UnfusedMoEGatedMLPParameter
+ moe_mlp_2: UnfusedMoEMLP2Parameter
+ shared_moe_mlp_1: GatedMLPParameter
+ shared_moe_mlp_2: MLP2Parameter
+ shared_moe_gate: MoEGatingWeightParameter
+ attn_norm_gamma: NormParameter
+ mlp_norm_gamma: NormParameter
+
+ PARAM_MAPPING = {
+ "self_attn.q_proj.weight": "qkv_w.q_params",
+ "self_attn.k_proj.weight": "qkv_w.k_params",
+ "self_attn.v_proj.weight": "qkv_w.v_params",
+ "self_attn.q_proj.bias": "qkv_b.q_params",
+ "self_attn.k_proj.bias": "qkv_b.k_params",
+ "self_attn.v_proj.bias": "qkv_b.v_params",
+ "self_attn.o_proj.weight": "attn_out_w.params",
+ "mlp.gate.weight": "moe_gate.params",
+ "mlp.experts.*.gate_proj.weight": "moe_mlp_1.gating_experts",
+ "mlp.experts.*.up_proj.weight": "moe_mlp_1.up_experts",
+ "mlp.experts.*.down_proj.weight": "moe_mlp_2.experts",
+ "mlp.shared_expert.gate_proj.weight": "shared_moe_mlp_1.gate_params",
+ "mlp.shared_expert.up_proj.weight": "shared_moe_mlp_1.up_params",
+ "mlp.shared_expert.down_proj.weight": "shared_moe_mlp_2.params",
+ "mlp.shared_expert_gate.weight": "shared_moe_gate.params",
+ "input_layernorm.weight": "attn_norm_gamma.params",
+ "post_attention_layernorm.weight": "mlp_norm_gamma.params",
+ }
+
+
+class Qwen2MoeNonTransformerContainer(LayerContainer):
+ """
+ Non-Transformer layer container for the Qwen2Moe model.
+ """
+ word_emb: EmbeddingParameter
+ word_unembed: UnembedParameter
+ final_norm: NormParameter
+
+ PARAM_MAPPING = {
+ "model.embed_tokens.weight": "word_emb.params",
+ "model.norm.weight": "final_norm.params",
+ "lm_head.weight": "word_unembed.params",
+ }
diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/model.py b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/model.py
new file mode 100644
index 0000000000000..7cddbf9783692
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/model.py
@@ -0,0 +1,359 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+
+import deepspeed.comm as dist
+
+from ...allocator import empty_from
+from ...config_v2 import RaggedInferenceEngineConfig
+from ...inference_utils import ActivationType, DtypeEnum
+from ...model_implementations import *
+from ...modules.configs import *
+from ...modules.interfaces import *
+from ...modules import heuristics
+from ...ragged import RaggedBatchWrapper
+from ..inference_model_base import (
+ DSModelImplementationConfig,
+ MPType,
+)
+
+from .container import Qwen2MoeNonTransformerContainer, Qwen2MoeTransformerContainer
+
+
+class Qwen2MoeInferenceModel(DSMoETransformerModelBase):
+ """
+ Inference model implementation for Qwen2MoE models.
+ """
+
+ _non_transformer: Optional[Qwen2MoeNonTransformerContainer]
+ """
+ Embed + unembed container. Specializing the type annotation.
+ """
+
+ _transformer: Optional[Iterable[Qwen2MoeTransformerContainer]]
+ """
+ Per-layer transformer container. Specializing the type annotation.
+ """
+ """
+    Properties inherited from `DSInferenceModelBase`
+ """
+
+ @property
+ def max_sequence_length(self) -> int:
+ return self._config.max_position_embeddings
+
+ """
+    Properties inherited from `DSTransformerModelBase`
+ """
+
+ @property
+ def num_layers(self) -> int:
+ return self._config.num_hidden_layers
+
+ @property
+ def model_dim(self) -> int:
+ return self._config.hidden_size
+
+ @property
+ def vocab_size(self) -> int:
+ return self._config.vocab_size
+
+ @property
+ def head_size(self) -> int:
+ return self.model_dim // self.n_heads
+
+ @property
+ def n_heads(self) -> int:
+ return self._config.num_attention_heads
+
+ @property
+ def intermediate_dim(self) -> int:
+ return self._config.intermediate_size
+
+ @property
+ def n_heads_kv(self) -> int:
+ return self._config.num_key_value_heads
+
+ @property
+ def activation_dtype(self) -> DtypeEnum:
+ # TODO(ZonePG): bf16 inference results may be different from huggingface bf16,
+        # because in rms_norm, Qwen still uses float() instead of bf16
+ # if self._config.torch_dtype == torch.float16:
+ # return DtypeEnum.fp16
+ # elif self._config.torch_dtype == torch.bfloat16:
+ # return DtypeEnum.bf16
+ # else:
+ # raise NotImplementedError("Only fp16 and bf16 are supported")
+ return DtypeEnum.fp16
+
+ @property
+ def mlp_activation_fn(self) -> ActivationType:
+ return ActivationType.SiGLU
+
+ @property
+ def norm_type(self) -> NormTypeEnum:
+ return NormTypeEnum.RMSNorm
+
+ @property
+ def positional_embedding_type(self) -> PositionalEmbeddingType:
+ return PositionalEmbeddingType.rotate_half
+
+ @property
+ def positional_embedding_config(self) -> Optional[RotateHalfConfig]:
+ return RotateHalfConfig(theta_base=self._config.rope_theta)
+
+ """
+ Inherited from `DSMoETransformerModelBase`
+ """
+
+ @property
+ def n_experts(self) -> int:
+ return self._config.num_experts
+
+ @property
+ def n_top_k(self) -> int:
+ return self._config.num_experts_per_tok
+
+ @property
+ def normalize_expert_scores(self) -> bool:
+ return self._config.norm_topk_prob
+
+ def make_moe_layer(self) -> None:
+ """
+ Instantiates the MoE layer for the model. This sets the `self.moe` attribute.
+ """
+ sharded_dim = sharded_intermediate_dim(self.intermediate_dim // self.n_top_k, self.tp_size, self.tp_rank)
+
+ moe_config = DSMoEConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ model_dim=self.model_dim,
+ intermediate_features=sharded_dim,
+ activation=self.mlp_activation_fn,
+ n_experts=self.n_experts,
+ top_k=self.n_top_k,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ normalize_scores=self.normalize_expert_scores,
+ )
+
+ self.moe = heuristics.instantiate_moe(moe_config, self._engine_config)
+
+    ######### Shared expert MLP 1 #########
+    def make_shared_expert_mlp_1_layer(self) -> None:
+        """
+        Instantiates the linear projection layer for the first MLP of the shared expert.
+        This sets the `self.shared_expert_mlp_1` attribute.
+ """
+ shard_size = sharded_intermediate_dim(self.intermediate_dim, self.tp_size, self.tp_rank)
+
+ linear_config = DSLinearConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ in_channels=self.model_dim,
+ out_channels=shard_size,
+ activation=self.mlp_activation_fn,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ )
+
+ self.shared_expert_mlp_1 = heuristics.instantiate_linear(linear_config, self._engine_config)
+
+    ######### Shared expert MLP 2 #########
+    def make_shared_expert_mlp_2_layer(self) -> None:
+        """
+        Instantiates the linear projection layer for the second MLP of the shared expert.
+        This sets the `self.shared_expert_mlp_2` attribute.
+ """
+ shard_size = sharded_intermediate_dim(self.intermediate_dim, self.tp_size, self.tp_rank)
+
+ linear_config = DSLinearConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ in_channels=shard_size,
+ out_channels=self.model_dim,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ )
+
+ self.shared_expert_mlp_2 = heuristics.instantiate_linear(linear_config, self._engine_config)
+
+    ######### Shared expert gate #########
+    def make_shared_expert_gate_layer(self) -> None:
+        """
+        Instantiates the linear projection layer for the shared expert gate.
+        This sets the `self.shared_expert_gate` attribute.
+ """
+ shard_size = sharded_intermediate_dim(self.model_dim, self.tp_size, self.tp_rank)
+
+ linear_config = DSLinearConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ in_channels=shard_size,
+ out_channels=8,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ )
+
+ self.shared_expert_gate = heuristics.instantiate_linear(linear_config, self._engine_config)
+
+ def make_norm_layer(self) -> None:
+ """
+ Instantiates the normalization layer for the model. This sets the `self.norm` attribute.
+
+ TODO(cmikeh2): In the future we'll distinguish between the different norm objects,
+ but for now we'll just use the same one for all of them.
+ """
+ norm_config = DSNormConfig(
+ max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+ type=self.norm_type,
+ channels=self.model_dim,
+ residual_dtype=self.activation_dtype,
+ input_dtype=self.activation_dtype,
+ output_dtype=self.activation_dtype,
+ eps=self._config.rms_norm_eps,
+ )
+
+ self.norm = heuristics.instantiate_pre_norm(norm_config, self._engine_config)
+
+ """
+ Model implementation
+ """
+
+ def __init__(self, config: DSModelImplementationConfig, engine_config: RaggedInferenceEngineConfig,
+ base_mp_group: MPType) -> None:
+ """
+ Base implementation for initialization. By default, this will initialize
+ the traditional components of a transformer model:
+ - Embedding
+ - QKV projection
+ - Self attention
+ - Attention output projection
+ - Feed forward network
+ - Normalization
+ - Unembedding
+
+ Arguments:
+ config (DSModelImplementationConfig): Model-specific configuration. No assumptions
+ should be made about this config that are not closely tied to the specific
+ model implementation.
+ engine_config (RaggedInferenceEngineConfig): Engine configuration.
+ base_mp_group (MPType): Base communication group for Tensor-parallel inference.
+ """
+ super().__init__(config, engine_config, base_mp_group)
+
+ self.make_norm_layer()
+ self.make_qkv_layer()
+ self.make_attn_layer()
+ self.make_attn_out_layer()
+ self.make_moe_layer()
+ self.make_shared_expert_mlp_1_layer()
+ self.make_shared_expert_mlp_2_layer()
+ self.make_shared_expert_gate_layer()
+ self.make_embedding_layer()
+ self.make_unembedding_layer()
+ self._kv_cache_config = None
+
+ """
+ Forward implementations
+ """
+
+ def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor:
+ """
+ Performs the embedding lookup prior to running the transformer of the model.
+
+ Arguments:
+ ragged_batch (RaggedBatchWrapper): The batch to embed.
+
+ Returns:
+ torch.Tensor: The embedded batch.
+ """
+ embed = self.embed(ragged_batch, self._non_transformer.word_emb)
+
+ if embed.shape[-1] != self.model_dim:
+ raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}")
+
+ return embed
+
+ def _forward_transformer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor,
+ ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+        Executes one (slightly offset) layer of the transformer. This implementation does a peek-ahead
+ optimization to fuse the layer norm of the next layer into the current layer.
+
+ Arguments:
+ layer_idx (int): The index of the layer to execute.
+ residual (torch.Tensor): The residual tensor from the previous layer.
+ hidden_states (torch.Tensor): The hidden states from the previous layer. This is the
+ hidden states after pre normalization.
+ ragged_batch_info (RaggedBatchWrapper): The batch metadata.
+ """
+ # TODO(cmikeh2): Distribute ragged_batch_info to all modules
+
+ cur_params = self._transformer[layer_idx]
+ kv_cache = self.state_manager.get_cache(layer_idx)
+
+ hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=cur_params.qkv_b)
+ hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info)
+ hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None)
+
+ if self.tp_size > 1:
+ dist.all_reduce(hidden_states, group=self._base_mp_group)
+
+ residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=None)
+
+ shared_expert_output = self.shared_expert_mlp_1(hidden_states, cur_params.shared_moe_mlp_1, b=None)
+ shared_expert_output = self.shared_expert_mlp_2(shared_expert_output, cur_params.shared_moe_mlp_2, b=None)
+ shared_expert_gate_output = self.shared_expert_gate(hidden_states, cur_params.shared_moe_gate, b=None)[..., :1]
+        # Only the first channel of the gate projection holds the gate logit, so shared_expert_gate_output has shape [..., 1]
+ shared_expert_output.mul_(torch.sigmoid(shared_expert_gate_output))
+ hidden_states = self.moe(hidden_states, ragged_batch_info, cur_params.moe_gate, cur_params.moe_mlp_1,
+ cur_params.moe_mlp_2)
+ hidden_states.add_(shared_expert_output)
+
+ if self.tp_size > 1:
+ dist.all_reduce(hidden_states, group=self._base_mp_group)
+
+ if layer_idx != self.num_layers - 1:
+ next_params = self._transformer[layer_idx + 1]
+ residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=None)
+ else:
+ # On last layer, we just need to perform the residual add. Adding into the residual
+ # here is safe.
+ residual.add_(hidden_states)
+
+ return residual, hidden_states
+
+ def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor:
+ """
+ Performs unembedding of the hidden states to logits. This will only sample the final
+ token of each sequence.
+ """
+ logits = self.unembed(hidden_states,
+ self._non_transformer.word_unembed,
+ ragged_batch_info,
+ gamma=self._non_transformer.final_norm)
+
+ if self.tp_size > 1:
+ comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1]))
+ full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size))
+
+ dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group)
+
+ full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size))
+
+ return full_logits
+ else:
+ return logits
+
+ def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor:
+
+ residual = self._forward_embed(wrapped_batch)
+
+ residual, hidden_states = self.norm(residual, None, self._transformer[0].attn_norm_gamma, beta=None)
+
+ for layer_idx in range(self.num_layers):
+ residual, hidden_states = self._forward_transformer(layer_idx, residual, hidden_states, wrapped_batch)
+
+ return self._forward_unembed(residual, wrapped_batch)
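For reference, the shared-expert path in `_forward_transformer` above combines with the routed experts as moe(x) + sigmoid(gate(x)) * shared_mlp(x). A minimal single-process sketch of that combine, using plain nn.Linear layers and relu as stand-ins for the DeepSpeed kernels and the configured activation, and a single gate channel in place of the padded 8-channel gate projection:

import torch
import torch.nn as nn

model_dim, shared_intermediate, n_tokens = 16, 32, 4   # hypothetical sizes
tokens = torch.randn(n_tokens, model_dim)

shared_mlp_1 = nn.Linear(model_dim, shared_intermediate, bias=False)
shared_mlp_2 = nn.Linear(shared_intermediate, model_dim, bias=False)
shared_gate = nn.Linear(model_dim, 1, bias=False)

def routed_moe(x):
    # Stand-in for self.moe(...), which runs the top-k routed experts.
    return torch.zeros_like(x)

shared_out = shared_mlp_2(torch.relu(shared_mlp_1(tokens)))
shared_out = shared_out * torch.sigmoid(shared_gate(tokens))   # scale by the shared-expert gate
hidden_states = routed_moe(tokens) + shared_out                # add routed and shared outputs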
diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/policy.py b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/policy.py
new file mode 100644
index 0000000000000..630bafe993a88
--- /dev/null
+++ b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/policy.py
@@ -0,0 +1,30 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Any
+
+from ...config_v2 import RaggedInferenceEngineConfig
+from ..inference_policy_base import ContainerMap, InferenceV2Policy
+from .container import Qwen2MoeNonTransformerContainer, Qwen2MoeTransformerContainer
+from .model import Qwen2MoeInferenceModel
+
+
+class Qwen2MoePolicy(InferenceV2Policy):
+
+ def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> Qwen2MoeInferenceModel:
+ return Qwen2MoeInferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group)
+
+ def build_container_map(self) -> ContainerMap:
+ map = ContainerMap()
+
+ transformer_containers = [Qwen2MoeTransformerContainer(self.model) for _ in range(self.model.num_layers)]
+
+ map.set_transformer_params(['model.layers'], transformer_containers)
+
+ map.set_non_transformer_params(Qwen2MoeNonTransformerContainer(self.model))
+
+ map.set_unmapped_params([])
+
+ return map
diff --git a/deepspeed/inference/v2/modules/implementations/moe/cutlass_multi_gemm.py b/deepspeed/inference/v2/modules/implementations/moe/cutlass_multi_gemm.py
index 38c0000d7f781..bd90cbd5d6978 100644
--- a/deepspeed/inference/v2/modules/implementations/moe/cutlass_multi_gemm.py
+++ b/deepspeed/inference/v2/modules/implementations/moe/cutlass_multi_gemm.py
@@ -42,7 +42,7 @@ def supports_config(config: DSMoEConfig) -> bool:
if config.input_dtype != torch.float16 and config.input_dtype != torch.bfloat16:
return False
- if config.top_k != 1 and config.top_k != 2:
+ if config.top_k != 1 and config.top_k != 2 and config.top_k != 4:
return False
return True
diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py
index 0ca89dd0497a9..f140d73ee0d0c 100755
--- a/deepspeed/launcher/runner.py
+++ b/deepspeed/launcher/runner.py
@@ -117,6 +117,12 @@ def parse_args(args=None):
help="(optional) IP address of node 0, will be "
"inferred via 'hostname -I' if not specified.")
+ parser.add_argument("--node_rank",
+ default=-1,
+ type=int,
+ help="ID of each node in the range [0:N). "
+ "Only required when --no_ssh is set.")
+
parser.add_argument("--launcher",
default=PDSH_LAUNCHER,
type=str,
@@ -145,6 +151,10 @@ def parse_args(args=None):
help="Do not pass local_rank as an argument when calling "
"the user's training script.")
+ parser.add_argument("--no_ssh",
+ action="store_true",
+ help="Launch training independently on each node without ssh setup.")
+
parser.add_argument("--no_ssh_check",
action="store_true",
help="Do not perform ssh check in multi-node launcher model")
@@ -393,18 +403,19 @@ def main(args=None):
resource_pool = fetch_hostfile(args.hostfile)
- # respect CUDA_VISIBLE_DEVICES for a single node and no explicit resource filters
- cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
- if not resource_pool and len(cuda_visible_devices):
- detected_str = f"Detected CUDA_VISIBLE_DEVICES={cuda_visible_devices}"
+ # respect VISIBLE_DEVICES for a single node and no explicit resource filters
+ visible_devices_env = get_accelerator().visible_devices_envs()[0]
+ visible_devices = os.environ.get(visible_devices_env, "")
+ if not resource_pool and len(visible_devices):
+ detected_str = f"Detected VISIBLE_DEVICES={visible_devices}"
if len(args.include) or len(args.exclude) or args.num_nodes > 1 or args.num_gpus > 0:
print(
f"{detected_str} but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed."
)
else:
- args.include = f"localhost:{cuda_visible_devices}"
+ args.include = f"localhost:{visible_devices}"
print(f"{detected_str}: setting --include={args.include}")
- del os.environ["CUDA_VISIBLE_DEVICES"]
+ del os.environ[visible_devices_env]
if args.num_nodes >= 0 or args.num_gpus >= 0:
if args.include != "" or args.exclude != "":
@@ -427,7 +438,7 @@ def main(args=None):
env = os.environ.copy()
    # validate that passwordless-ssh is working properly with this hostfile
- if multi_node_exec and not args.no_ssh_check:
+ if multi_node_exec and not args.no_ssh_check and not args.no_ssh:
first_host = list(active_resources.keys())[0]
try:
ssh_check_cmd = "ssh -o PasswordAuthentication=no "
@@ -483,16 +494,22 @@ def main(args=None):
if args.elastic_training:
assert not args.no_local_rank, "--no_local_rank argument is not supported in Elastic training"
+ if args.no_ssh:
+ assert (0 <= args.node_rank <
+ len(active_resources)), "Launching training without ssh, but --node_rank is not set correctly."
+
# encode world info as base64 to make it easier to pass via command line
world_info_base64 = encode_world_info(active_resources)
- multi_node_exec = args.force_multi or len(active_resources) > 1
+ multi_node_exec = (args.force_multi or len(active_resources) > 1) and not args.no_ssh
if not multi_node_exec:
deepspeed_launch = [
sys.executable, "-u", "-m", "deepspeed.launcher.launch", f"--world_info={world_info_base64}",
f"--master_addr={args.master_addr}", f"--master_port={args.master_port}"
]
+ if args.no_ssh:
+ deepspeed_launch.append(f"--node_rank={args.node_rank}")
if args.no_python:
deepspeed_launch.append("--no_python")
if args.module:
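With the flags added above, multi-node training can be launched without passwordless ssh by starting the launcher on every node yourself, each with its own --node_rank. A hypothetical two-node example (hostnames, port, and training script are placeholders; the hostfile is assumed to list node0 and node1):

# on node 0
deepspeed --hostfile=hostfile --no_ssh --node_rank=0 --master_addr=node0 --master_port=29500 train.py
# on node 1
deepspeed --hostfile=hostfile --no_ssh --node_rank=1 --master_addr=node0 --master_port=29500 train.py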
diff --git a/deepspeed/linear/__init__.py b/deepspeed/linear/__init__.py
index a27f1c3eaee7c..9931a95a0a40c 100644
--- a/deepspeed/linear/__init__.py
+++ b/deepspeed/linear/__init__.py
@@ -5,3 +5,4 @@
from .optimized_linear import OptimizedLinear
from .config import LoRAConfig, QuantizationConfig
+from .context_manager import Init, init_lora
diff --git a/deepspeed/linear/config.py b/deepspeed/linear/config.py
index ae9050a3c92b0..2632ce7de9c43 100644
--- a/deepspeed/linear/config.py
+++ b/deepspeed/linear/config.py
@@ -3,7 +3,8 @@
# DeepSpeed Team
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from typing import List
@dataclass
@@ -17,10 +18,19 @@ class LoRAConfig:
base_weight_sharding (int): The degree to which the base weights are sharded,
should typically be set to the data-parallel world size to maximize the memory
reduction benefits. Defaults to 1, which means this feature is disabled.
+        offload (bool): Offload frozen parameters to CPU when not in use.
+        offload_ratio (float): Fraction of the frozen parameters to offload to CPU when not in use.
+        delay_lora_init (bool): Initialize LoRA parameters at model init, or defer so they can be initialized manually later via `init_lora`.
+        target_mods (List[str]): Target module names to apply LoRA to; defaults to the llama-3.1 architecture projections.
"""
lora_r: int = 64
lora_alpha: float = 16.
base_weight_sharding: int = 1
+ offload: bool = False
+ offload_ratio: float = 0.0
+ delay_lora_init: bool = False
+ target_mods: List[str] = field(
+ default_factory=lambda: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'])
@dataclass
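A hypothetical LoRAConfig using the new fields; offload/offload_ratio control how much of the frozen base weight is parked on CPU, and delay_lora_init defers LoRA allocation until init_lora is called (all values here are illustrative only):

from deepspeed.linear import LoRAConfig

lora_config = LoRAConfig(
    lora_r=64,
    lora_alpha=16.0,
    base_weight_sharding=8,   # typically the data-parallel world size
    offload=True,
    offload_ratio=0.5,        # offload half of the eligible frozen parameters to CPU
    delay_lora_init=True,     # call deepspeed.linear.init_lora(model) later
    target_mods=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
)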
diff --git a/deepspeed/linear/context_manager.py b/deepspeed/linear/context_manager.py
new file mode 100644
index 0000000000000..204fa0fe9c1da
--- /dev/null
+++ b/deepspeed/linear/context_manager.py
@@ -0,0 +1,90 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from .optimized_linear import LoRAOptimizedLinear, OptimizedLinear
+
+import torch
+
+try:
+ import transformers
+except ImportError:
+ transformers = None
+
+
+def init_lora(model):
+ model.requires_grad_(False)
+ for m in model.modules():
+ if isinstance(m, LoRAOptimizedLinear):
+ m.init_lora()
+
+
+class Init(object):
+ """
+ Init context wrapper similar in style to zero.Init. Allows for injecting OptimizedLinear during model
+ construction which will shard base weights and reduce overall memory usage during model init. Primarily
+ useful when initializing a model via transformers.AutoModelForCausalLM.
+
+ Example usage:
+ lora_config = deepspeed.linear.LoRAConfig(..)
+ quant_config = deepspeed.linear.QuantizationConfig(..)
+ with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config):
+ model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-405B")
+
+ """
+
+ def __init__(self, lora_config=None, quant_config=None):
+ self._orig_nn_linear = torch.nn.Linear
+ self._orig_causallm_pretrained = None
+        if transformers is not None:
+ self._orig_causallm_pretrained = transformers.AutoModelForCausalLM.from_pretrained
+ self._orig_causallm_config = transformers.AutoModelForCausalLM.from_config
+ self.lora_config = lora_config
+ self.quant_config = quant_config
+ self._post_init_complete = False
+
+ def __enter__(self):
+
+ class OptLinearWrapper:
+ _orig_nn_linear = self._orig_nn_linear
+ _lora_config = self.lora_config
+ _quant_config = self.quant_config
+
+ def __new__(self, *args, **kwargs):
+ self._lora_config.delay_lora_init = True
+ kwargs['lora_config'] = self._lora_config
+ kwargs['quantization_config'] = self._quant_config
+ kwargs['linear_cls'] = self._orig_nn_linear
+ return OptimizedLinear(*args, **kwargs)
+
+ def _model_init(model):
+            if self.lora_config is not None:
+ init_lora(model)
+ self._post_init_complete = True
+ return model
+
+ # ensures non-lora params are frozen and lora weights are initialized
+ def from_pretrained(*args, **kwargs):
+ model = self._orig_causallm_pretrained(*args, **kwargs)
+ return _model_init(model)
+
+ def from_config(*args, **kwargs):
+ model = self._orig_causallm_config(*args, **kwargs)
+ return _model_init(model)
+
+ torch.nn.Linear = OptLinearWrapper
+        if transformers is not None:
+ transformers.AutoModelForCausalLM.from_pretrained = from_pretrained
+ transformers.AutoModelForCausalLM.from_config = from_config
+
+ def __exit__(self, *args, **kwargs):
+ torch.nn.Linear = self._orig_nn_linear
+ if not self._post_init_complete:
+            print('WARNING: LoRA modules were not initialized. This is normally done automatically '
+                  'when the model is created via transformers (AutoModelForCausalLM.from_pretrained/from_config). '
+                  'You must call `init_lora` on each module in order to use DeepSpeed LoRA, otherwise '
+                  'you will hit errors at runtime.')
+ else:
+ transformers.AutoModelForCausalLM.from_pretrained = self._orig_causallm_pretrained
+ transformers.AutoModelForCausalLM.from_config = self._orig_causallm_config
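If a model is built outside the patched transformers entry points (the case the warning above covers), the LoRA modules can be finished off manually with init_lora. A minimal sketch, assuming an accelerator (or CPU) is available and using a toy torch model in place of a real checkpoint:

import torch
import deepspeed.linear

lora_config = deepspeed.linear.LoRAConfig(delay_lora_init=True)

with deepspeed.linear.Init(lora_config=lora_config):
    # torch.nn.Linear is swapped for OptimizedLinear inside this context.
    model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.Linear(16, 16))

# Not constructed via AutoModelForCausalLM, so freeze the base weights and
# allocate the LoRA adapters by hand.
deepspeed.linear.init_lora(model)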
diff --git a/deepspeed/linear/optimized_linear.py b/deepspeed/linear/optimized_linear.py
index e982785a81220..3720196aa2554 100644
--- a/deepspeed/linear/optimized_linear.py
+++ b/deepspeed/linear/optimized_linear.py
@@ -40,7 +40,9 @@ def __new__(self,
bias: bool = False,
lora_config: LoRAConfig = None,
quantization_config: QuantizationConfig = None,
- dtype=torch.bfloat16):
+ device=None,
+ dtype=torch.bfloat16,
+ linear_cls=nn.Linear):
if quantization_config is not None and not is_dataclass(quantization_config):
raise ValueError(f"Expecting QuantizationConfig but received {type(quantization_config)}")
@@ -48,7 +50,7 @@ def __new__(self,
raise ValueError(f"Expecting LoRAConfig but received {type(lora_config)}")
if lora_config is None and quantization_config is None:
# Everything disabled, fall back to normal nn.Linear
- self = nn.Linear(input_dim, output_dim, bias=bias, dtype=dtype)
+ self = linear_cls(input_dim, output_dim, bias=bias, dtype=dtype, device=device)
elif lora_config:
# lora enabled, quantization may or may not be
@@ -57,7 +59,9 @@ def __new__(self,
bias=bias,
lora_config=lora_config,
quantization_config=quantization_config,
- dtype=dtype)
+ dtype=dtype,
+ device=device,
+ linear_cls=linear_cls)
elif quantization_config:
# only quantization enabled, no lora
@@ -78,57 +82,121 @@ def __init__(self,
lora_config: LoRAConfig = None,
quantization_config: QuantizationConfig = None,
device=None,
- dtype=torch.bfloat16):
+ dtype=torch.bfloat16,
+ linear_cls=nn.Linear):
super().__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.bias = bias
self.lora_config = lora_config
self.quantization_config = quantization_config
- device = get_accelerator().current_device_name() if device is None else device
+ self.device = get_accelerator().current_device_name() if device is None else device
+ self.linear_cls = linear_cls
+ self.dtype = dtype
assert self.lora_config is not None, "DSOptimizedLinear requires a LoRA config"
-
+ assert not self.bias, "bias=True is not supported by LoRAOptimizedLinear"
self.zero_shards = self.lora_config.base_weight_sharding
self.sharded_weight_size = int(float(self.input_dim) // self.zero_shards)
- w = torch.nn.Parameter(torch.empty((self.output_dim, self.sharded_weight_size), dtype=dtype))
- torch.nn.init.xavier_uniform_(w)
+ if self.zero_shards > 1:
+            assert self.zero_shards == dist.get_world_size(
+            ), "base weight sharding is only supported when sharding across the full world size"
+ w = torch.nn.Parameter(torch.empty(self.output_dim * self.sharded_weight_size, dtype=dtype),
+ requires_grad=False)
+ else:
+ w = torch.nn.Parameter(torch.empty((self.output_dim, self.input_dim), dtype=dtype), requires_grad=False)
+ torch.nn.init.xavier_uniform_(w.reshape(self.sharded_weight_size, self.output_dim))
if self.quantization_config is not None:
assert dtype == torch.bfloat16, "only bfloat16 is supported when using quantization"
- self.base_weight = QuantizedParameter(w, quantization_config=quantization_config)
+ self.weight = QuantizedParameter(w, quantization_config=quantization_config)
else:
- self.base_weight = w
+ self.weight = w
+
+ self.disabled = False
+ self._initialized = False
+ if not self.lora_config.delay_lora_init:
+ self.init_lora()
+
+ def disable(self):
+ self.disabled = True
+ self.weight = torch.nn.Parameter(torch.empty((self.output_dim, self.input_dim), dtype=self.dtype),
+ requires_grad=False)
+
+ def init_lora(self):
+ if self.disabled:
+ return
+
+ if self.quantization_config is not None:
+ # ensure quant-param wasn't stripped, in some cases transformers will do this during model init
+ if not isinstance(self.weight, QuantizedParameter):
+ self.weight = QuantizedParameter(self.weight, quantization_config=self.quantization_config)
+
+ self._initialized = True
+ self.weight.requires_grad = False
- self.base_weight.requires_grad = False
+ # Mark base weight to prevent broadcast and ensure proper offload behavior
+ self.weight.ds_optim_param = True
+
+ self.lora_scaling_factor = self.lora_config.lora_alpha / self.lora_config.lora_r
- # Use RS lora for now.
- self.lora_scaling_factor = self.lora_config.lora_alpha / math.sqrt(self.lora_config.lora_r)
# Keeping lora weights in bf16 precision for ease of training.
- self.lora_weight_1 = nn.Linear(self.input_dim,
- self.lora_config.lora_r,
- bias=self.bias,
- device=device,
- dtype=dtype)
- self.lora_weight_2 = nn.Linear(self.lora_config.lora_r,
- self.output_dim,
- bias=self.bias,
- device=device,
- dtype=dtype)
+ self.lora_weight_1 = self.linear_cls(self.input_dim,
+ self.lora_config.lora_r,
+ bias=self.bias,
+ device=self.device,
+ dtype=self.dtype)
+ self.lora_weight_2 = self.linear_cls(self.lora_config.lora_r,
+ self.output_dim,
+ bias=self.bias,
+ device=self.device,
+ dtype=self.dtype)
+
+ # initialize "A" with kaiming uniform and "B" with zeros following this
+ # https://github.com/huggingface/peft/blob/62122b5add8d6892f70c82eaef2147a6ba33b90b/src/peft/tuners/lora/layer.py#L155
+ nn.init.kaiming_uniform_(self.lora_weight_1.weight, a=math.sqrt(5))
+ nn.init.zeros_(self.lora_weight_2.weight)
self.lora_weight_1.weight.requires_grad = True
self.lora_weight_2.weight.requires_grad = True
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
+ error_msgs):
+ if not any([target in prefix for target in self.lora_config.target_mods]):
+ # module does not match any target_mods, we must revert to normal nn.Linear via disable
+ self.disable()
+ return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys,
+ unexpected_keys, error_msgs)
+
+ if self.zero_shards > 1:
+ if not dist.is_initialized():
+ raise RuntimeError(
+ "attempting to use optimized linear base weight sharding but torch-distributed is not initialized, please init first."
+ )
+ rank = dist.get_rank()
+ shape_local = self.output_dim * self.sharded_weight_size
+ base_weight_name = f"{prefix}weight"
+ incoming_param = state_dict[base_weight_name]
+ state_dict[base_weight_name] = incoming_param.flatten().narrow(0, rank * shape_local, shape_local)
+
+ return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
+ error_msgs)
+
def full_weight(self):
- # This assumes weights are evenly sharded across gpus. which might not be correct.
- # in that case, we should flatten before all_gather.
- local_weight = self.base_weight.dequantized() if isinstance(self.base_weight,
- QuantizedParameter) else self.base_weight
- tensor_list = [
- torch.zeros_like(local_weight, device=local_weight.device, dtype=local_weight.dtype)
- for _ in range(self.zero_shards)
- ]
- dist.all_gather(tensor_list, local_weight)
- weight = nn.Parameter(torch.cat([tensor for tensor in tensor_list], dim=1))
- return weight
+ base_weight = self.weight
+ if getattr(base_weight, 'ds_offload', False):
+ # move to gpu so we can dequant and all-gather
+ assert base_weight.device == torch.device('cpu'), \
+ f"expected base weight on cpu but found {base_weight.device}"
+ base_weight.offload(revert=True)
+ local_weight = base_weight.dequantized() if isinstance(base_weight, QuantizedParameter) else base_weight
+ base_weight.offload()
+ else:
+ local_weight = base_weight.dequantized() if isinstance(base_weight, QuantizedParameter) else base_weight
+
+ tensor_out = torch.empty(self.output_dim * self.input_dim,
+ dtype=local_weight.dtype,
+ device=local_weight.device)
+ dist.all_gather_into_tensor(tensor_out, local_weight)
+ return tensor_out.reshape(self.output_dim, self.input_dim)
def linear_without_F_linear(self, input, weight):
output = torch.mm(input.reshape(-1, input.shape[-1]), weight)
@@ -136,14 +204,18 @@ def linear_without_F_linear(self, input, weight):
return output
def forward(self, input_tensor):
+ if self.disabled:
+ return F.linear(input_tensor, self.weight)
+ assert self._initialized, "init_lora was never called, please initialize before proceeding"
+
# Gather the sharded base weight
if self.zero_shards > 1:
with torch.no_grad():
base_weight = self.full_weight()
elif self.quantization_config:
- base_weight = self.base_weight.dequantized()
+ base_weight = self.weight.dequantized()
else:
- base_weight = self.base_weight
+ base_weight = self.weight
base_weight_output = F.linear(input_tensor, base_weight)
lora_output = self.lora_weight_2(self.lora_weight_1(input_tensor))
diff --git a/deepspeed/linear/quantization.py b/deepspeed/linear/quantization.py
index f5343af45fb89..70fabea845baa 100644
--- a/deepspeed/linear/quantization.py
+++ b/deepspeed/linear/quantization.py
@@ -57,24 +57,31 @@ def __new__(
def _ensure_quantized(self, tensor: torch.Tensor):
# If the tensor is on the accelerator and is not quantized, then quantize it in-place.
- if get_accelerator().on_accelerator(tensor) and tensor.dtype != torch.int8:
+ if get_accelerator().on_accelerator(tensor) and tensor.dtype != torch.uint8:
with get_accelerator().stream(get_accelerator().current_stream(tensor.device)):
tensor.data = self.quantizer.quantize(tensor.data,
q_bits=self.quantization_config.q_bits,
q_mantisa_bits=self.quantization_config.mantissa_bits)
- assert tensor.dtype == torch.int8
+ assert tensor.dtype == torch.uint8
def dequantized(self) -> torch.Tensor:
"""
Return a tensor containing the dequantized weights of this parameter.
"""
- if get_accelerator().on_accelerator(self.data) and self.data.dtype == torch.int8:
+ if get_accelerator().on_accelerator(self.data) and self.data.dtype == torch.uint8:
with get_accelerator().stream(get_accelerator().current_stream(self.data.device)):
return self.quantizer.dequantize(self.data,
q_bits=self.quantization_config.q_bits,
q_mantisa_bits=self.quantization_config.mantissa_bits)
return self.data
+ def offload(self, revert=False):
+ if getattr(self, 'ds_offload', False):
+ if revert:
+ self.data = self.to(get_accelerator().current_device_name())
+ else:
+ self.data = self.to('cpu')
+
def __getstate__(self):
state = self.__dict__
state["data"] = self.data
@@ -104,7 +111,9 @@ def __copy__(self):
return new_instance
def cuda(self, device=None, non_blocking=False):
- return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)
+ device = "cuda" if device is None else device
+ self.quantizer.to(device, non_blocking=non_blocking)
+ return self.to(device, non_blocking=non_blocking)
def to(self, *args, **kwargs):
"""
@@ -112,6 +121,7 @@ def to(self, *args, **kwargs):
quantize it.
"""
tensor = super().to(*args, **kwargs)
+ self.quantizer.to(*args, **kwargs)
self._ensure_quantized(tensor)
return tensor
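As used by full_weight() in optimized_linear.py above, a parameter tagged with ds_offload can be brought back to the accelerator, dequantized, and parked on CPU again. A hedged sketch of that round trip, where `p` stands for an already-constructed QuantizedParameter that the engine has tagged for offload:

# Hypothetical: `p` is a QuantizedParameter with p.ds_offload = True set by the engine.
p.offload(revert=True)      # move the quantized payload back to the accelerator
full = p.dequantized()      # materialize the bf16 weight for use
p.offload()                 # return the quantized payload to CPU when done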
diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py
index 6f4c170d8295c..e6eea2183de5a 100644
--- a/deepspeed/module_inject/auto_tp.py
+++ b/deepspeed/module_inject/auto_tp.py
@@ -134,7 +134,7 @@ def is_load_module(module):
load_layer_names = [
"LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "FalconLinear",
"MistralRMSNorm", "T5LayerNorm", "MixtralRMSNorm", "Phi3RotaryEmbedding", "Phi3SuScaledRotaryEmbedding",
- "Phi3RMSNorm", "YuanRMSNorm", "YuanRotaryEmbedding", "Phi3LongRoPEScaledRotaryEmbedding"
+ "Phi3RMSNorm", "YuanRMSNorm", "YuanRotaryEmbedding", "Phi3LongRoPEScaledRotaryEmbedding", "Qwen2RMSNorm"
]
return module.__class__ in load_layers or module._get_name() in load_layer_names
@@ -309,6 +309,10 @@ def tp_parser(model):
gem_list = gem_list + [layer]
elif 'self_attn.dense' in layer and 'Phi' in str(type(module)):
gem_list = gem_list + [layer]
+ elif 'self_attention.dense' in layer and 'ChatGLM' in str(model):
+ gem_list = gem_list + [layer]
+ elif 'dense_4h_to_h' in layer and 'ChatGLM' in str(model):
+ gem_list = gem_list + [layer]
layer_list = []
if gem_list != []:
@@ -341,8 +345,8 @@ def _replace(self, child, name, conv_linear_layer):
weight, bias = shard_value_with_share_qk(child.weight.data, child.bias, dist.get_rank(),
dist.get_world_size(), False)
return LinearAllreduce(weight, bias, self.mp_group)
- # for phi3.
- if 'gate_up_proj' in name:
+ # For MLP including chunk layer.
+ if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)):
weight, bias = shard_chunk_mlp(child.weight.data, child.bias, dist.get_rank(), dist.get_world_size())
return LinearLayer(weight=weight, bias=bias)
if name in self.all_reduce_linears:
@@ -424,7 +428,8 @@ def update_mp_params(self, child):
return
param_list = [
"n_heads", "inner_dim", "num_heads", "num_kv", "num_attention_heads", "num_attn_heads", "all_head_size",
- "embed_dim", "hidden_size", "num_key_value_heads", "num_kv_heads", "kv_n_heads", "d_model"
+ "embed_dim", "hidden_size", "num_key_value_heads", "num_kv_heads", "kv_n_heads", "d_model",
+ "num_attention_heads_per_partition", "num_multi_query_groups_per_partition", "hidden_size_per_partition"
]
for param in param_list:
if "Yuan" in str(child) and 'embed_dim' in param_list:
@@ -488,7 +493,10 @@ def _replace_module(self, r_module, prev_name='', prev_class_name=''):
def get_model_num_kv_heads(self, config):
num_kv_heads = None
- kv_head_names = ['num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads']
+ # multi_query_group_num is for chatglm2 & chatglm3
+ kv_head_names = [
+ 'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads'
+ ]
for name in kv_head_names:
if hasattr(config, name):
num_kv_heads = getattr(config, name)
diff --git a/deepspeed/module_inject/containers/llama.py b/deepspeed/module_inject/containers/llama.py
index f6157e5cdfed8..7af333dc1ee47 100644
--- a/deepspeed/module_inject/containers/llama.py
+++ b/deepspeed/module_inject/containers/llama.py
@@ -129,12 +129,10 @@ def __init__(self, client_module, inference=True):
def get_hidden_heads(self):
hidden_heads = (
- getattr(self.client_module.self_attn.q_proj.weight, "ds_shape",
- self.client_module.self_attn.q_proj.weight.shape)[1],
+ self.client_module.self_attn.q_proj.in_features,
self.client_module.self_attn.num_heads,
self.client_module.input_layernorm.variance_epsilon,
- getattr(self.client_module.mlp.gate_proj.weight, "ds_shape",
- self.client_module.mlp.gate_proj.weight.shape)[0],
+ self.client_module.mlp.gate_proj.out_features,
)
return hidden_heads
diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py
index f63cbee44d5cd..0609c6001dd2f 100644
--- a/deepspeed/module_inject/fusedqkv_utils.py
+++ b/deepspeed/module_inject/fusedqkv_utils.py
@@ -67,11 +67,24 @@ def _codegen_type_transpose(input, mp_size, codegen_mp_num=4):
def _glm_type_transpose(input, mp_size):
#input : [3*hidden_dim, hidden_dim](weight) or [3*hidden_dim](bias)
- shape = input.shape
- src_split = torch.split(input, shape[0] // 3, dim=0)
+    # chatglm2 & chatglm3 (kv_heads=2) need special handling.
+ if get_num_kv_heads() == 2:
+ shape = input.shape
+ hidden_dim = get_n_embd()
+ kv_dim = (shape[0] - hidden_dim) // get_num_kv_heads()
+ q = input[:hidden_dim]
+ k = input[hidden_dim:hidden_dim + kv_dim]
+ v = input[hidden_dim + kv_dim:]
+ q_split = q.split(get_shard_size_list(q.shape[0], mp_size), dim=0)
+ k_split = k.split(get_shard_size_list(k.shape[0], mp_size), dim=0)
+ v_split = v.split(get_shard_size_list(v.shape[0], mp_size), dim=0)
+ return torch.cat((q_split[gpu_index], k_split[gpu_index], v_split[gpu_index]), dim=0)
+ else:
+ shape = input.shape
+ src_split = torch.split(input, shape[0] // 3, dim=0)
- split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size_list(shape[0] // 3, mp_size))
- return split_fusedqkv[gpu_index]
+ split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size_list(shape[0] // 3, mp_size))
+ return split_fusedqkv[gpu_index]
def _bloom_type_transpose(input, mp_size):
shape = input.shape
diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py
index 85abd0217039d..64dc5479940c3 100644
--- a/deepspeed/module_inject/replace_module.py
+++ b/deepspeed/module_inject/replace_module.py
@@ -281,7 +281,7 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
# 4.1 Get n_embd
n_embd = None
- multi_query_n_embd_names = ['n_embd']
+ multi_query_n_embd_names = ['n_embd', 'hidden_size']
for name in multi_query_n_embd_names:
if hasattr(model_config, name):
n_embd = getattr(model_config, name)
diff --git a/deepspeed/moe/mappings.py b/deepspeed/moe/mappings.py
index b8a06274343a1..e57f66b85193d 100644
--- a/deepspeed/moe/mappings.py
+++ b/deepspeed/moe/mappings.py
@@ -32,15 +32,23 @@ def _gather_tokens(input_, dim=0):
mpu = deepspeed.utils.groups.mpu
input_ = input_.contiguous()
- # Size and dimension.
- rank = bwc_tensor_model_parallel_rank(mpu)
-
- tensor_list = [torch.empty_like(input_) for _ in range(bwc_tensor_model_parallel_world_size(mpu))]
- tensor_list[rank] = input_
- deepspeed.comm.all_gather(tensor_list, input_, group=bwc_tensor_model_parallel_group(mpu))
+ world_size = bwc_tensor_model_parallel_world_size(mpu)
+ if world_size == 1:
+ return input_
- # Note: torch.cat already creates a contiguous tensor.
- output = torch.cat(tensor_list, dim=dim).contiguous()
+ gather_buffer = torch.empty(world_size * input_.numel(), dtype=input_.dtype, device=input_.device)
+ deepspeed.comm.all_gather_into_tensor(gather_buffer, input_, group=bwc_tensor_model_parallel_group(mpu))
+ if dim == 0:
+ shape = list(input_.size())
+ shape[0] = shape[0] * world_size
+ output = gather_buffer.view(shape)
+ else:
+ tensor_list = [
+ gather_buffer.narrow(0,
+ input_.numel() * i, input_.numel()).view_as(input_) for i in range(world_size)
+ ]
+ # Note: torch.cat already creates a contiguous tensor.
+ output = torch.cat(tensor_list, dim=dim).contiguous()
return output
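The dim == 0 fast path above relies on all_gather_into_tensor laying the per-rank chunks out back to back, so a single view over the flat buffer is equivalent to the old per-rank torch.cat. A small single-process check of that equivalence (torch.cat of flattened chunks stands in for the collective):

import torch

world_size, rows, cols = 4, 2, 3
per_rank = [torch.arange(rows * cols, dtype=torch.float32).reshape(rows, cols) + r for r in range(world_size)]

# Stand-in for deepspeed.comm.all_gather_into_tensor: rank data laid out contiguously.
gather_buffer = torch.cat([t.flatten() for t in per_rank])

shape = [rows * world_size, cols]
assert torch.equal(gather_buffer.view(shape), torch.cat(per_rank, dim=0))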
@@ -50,6 +58,8 @@ def _drop_tokens(input_, dim=0):
mpu = deepspeed.utils.groups.mpu
total_chunks = bwc_tensor_model_parallel_world_size(mpu)
+ if total_chunks == 1:
+ return input_
this_chunk = bwc_tensor_model_parallel_rank(mpu)
assert input_.shape[
dim] % total_chunks == 0, f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})"
diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py
index 96eab5e2ab17e..c09a11e213db0 100644
--- a/deepspeed/moe/sharded_moe.py
+++ b/deepspeed/moe/sharded_moe.py
@@ -124,6 +124,8 @@ def einsum(rule, a, b):
return a.unsqueeze(2) * b.unsqueeze(1)
elif rule == 'se,se->s':
return torch.bmm(a.unsqueeze(1), b.unsqueeze(2)).reshape(-1)
+ elif rule == 'se,sec->sec':
+ return a.unsqueeze(2) * b
elif rule == 'sec,sm->ecm':
s = a.shape[0]
e = a.shape[1]
@@ -191,8 +193,8 @@ def top1gating(logits: Tensor,
if noisy_gate_policy == 'RSample':
logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device)
# everything is in fp32 in this function
- gates = F.softmax(logits, dim=1)
+ gates = F.softmax(logits, dim=1)
capacity = _capacity(gates, torch.tensor(capacity_factor), torch.tensor(min_capacity))
# Create a mask for 1st's expert per token
@@ -369,6 +371,81 @@ def top2gating(logits: Tensor,
return l_aux, combine_weights, dispatch_mask, exp_counts.detach().to('cpu')
+def topkgating(
+ logits: Tensor,
+ k: int,
+ capacity_factor: float,
+ min_capacity: int,
+ drop_tokens: bool = True,
+ ep_group: Union[torch.distributed.ProcessGroup, None] = None,
+ drop_policy: str = "probs",
+) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+ """Implements TopKGating on logits."""
+
+ # everything is in fp32 in this function
+ # get topk gates
+ top_gate, top_idx = torch.topk(logits, k=k, dim=1)
+ # gating decisions
+ gates = F.softmax(logits, dim=1)
+ num_experts = int(gates.shape[1])
+
+ # get topk mask
+ topk_masked_gates = torch.zeros_like(logits).scatter(1, top_idx, top_gate)
+
+ mask = torch.zeros_like(gates, dtype=torch.bool).scatter_(1, top_idx, 1)
+
+ exp_counts = torch.sum(mask, dim=0).detach().to(logits.device)
+
+ # Compute l_aux
+ me = torch.mean(gates, dim=0)
+ ce = torch.mean(mask.float(), dim=0)
+ l_aux = torch.mean(me * ce) * num_experts * num_experts / k
+
+ if drop_tokens:
+ # Calculate configured capacity and remove locations outside capacity from mask
+ capacity = _capacity(gates, torch.tensor(capacity_factor * k), torch.tensor(min_capacity))
+ # update mask and locations by capacity
+
+ if drop_policy == 'probs':
+ capacity_probs, capacity_indices = torch.topk(topk_masked_gates, k=capacity, dim=0, sorted=False)
+ capacity_mask = torch.zeros_like(logits).scatter(0, capacity_indices, 1)
+ mask = torch.logical_and(mask, capacity_mask)
+ locations = torch.cumsum(mask, dim=0) - 1
+
+ elif drop_policy == "position":
+ locations = torch.cumsum(mask, dim=0) - 1
+ mask *= torch.lt(locations, capacity)
+ else:
+ raise ValueError(f"Invalid drop_policy: {drop_policy}")
+
+ else:
+ # Do not drop tokens - set capacity according to current expert assignments
+ new_capacity = torch.max(exp_counts)
+ if ep_group is not None:
+ dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=ep_group)
+ if groups._get_expert_model_parallel_world_size() == 1:
+ # If the non-expert is tensor-parallel, we need to pad the capacity to 'tp'.
+            # This is because we are going to activate drop_tokens() to drop duplicate tokens.
+ tp = 1 if groups.mpu is None else bwc_tensor_model_parallel_world_size(mpu=groups.mpu)
+ new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype)
+ capacity = new_capacity
+
+ # normalize gates
+ gates_masked = gates * mask
+ gates_s = torch.sum(gates_masked, dim=-1, keepdim=True)
+ denom_s = torch.clamp(gates_s, min=torch.finfo(gates_masked.dtype).eps)
+ gates_masked = gates_masked / denom_s
+
+ # dispatch_mask
+ locations_sc = _one_hot_to_float((locations * mask), capacity)
+
+ combine_weights = torch.einsum("se,sec->sec", gates_masked, locations_sc)
+
+ dispatch_mask = combine_weights.bool()
+
+ return l_aux, combine_weights, dispatch_mask, exp_counts
+
+
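As a sanity check on the gating math above (with capacity limits and token dropping left out), the per-token combine weights are just the softmax probabilities masked to the top-k experts and renormalized. A small self-contained sketch:

import torch
import torch.nn.functional as F

k, num_experts, n_tokens = 2, 4, 8
logits = torch.randn(n_tokens, num_experts)

gates = F.softmax(logits, dim=1)
_, top_idx = torch.topk(logits, k=k, dim=1)
mask = torch.zeros_like(gates, dtype=torch.bool).scatter_(1, top_idx, 1)

gates_masked = gates * mask
gates_masked = gates_masked / gates_masked.sum(dim=-1, keepdim=True).clamp_min(torch.finfo(gates.dtype).eps)

# Each token's weights over its k selected experts sum to 1.
assert torch.allclose(gates_masked.sum(dim=-1), torch.ones(n_tokens))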
class TopKGate(Module):
"""Gate module which implements Top2Gating as described in Gshard_.
::
@@ -401,9 +478,6 @@ def __init__(self,
top2_2nd_expert_sampling: bool = True) -> None:
super().__init__()
- # Only top-1 and top-2 are supported at the moment.
- if k != 1 and k != 2:
- raise ValueError('Only top-1 and top-2 gatings are supported.')
self.wg = torch.nn.Linear(model_dim, num_experts, bias=False)
self.ep_group = ep_group
self.k = k
@@ -441,9 +515,13 @@ def forward(self,
self.min_capacity, used_token, self.noisy_gate_policy if self.training else None,
self.drop_tokens, self.use_rts, self.ep_group, use_tutel)
- else:
+ elif self.k == 2:
gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor,
self.min_capacity, self.drop_tokens, self.ep_group, self.top2_2nd_expert_sampling)
+ else:
+ gate_output = topkgating(logits, self.k,
+ self.capacity_factor if self.training else self.eval_capacity_factor,
+ self.min_capacity, self.drop_tokens, self.ep_group)
if self.wall_clock_breakdown:
self.timers(TOPK_GATE_TIMER).stop()
@@ -533,13 +611,18 @@ def forward(self, *input: Tensor, **kwargs: Any) -> Tensor:
if self.wall_clock_breakdown:
self.timers(FIRST_ALLTOALL_TIMER).start()
- if groups._get_expert_model_parallel_world_size() == 1:
- # If the non-expert is tensor-parallel, it will create
+ tensor_model_world_size = bwc_tensor_model_parallel_world_size(groups.mpu)
+ if tensor_model_world_size > 1:
+            # If the non-expert part of the model is tensor-parallel then, regardless of
+            # whether the experts are tensor-parallel, it will create
# duplicate tokens on the tensor-parallel ranks.
- # Since our experts are not tensor-parallel, these duplicates
- # need to be dropped to ensure correctness.
- # this also doubles up as a communication optimization as we are
- # reducing the all-to-all communication volume.
+            # Dropping the duplicates also doubles up as a communication optimization,
+            # since it reduces the all-to-all communication volume.
+            # 1. If the experts are not tensor-parallel, dropping duplicate tokens is
+            #    required for correctness and reduces all-to-all communication.
+            # 2. If the experts are tensor-parallel, dropping duplicate tokens only reduces
+            #    the all-to-all volume; an all-gather is then required before expert
+            #    execution to restore correctness.
dispatched_input = drop_tokens(dispatched_input, dim=1)
dispatched_input = _AllToAll.apply(self.ep_group, dispatched_input)
@@ -548,10 +631,22 @@ def forward(self, *input: Tensor, **kwargs: Any) -> Tensor:
self.timers(FIRST_ALLTOALL_TIMER).stop()
self.time_falltoall = self.timers(FIRST_ALLTOALL_TIMER).elapsed(reset=False)
+ if tensor_model_world_size > 1 and groups._get_expert_model_parallel_world_size() > 1:
+            # If both the experts and the non-expert part are tensor-parallel, the dropped
+            # duplicate tokens need to be gathered again on each tensor-parallel rank
+            # to ensure correctness.
+ dispatched_input = gather_tokens(dispatched_input, dim=1)
+
# Re-shape after all-to-all: ecm -> gecm
dispatched_input = dispatched_input.reshape(self.ep_size, self.num_local_experts, -1, d_model)
-
expert_output = self.experts(dispatched_input)
+ # Re-shape before drop_tokens: gecm -> ecm
+ expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, -1, d_model)
+ if tensor_model_world_size > 1 and groups._get_expert_model_parallel_world_size() > 1:
+            # If both the experts and the non-expert part are tensor-parallel, drop the
+            # duplicate tokens again to ensure correctness and to reduce the second
+            # all-to-all communication volume.
+ expert_output = drop_tokens(expert_output, dim=1)
if self.wall_clock_breakdown:
self.timers(SECOND_ALLTOALL_TIMER).start()
@@ -562,10 +657,7 @@ def forward(self, *input: Tensor, **kwargs: Any) -> Tensor:
self.timers(SECOND_ALLTOALL_TIMER).stop()
self.time_salltoall = self.timers(SECOND_ALLTOALL_TIMER).elapsed(reset=False)
- # Re-shape back: gecm -> ecm
- expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, -1, d_model)
-
- if groups._get_expert_model_parallel_world_size() == 1:
+ if tensor_model_world_size > 1:
# the dropped duplicate tokens need to be gathered on each
# tensor parallel rank again for the tensor-parallel
# non-expert of the next layer.
diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py
index 7ea5ce5af19eb..15179984173c1 100755
--- a/deepspeed/ops/__init__.py
+++ b/deepspeed/ops/__init__.py
@@ -9,7 +9,7 @@
from . import lion
from . import sparse_attention
from . import transformer
-
+from . import fp_quantizer
from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
from ..git_version_info import compatible_ops as __compatible_ops__
diff --git a/deepspeed/ops/fp_quantizer/__init__.py b/deepspeed/ops/fp_quantizer/__init__.py
index 995bbae4aeaf7..51377bc6092c0 100644
--- a/deepspeed/ops/fp_quantizer/__init__.py
+++ b/deepspeed/ops/fp_quantizer/__init__.py
@@ -4,3 +4,9 @@
# DeepSpeed Team
from .quantize import FP_Quantize, Quantizer
+
+try:
+ import triton
+ from .fp8_gemm import matmul_fp8
+except ImportError:
+ pass
diff --git a/deepspeed/ops/fp_quantizer/fp8_gemm.py b/deepspeed/ops/fp_quantizer/fp8_gemm.py
new file mode 100644
index 0000000000000..55504e3af8c95
--- /dev/null
+++ b/deepspeed/ops/fp_quantizer/fp8_gemm.py
@@ -0,0 +1,171 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+######## Fused FP8 GeMM kernels ########
+# These kernels fuse the GeMM with
+# dequantization of fp8 weight data
+# when using 16-bit activations.
+########################################
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def matmul_kernel_fp8_bf16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk,
+ stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
+ BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr,
+ quantization_group_size: tl.constexpr):
+ pid = tl.program_id(axis=0)
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
+ group_id = pid // num_pid_in_group
+ first_pid_m = group_id * GROUP_SIZE_M
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+ pid_n = (pid % num_pid_in_group) // group_size_m
+
+ offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+ offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+ inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+ weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+ weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + (
+ (pid_n * BLOCK_SIZE_N) // quantization_group_size)
+
+ weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0)
+ scale = tl.load(scale_ptr + weight_ptrs_offset)
+
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+ inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+ # Dequantize weight (fp8 -> bf16)
+ w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 4)).to(tl.uint16)
+ w = (w + 0x3C00).to(tl.uint16)
+ w = (w.to(tl.bfloat16, bitcast=True) * scale).to(tl.bfloat16)
+
+ inp_data += BLOCK_SIZE_K * stride_ak
+ weight_data += BLOCK_SIZE_K * stride_bk
+ weight_mask = offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K
+ weight = tl.load(weight_data, mask=weight_mask, other=0.0)
+ scale = tl.load(scale_ptr + (weight_ptrs_offset +
+ (((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size)),
+ mask=weight_mask,
+ other=0.0)
+
+ accumulator += tl.dot(inp, w)
+
+ out = accumulator.to(tl.bfloat16)
+
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+ offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+ out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+ tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N))
+
+
+@triton.jit
+def matmul_kernel_fp8_fp16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk,
+ stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
+ BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr,
+ quantization_group_size: tl.constexpr):
+ pid = tl.program_id(axis=0)
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+ num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
+ group_id = pid // num_pid_in_group
+ first_pid_m = group_id * GROUP_SIZE_M
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+ pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+ pid_n = (pid % num_pid_in_group) // group_size_m
+
+ offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+ offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+ inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+ weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+ weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + (
+ (pid_n * BLOCK_SIZE_N) // quantization_group_size)
+
+ weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0)
+ scale = tl.load(scale_ptr + weight_ptrs_offset)
+
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+ inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+ # Dequantize weight (fp8 -> fp16)
+ w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 7)).to(tl.uint16)
+ w = (w + 0x2000).to(tl.uint16)
+ w = (w.to(tl.float16, bitcast=True) * scale).to(tl.float16)
+
+ inp_data += BLOCK_SIZE_K * stride_ak
+ weight_data += BLOCK_SIZE_K * stride_bk
+
+ weight = tl.load(weight_data, mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, other=0.0)
+ scale = tl.load(scale_ptr + (weight_ptrs_offset +
+ (((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size)))
+
+ accumulator += tl.dot(inp, w)
+
+ out = accumulator.to(tl.float16)
+
+ offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+ offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+ out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+ tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N))
+
+
+def matmul_fp8(inp, weight, scale, quantization_group_size):
+
+ assert inp.shape[1] == weight.shape[0], \
+ f"Incompatible dimensions (input: {inp.shape}, weight: {weight.shape})"
+
+ M, K = inp.shape
+ K, N = weight.shape
+
+ out = torch.empty((M, N), device=inp.device, dtype=inp.dtype)
+
+ # GEMM tuning parameters!
+ # TODO: Add a more configurable tuning for selecting the best GeMM
+ BLOCK_SIZE_M = 16 if M <= 16 else 32 if M <= 32 else 64 if M <= 64 else 128
+ BLOCK_SIZE_N = 64
+ BLOCK_SIZE_K = max(64, quantization_group_size)
+ GROUP_SIZE_M = 8
+ num_stages = 4
+ num_warps = 4
+ if M >= 256:
+ BLOCK_SIZE_M = 256
+ BLOCK_SIZE_N = 128
+ BLOCK_SIZE_K = max(128, quantization_group_size)
+ num_stages = 3
+ num_warps = 8
+
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
+ kernel = matmul_kernel_fp8_bf16 if inp.dtype == torch.bfloat16 else matmul_kernel_fp8_fp16
+ kernel[grid](inp,
+ weight,
+ out,
+ scale,
+ M,
+ N,
+ K,
+ inp.stride(0),
+ inp.stride(1),
+ weight.stride(0),
+ weight.stride(1),
+ out.stride(0),
+ out.stride(1),
+ quantization_group_size=quantization_group_size,
+ BLOCK_SIZE_M=BLOCK_SIZE_M,
+ BLOCK_SIZE_N=BLOCK_SIZE_N,
+ BLOCK_SIZE_K=BLOCK_SIZE_K,
+ GROUP_SIZE_M=GROUP_SIZE_M,
+ num_stages=num_stages,
+ num_warps=num_warps)
+ return out
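A hedged sketch of calling matmul_fp8 directly (requires a CUDA device with triton). In practice weight_q and scale come from FP_Quantize.quantize(..., return_meta_tensor=True); the random uint8 weights and unit scales below are placeholders only to show the expected shapes and dtypes:

import torch
from deepspeed.ops.fp_quantizer import matmul_fp8   # only importable when triton is present

M, K, N, group_size = 32, 4096, 4096, 512

inp = torch.randn(M, K, dtype=torch.bfloat16, device='cuda')                  # bf16 (or fp16) activations
weight_q = torch.randint(0, 256, (K, N), dtype=torch.uint8, device='cuda')    # fp8 weight payload stored as uint8
scale = torch.ones(K * N // group_size, dtype=torch.bfloat16, device='cuda')  # one scale per quantization group

out = matmul_fp8(inp, weight_q, scale, quantization_group_size=group_size)    # (M, N) in the activation dtype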
diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py
index f8435bda16c17..edd4ef57302cc 100644
--- a/deepspeed/ops/fp_quantizer/quantize.py
+++ b/deepspeed/ops/fp_quantizer/quantize.py
@@ -7,7 +7,9 @@
import abc
from abc import ABC
+import gc
from deepspeed.ops.op_builder import FPQuantizerBuilder
+from deepspeed.accelerator import get_accelerator
fp_quant_module = None
@@ -71,15 +73,34 @@ def quantize(self,
else:
assert (0), \
f"Missing {q_bits}-quantization, please add the template arguments for the kernel to support this precision!"
-
- out = fp_quant_module.quantize(input, self.group_size, stochastic_mode, q_bits, q_mantisa_bits)
-
+ self.num_groups = input.numel() // self.group_size
+ self.input_q = torch.ones(self.num_groups,
+ int(self.group_size * q_bits) // 8 + 4,
+ dtype=torch.uint8,
+ device=input.device)
+ out = fp_quant_module.quantize(self.input_q, input, self.group_size, stochastic_mode, q_bits, q_mantisa_bits)
if return_meta_tensor:
- data, scale = out.split(self.group_size, dim=-1)
- return data.contiguous().reshape(input.shape), scale.contiguous()
+ data, self.scale = out.split(self.group_size, dim=-1)
+ data = data.contiguous().reshape(input.shape)
+ self.scale = self.scale.contiguous()
+ del self.input_q
+ del out
+ gc.collect()
+ get_accelerator().empty_cache()
+ return data, self.scale
return out
+ def to(self, *args, **kwargs):
+ # Intermediate tensors may need to be moved to different devices
+ if hasattr(self, 'input_q'):
+ self.input_q = self.input_q.to(*args, **kwargs)
+ if hasattr(self, 'scale'):
+ self.scale = self.scale.to(*args, **kwargs)
+
+ def get_scales(self):
+ return fp_quant_module.get_scales(self.scale, self.num_groups)
+
def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=None) -> torch.Tensor:
assert (self.orig_dtype is not None), \
"[De-quantization Error]: you need to call quantize before dequantizing!"
@@ -101,7 +122,6 @@ def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=Non
assert input_q.numel() == fp_out.numel(), \
f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!'
input_q = torch.cat([input_q.reshape(-1, self.group_size), scale], dim=-1).contiguous()
-
fp_quant_module.dequantize(fp_out, input_q, self.group_size, q_mantisa_bits, q_bits - q_mantisa_bits - 1)
return fp_out
diff --git a/deepspeed/ops/gds/__init__.py b/deepspeed/ops/gds/__init__.py
new file mode 100755
index 0000000000000..3c0762c810766
--- /dev/null
+++ b/deepspeed/ops/gds/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from ..op_builder import GDSBuilder
diff --git a/deepspeed/ops/transformer/inference/moe_inference.py b/deepspeed/ops/transformer/inference/moe_inference.py
index 8766b65e866db..fc001a86d42e5 100644
--- a/deepspeed/ops/transformer/inference/moe_inference.py
+++ b/deepspeed/ops/transformer/inference/moe_inference.py
@@ -327,7 +327,7 @@ def forward(self,
if self.expert_mp_group is not None:
world_size = dist.get_world_size(group=self.expert_mp_group)
- gather_buffer = torch.zeros(world_size * attention_output.numel(),
+ gather_buffer = torch.empty(world_size * attention_output.numel(),
dtype=attention_output.dtype,
device=attention_output.device)
dist.all_gather_into_tensor(gather_buffer, attention_output, group=self.expert_mp_group)
diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index 529931ca0df12..f955cf5ebcad6 100644
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -965,8 +965,9 @@ def after_backward_hook(_nonuse_grads):
with torch.autograd.graph.saved_tensors_hooks(checkpoint_pack, checkpoint_unpack):
outputs = function(*inputs_cuda)
- for leaf_tensor in leaf_tensors:
- leaf_tensor.register_hook(after_backward_hook)
+ if PROFILE_TIME or SYNCHRONIZE:
+ for leaf_tensor in leaf_tensors:
+ leaf_tensor.register_hook(after_backward_hook)
see_memory_usage("After running forward on the layer", force=False)
diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py
index 965b446163ec4..325188f029317 100644
--- a/deepspeed/runtime/bf16_optimizer.py
+++ b/deepspeed/runtime/bf16_optimizer.py
@@ -540,6 +540,11 @@ def param_groups(self):
"""Forward the wrapped optimizer's parameters."""
return self.optimizer.param_groups
+ @property
+ def state(self):
+ """Forward the wrapped optimizer's states."""
+ return self.optimizer.state
+
def accumulate_hp_grads_and_remove_lp(self, lp_param, group_idx, param_idx):
assert self.immediate_grad_update
self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=True)
diff --git a/deepspeed/runtime/eigenvalue.py b/deepspeed/runtime/eigenvalue.py
index df63854dd1ca0..36300eb904ddc 100755
--- a/deepspeed/runtime/eigenvalue.py
+++ b/deepspeed/runtime/eigenvalue.py
@@ -7,6 +7,7 @@
from deepspeed.utils import log_dist
import numpy as np
import logging
+from deepspeed.utils.torch import required_torch_version
class Eigenvalue(object):
@@ -36,12 +37,15 @@ def __init__(self,
ranks=[0])
# Replace all nan/pos-inf/neg-inf to zero
- # TODO: Pytorch new version may add this function, replace this one by then.
def nan_to_num(self, x):
- device = x.device
- x = x.cpu().numpy()
- x = np.nan_to_num(x=x, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
- return torch.from_numpy(x).to(device)
+ if required_torch_version(min_version=1.8):
+ return torch.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)
+ else:
+ # Fallback to numpy based implementation for backwards-compatibility with PyTorch 1.7 or older versions.
+ device = x.device
+ x = x.cpu().numpy()
+ x = np.nan_to_num(x=x, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
+ return torch.from_numpy(x).to(device)
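torch.nan_to_num has shipped since PyTorch 1.8, so the numpy path is only a fallback for older versions. A quick check of the fast path:

import torch

x = torch.tensor([1.0, float('nan'), float('inf'), float('-inf')])
print(torch.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0))   # tensor([1., 0., 0., 0.])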
def normalize(self, v):
norm_squared = self.inner_product(v, v)
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index d40141132aaf3..27d294b3ae015 100644
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -35,6 +35,8 @@
from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer
from deepspeed.runtime.bf16_optimizer import BF16_Optimizer
+from deepspeed.linear.optimized_linear import LoRAOptimizedLinear
+
from deepspeed.runtime.config import DEEPSPEED_OPTIMIZERS, \
ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \
TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT, ZERO_ONE_ADAM_OPTIMIZER, MUADAM_OPTIMIZER, MUADAMW_OPTIMIZER, \
@@ -304,7 +306,7 @@ def __init__(self,
if has_optimizer:
self._configure_optimizer(optimizer, model_parameters)
- self._configure_lr_scheduler(lr_scheduler)
+ self._configure_lr_scheduler()
self._report_progress(0)
elif self.zero_optimization():
# no optim selected but zero is enabled
@@ -326,6 +328,8 @@ def __init__(self,
self.sparse_tensor_module_names.add(name + ".weight")
logger.info("Will convert {} to sparse tensor during training".format(name))
+ self._optimized_linear_offload_setup()
+
self.save_non_zero_checkpoint = False
self.save_zero_checkpoint = False
if not isinstance(self.optimizer, DeepSpeedZeRoOffload):
@@ -363,6 +367,43 @@ def __init__(self,
self._is_compiled = False
+ def _optimized_linear_offload_setup(self):
+ self.optimized_linear_base_weight_sharding = False
+ self.optimized_linear_lora_enabled = False
+ offload_ratio = None
+ for _, module in self.module.named_modules():
+ if isinstance(module, LoRAOptimizedLinear):
+ self.optimized_linear_lora_enabled = True
+ if offload_ratio is not None:
+ assert offload_ratio == module.lora_config.offload_ratio, \
+ "all lora_config offload ratios should be the same across the model"
+ offload_ratio = module.lora_config.offload_ratio
+ if module.zero_shards > 1:
+ # set attr so checkpoint saving can handle BWS properly
+ self.optimized_linear_base_weight_sharding = True
+
+ if offload_ratio is None:
+ # Nothing enabled, do nothing
+ return
+
+ total_params = 0
+ for _, p in self.module.named_parameters():
+ if hasattr(p, 'ds_optim_param'):
+ total_params += p.numel()
+
+ offload_limit = total_params * offload_ratio
+ logger.info(f'offloading {offload_ratio*100}% of eligible params, specifically {offload_limit} params')
+ total_offloaded = 0
+ for _, p in self.module.named_parameters():
+ if hasattr(p, 'ds_optim_param'):
+ if total_offloaded < offload_limit:
+ total_offloaded += p.numel()
+ p.ds_offload = True
+ p.offload()
+ else:
+ p.ds_offload = False
+
def destroy(self):
if self.optimizer is not None and hasattr(self.optimizer, 'destroy'):
self.optimizer.destroy()
@@ -902,19 +943,19 @@ def _optimizer_has_ckpt_event_prologue(self):
def _optimizer_has_ckpt_event_epilogue(self):
return self.optimizer is not None and hasattr(self.optimizer, 'checkpoint_event_epilogue')
- def _configure_lr_scheduler(self, client_lr_scheduler):
- # First check for scheduler in json configuration
- lr_scheduler = self._scheduler_from_config(self.optimizer)
- if lr_scheduler:
- log_dist(f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}", ranks=[0])
- self.lr_scheduler = lr_scheduler
- else:
- if isinstance(client_lr_scheduler, Callable):
+ def _configure_lr_scheduler(self):
+ if self.client_lr_scheduler:
+ if isinstance(self.client_lr_scheduler, Callable):
log_dist('DeepSpeed using client callable to create LR scheduler', ranks=[0])
- self.lr_scheduler = client_lr_scheduler(self.basic_optimizer)
+ self.lr_scheduler = self.client_lr_scheduler(self.basic_optimizer)
else:
log_dist('DeepSpeed using client LR scheduler', ranks=[0])
- self.lr_scheduler = client_lr_scheduler
+ self.lr_scheduler = self.client_lr_scheduler
+ else:
+ # load the LR scheduler from the JSON configuration if the client did not pass one in
+ lr_scheduler = self._scheduler_from_config(self.optimizer)
+ log_dist(f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}", ranks=[0])
+ self.lr_scheduler = lr_scheduler
log_dist(f'DeepSpeed LR Scheduler = {self.lr_scheduler}', ranks=[0])
@@ -968,13 +1009,13 @@ def _set_distributed_vars(self, args):
device_rank = args.device_rank if args is not None and hasattr(args, 'device_rank') else self.local_rank
if device_rank >= 0:
get_accelerator().set_device(device_rank)
- self.device = torch.device(get_accelerator().device_name(), device_rank)
+ self.device = torch.device(get_accelerator().device_name(device_rank))
self.world_size = dist.get_world_size()
self.global_rank = dist.get_rank()
else:
self.world_size = 1
self.global_rank = 0
- self.device = torch.device(get_accelerator().device_name())
+ self.device = get_accelerator().device()
# Configure based on command line arguments
def _configure_with_arguments(self, args, mpu):
@@ -1054,9 +1095,12 @@ def _broadcast_model(self):
def is_replicated(p):
if hasattr(p, "ds_status") and p.ds_status is not ZeroParamStatus.AVAILABLE:
return False
+ elif hasattr(p, 'ds_optim_param'):
+ # do not broadcast OptimizedLinear parameters, they are unique per base weight shard
+ return False
return True
- for p in self.module.parameters():
+ for n, p in self.module.named_parameters():
# Broadcast the model for different parameters
if is_moe_param(p):
if torch.is_tensor(p) and is_replicated(p):
diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
index 419dd63ec2b9a..73ff0dd0832f4 100644
--- a/deepspeed/runtime/pipe/engine.py
+++ b/deepspeed/runtime/pipe/engine.py
@@ -5,6 +5,8 @@
from types import MethodType
from collections import OrderedDict
+from functools import reduce
+from operator import mul
import torch
from deepspeed import comm as dist
@@ -40,6 +42,9 @@
PIPE_RECV_INPUT_TIMER = 'pipe_recv_input'
PIPE_RECV_GRAD_TIMER = 'pipe_recv_grad'
+# Size (in int32 elements) of the buffer used to exchange tensor metadata between pipeline stages.
+TENSOR_META_SIZE = 256
+
def is_even(number):
return number % 2 == 0
@@ -179,6 +184,7 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs):
}
self.pipe_recv_buf = None
self.grad_layer = None
+ self._grad_layer_buf = []
self.meta_buffer = None
@@ -213,6 +219,8 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs):
self.module.activation_checkpoint_func = ds_checkpointing.non_reentrant_checkpoint
if self.grid.get_global_rank() == 0:
logger.info(f'CONFIG: activation_checkpoint_func=non_reentrant_checkpoint')
+ if self.module.activation_checkpoint_interval > 0:
+ self.module._precompute_checkpointable_values()
self.module.checkpoint_parallel_write_pipeline = self._config.checkpoint_parallel_write_pipeline
@@ -248,6 +256,8 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs):
self.timers(STEP_MICRO_TIMER).start()
self.timers(STEP_MICRO_TIMER).stop()
+ self.dynamic_shape = self.module.dynamic_shape
+
def set_has_attention_mask(self, value):
assert isinstance(value, bool)
self.has_attention_mask = value
@@ -316,6 +326,7 @@ def reset_activation_shape(self):
self.first_output_send = True
self.pipe_recv_buf = None
self.grad_layer = None
+ self._grad_layer_buf = []
self.meta_buffer = None
self.pipe_partition_input_meta_cache = None
@@ -924,51 +935,38 @@ def _send_tensor_meta(self, buffer, recv_stage):
* ndims
* shape
"""
- send_bytes = 0
+ meta_buffer = torch.empty(TENSOR_META_SIZE, dtype=torch.int32, device=self.device)
if isinstance(buffer, torch.Tensor):
- type_tensor = torch.LongTensor(data=[0]).to(self.device)
- p2p.send(type_tensor, recv_stage)
- send_shape = torch.LongTensor(data=buffer.size()).to(self.device)
- send_ndims = torch.LongTensor(data=[len(buffer.size())]).to(self.device)
- p2p.send(send_ndims, recv_stage)
- p2p.send(send_shape, recv_stage)
- send_bytes += _tensor_bytes(buffer)
- elif isinstance(buffer, list):
- assert (False)
- type_tensor = torch.LongTensor(data=[1]).to(self.device)
- p2p.send(type_tensor, recv_stage)
- count_tensor = torch.LongTensor(data=[len(buffer)]).to(self.device)
- p2p.send(count_tensor, recv_stage)
- for tensor in buffer:
- assert isinstance(tensor, torch.Tensor)
- send_shape = torch.LongTensor(data=tensor.size()).to(self.device)
- send_ndims = torch.LongTensor(data=[len(tensor.size())]).to(self.device)
- p2p.send(send_ndims, recv_stage)
- p2p.send(send_shape, recv_stage)
- send_bytes += _tensor_bytes(tensor)
+ meta_buf_list = [
+ 0, # type of data (0: tensor, 1: list (unused), 2: tuple)
+ self.DTYPE_TO_ID[buffer.dtype], # dtype
+ len(buffer.size()) # ndims
+ ]
+ meta_buf_list.extend(buffer.size())
+ assert len(
+ meta_buf_list
+ ) <= TENSOR_META_SIZE, f"Buffer for metadata is too small. Current buffer size: {TENSOR_META_SIZE} but required {len(meta_buf_list)}"
+ meta_buffer[:len(meta_buf_list)].copy_(torch.tensor(meta_buf_list, dtype=torch.int32))
+ p2p.send(meta_buffer, recv_stage)
+
elif isinstance(buffer, tuple):
- type_tensor = torch.LongTensor(data=[2]).to(self.device)
- p2p.send(type_tensor, recv_stage)
- count_tensor = torch.LongTensor(data=[len(buffer)]).to(self.device)
- p2p.send(count_tensor, recv_stage)
- for idx, tensor in enumerate(buffer):
+ meta_buf_list = [
+ 2, # type of data (0: tensor, 1: list (unused), 2: tuple)
+ len(buffer) # num_tensors
+ ]
+
+ for tensor in buffer:
assert isinstance(tensor, torch.Tensor)
- send_shape = torch.LongTensor(data=tensor.size()).to(self.device)
- send_ndims = torch.LongTensor(data=[len(tensor.size())]).to(self.device)
- send_dtype = torch.LongTensor(data=[self.DTYPE_TO_ID[tensor.dtype]]).to(self.device)
- p2p.send(send_dtype, recv_stage)
- p2p.send(send_ndims, recv_stage)
- p2p.send(send_shape, recv_stage)
- # Useful for performance debugging.
- '''
- new_bytes = _tensor_bytes(tensor)
- send_bytes += _tensor_bytes(tensor)
- # Useful for performance debugging.
- if self.grid.data_parallel_id == 0:
- print(
- f'STAGE={self.stage_id} pipe-send-volume[{idx}]: shape={send_shape} {new_bytes/1024**2:0.2f}MB'
- )
- '''
+ meta_buf_list.append(self.DTYPE_TO_ID[tensor.dtype])
+ meta_buf_list.append(len(tensor.size()))
+ meta_buf_list.extend(tensor.size())
+
+ assert len(
+ meta_buf_list
+ ) <= TENSOR_META_SIZE, f"Buffer for metadata is too small. Current buffer size: {TENSOR_META_SIZE} but required {len(meta_buf_list)}"
+ meta_buffer[:len(meta_buf_list)].copy_(torch.tensor(meta_buf_list, dtype=torch.int32))
+ p2p.send(meta_buffer, recv_stage)
+
else:
raise NotImplementedError(f'Could not send meta type {type(buffer)}')
@@ -981,49 +979,35 @@ def _send_tensor_meta(self, buffer, recv_stage):
def _recv_tensor_meta(self, send_stage):
"""Receive metadata about upcoming p2p transfers and return allocated buffers.
- Metadata is communicated in this order:
- * type (0: tensor, 1: list)
- * num_tensors if type=list
- foreach tensor in buffer:
- * ndims
- * shape
-
Returns:
Allocated buffer for receiving from send_stage.
"""
+ buffer = torch.empty(TENSOR_META_SIZE, dtype=torch.int32, device=self.device)
+ p2p.recv(buffer, send_stage)
- type_tensor = torch.LongTensor(data=[0]).to(self.device)
- p2p.recv(type_tensor, send_stage)
- recv_type = type_tensor.item()
+ recv_type = buffer[0].item()
# A single tensor will be sent.
if recv_type == 0:
- recv_ndims = torch.LongTensor(data=[0]).to(self.device)
- p2p.recv(recv_ndims, send_stage)
- recv_ndims = recv_ndims.item()
- recv_shape = torch.LongTensor([1] * recv_ndims).to(self.device)
- p2p.recv(recv_shape, send_stage)
- recv_shape = recv_shape.tolist()
- return self._allocate_buffer(recv_shape, num_buffers=1)[0]
-
- # List or tuple of tensors
+ recv_dtype = self.ID_TO_DTYPE[buffer[1].item()]
+ recv_ndims = buffer[2].item()
+ recv_shape = buffer[3:3 + recv_ndims].tolist()
+ return self._allocate_or_extend_buffers(0, recv_shape, recv_dtype)
+
+ # List or tuple of tensors (recv_type == 1 (list) is currently unused)
elif recv_type == 1 or recv_type == 2:
- count_tensor = torch.LongTensor(data=[0]).to(self.device)
- p2p.recv(count_tensor, send_stage)
- num_tensors = count_tensor.item()
- recv_shapes_and_dtypes = []
+ num_tensors = buffer[1].item()
+
+ buffers = []
+ offset = 2
for idx in range(num_tensors):
- recv_dtype = torch.LongTensor(data=[0]).to(self.device)
- p2p.recv(recv_dtype, send_stage)
- recv_dtype = self.ID_TO_DTYPE[recv_dtype.item()]
- recv_ndims = torch.LongTensor(data=[0]).to(self.device)
- p2p.recv(recv_ndims, send_stage)
- recv_ndims = recv_ndims.item()
- recv_shape = torch.LongTensor([1] * recv_ndims).to(self.device)
- p2p.recv(recv_shape, send_stage)
- recv_shapes_and_dtypes.append((recv_shape.tolist(), recv_dtype))
-
- buffers = self._allocate_buffers(recv_shapes_and_dtypes, num_buffers=1)[0]
+ recv_dtype = self.ID_TO_DTYPE[buffer[offset].item()]
+ recv_ndims = buffer[offset + 1].item()
+ recv_shape = buffer[offset + 2:offset + 2 + recv_ndims].tolist()
+ offset += 2 + recv_ndims
+
+ buffers.append(self._allocate_or_extend_buffers(idx, recv_shape, recv_dtype))
+
# Convert to tuples if requested.
if recv_type == 2:
buffers = tuple(buffers)
@@ -1046,7 +1030,7 @@ def _exec_send_activations(self, buffer_id):
outputs[-1] = outputs[-1].half()
outputs = tuple(outputs)
- if self.first_output_send:
+ if self.dynamic_shape or self.first_output_send:
self.first_output_send = False
self._send_tensor_meta(outputs, self.next_stage)
@@ -1131,7 +1115,7 @@ def _exec_recv_activations(self, buffer_id):
recvd = None
# Allocate the buffer if necessary
- if self.pipe_recv_buf is None:
+ if self.dynamic_shape or self.pipe_recv_buf is None:
self.pipe_recv_buf = self._recv_tensor_meta(self.prev_stage)
if isinstance(self.pipe_recv_buf, torch.Tensor):
@@ -1186,10 +1170,9 @@ def _exec_recv_grads(self, buffer_id):
self.pipe_buffers['outputs'][buffer_id] = outputs
# Allocate gradient if necessary
- if self.grad_layer is None:
+ if self.dynamic_shape or self.grad_layer is None:
if isinstance(outputs, torch.Tensor):
- s = list(outputs.size())
- self.grad_layer = self._allocate_buffer(s, dtype=outputs.dtype, num_buffers=1)[0]
+ self.grad_layer = self._allocate_or_extend_buffers(0, list(outputs.size()), outputs.dtype)
else:
# XXX This is a HACK
# When we exchange activations/gradients, the two pipe stages
@@ -1209,7 +1192,11 @@ def _exec_recv_grads(self, buffer_id):
sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs[:2]]
else:
sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs if t.is_floating_point()]
- self.grad_layer = self._allocate_buffers(sizes_and_dtypes, num_buffers=1)[0]
+
+ self.grad_layer = [
+ self._allocate_or_extend_buffers(i, size, dtype)
+ for i, (size, dtype) in enumerate(sizes_and_dtypes)
+ ]
if isinstance(self.grad_layer, torch.Tensor):
p2p.recv(self.grad_layer, self.next_stage)
@@ -1290,16 +1277,17 @@ def _allocate_buffer(self, shape, num_buffers=-1, **kwargs):
buffers.append(self._allocate_zeros(shape, **kwargs))
return buffers
- def _allocate_buffers(self, shapes_and_dtypes, requires_grad=False, num_buffers=-1):
- buffers = []
- if num_buffers == -1:
- num_buffers = self.num_pipe_buffers
- for count in range(num_buffers):
- buffer = []
- for shape, dtype in shapes_and_dtypes:
- buffer.append(self._allocate_zeros(shape, dtype=dtype, requires_grad=requires_grad))
- buffers.append(buffer)
- return buffers
+ def _allocate_or_extend_buffers(self, idx, shape, dtype):
+ numel = reduce(mul, shape) if len(shape) > 0 else 1
+ if len(self._grad_layer_buf) <= idx or self._grad_layer_buf[idx].numel() < numel:
+ new_buf = self._allocate_buffer(shape, dtype=dtype, num_buffers=1)[0]
+ if len(self._grad_layer_buf) <= idx:
+ self._grad_layer_buf.append(new_buf)
+ else:
+ self._grad_layer_buf[idx] = new_buf
+ return self._grad_layer_buf[idx]
+ else:
+ return self._grad_layer_buf[idx].flatten()[:numel].view(shape)
def forward(self, *args, **kwargs):
"""Disabled for pipeline parallel training. See ``train_batch()``. """
diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py
index 8036faef72eeb..31fec30be7880 100644
--- a/deepspeed/runtime/pipe/module.py
+++ b/deepspeed/runtime/pipe/module.py
@@ -117,6 +117,7 @@ def forward(self, inputs):
activation_checkpoint_interval (int, optional): The granularity activation checkpointing in terms of number of layers. 0 disables activation checkpointing.
activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``.
checkpointable_layers (list, optional): A list of layer class names that are eligible for activation checkpointing; layers not in the list will not be checkpointed. Defaults to None, which applies no additional filtering.
+ dynamic_shape (bool, optional): Allows dynamic input shapes across micro-batches. This might have a performance impact. Defaults to ``False``.
"""
def __init__(self,
@@ -130,7 +131,8 @@ def __init__(self,
partition_method='parameters',
activation_checkpoint_interval=0,
activation_checkpoint_func=checkpointing.checkpoint,
- checkpointable_layers=None):
+ checkpointable_layers=None,
+ dynamic_shape=False):
super().__init__()
@@ -196,6 +198,16 @@ def __init__(self,
#newseed = get_accelerator().initial_seed() + self._grid.get_stage_id()
#ds_utils.set_random_seed(newseed)
+ self.activation_checkpoint_interval = activation_checkpoint_interval
+
+ self.activation_checkpoint_func = activation_checkpoint_func
+
+ # storage for precomputed checkpointable results
+ self.is_checkpointable_results = []
+ self.is_checkpointable_results_interval = None
+
+ # if configuration use_reentrant = False, self.activation_checkpoint_func will be set to ``checkpointing.non_reentrant_checkpoint``
+
#with torch.random.fork_rng(devices=[get_accelerator().current_device_name()]):
self._build()
self.to(get_accelerator().device_name(self.local_rank))
@@ -203,10 +215,17 @@ def __init__(self,
self.tied_comms = self._index_tied_modules()
self._synchronize_tied_weights()
- self.activation_checkpoint_interval = activation_checkpoint_interval
+ self.dynamic_shape = dynamic_shape
- self.activation_checkpoint_func = activation_checkpoint_func
- # if configuration use_reentrant = False, self.activation_checkpoint_func will be set to ``checkpointing.non_reentrant_checkpoint``
+ def _precompute_checkpointable_values(self):
+ if self.activation_checkpoint_interval > 0 and self.is_checkpointable_results_interval != self.activation_checkpoint_interval:
+ num_layers = len(self.forward_funcs)
+ self.interval_was_zero = False
+ for start_idx in range(0, num_layers, self.activation_checkpoint_interval):
+ end_idx = min(start_idx + self.activation_checkpoint_interval, num_layers)
+ funcs = self.forward_funcs[start_idx:end_idx]
+ self.is_checkpointable_results.append(self._is_checkpointable(funcs))
+ self.is_checkpointable_results_interval = self.activation_checkpoint_interval
def _build(self):
specs = self._layer_specs
@@ -352,7 +371,9 @@ def exec_func(*inputs):
else:
num_layers = len(self.forward_funcs)
x = forward_input
- for start_idx in range(0, num_layers, self.activation_checkpoint_interval):
+ for start_idx, is_checkpointable_result in \
+ zip(range(0, num_layers, self.activation_checkpoint_interval), self.is_checkpointable_results):
+
end_idx = min(start_idx + self.activation_checkpoint_interval, num_layers)
funcs = self.forward_funcs[start_idx:end_idx]
@@ -361,7 +382,7 @@ def exec_func(*inputs):
if not isinstance(x, tuple):
x = (x, )
- if self._is_checkpointable(funcs):
+ if is_checkpointable_result:
x = self.activation_checkpoint_func(exec_range_func(start_idx, end_idx), *x)
else:
x = exec_range_func(start_idx, end_idx)(*x)
diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py
index df4a38380089a..46c3f2a0c9543 100644
--- a/deepspeed/runtime/swap_tensor/aio_config.py
+++ b/deepspeed/runtime/swap_tensor/aio_config.py
@@ -5,25 +5,33 @@
from deepspeed.runtime.config_utils import get_scalar_param
from deepspeed.runtime.swap_tensor.constants import *
+from deepspeed.accelerator import get_accelerator
AIO_DEFAULT_DICT = {
AIO_BLOCK_SIZE: AIO_BLOCK_SIZE_DEFAULT,
AIO_QUEUE_DEPTH: AIO_QUEUE_DEPTH_DEFAULT,
AIO_THREAD_COUNT: AIO_THREAD_COUNT_DEFAULT,
AIO_SINGLE_SUBMIT: AIO_SINGLE_SUBMIT_DEFAULT,
- AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT
+ AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT,
+ AIO_USE_GDS: AIO_USE_GDS_DEFAULT
}
def get_aio_config(param_dict):
if AIO in param_dict.keys() and param_dict[AIO] is not None:
aio_dict = param_dict[AIO]
- return {
+ aio_config = {
AIO_BLOCK_SIZE: get_scalar_param(aio_dict, AIO_BLOCK_SIZE, AIO_BLOCK_SIZE_DEFAULT),
AIO_QUEUE_DEPTH: get_scalar_param(aio_dict, AIO_QUEUE_DEPTH, AIO_QUEUE_DEPTH_DEFAULT),
AIO_THREAD_COUNT: get_scalar_param(aio_dict, AIO_THREAD_COUNT, AIO_THREAD_COUNT_DEFAULT),
AIO_SINGLE_SUBMIT: get_scalar_param(aio_dict, AIO_SINGLE_SUBMIT, AIO_SINGLE_SUBMIT_DEFAULT),
- AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT)
+ AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT),
+ AIO_USE_GDS: get_scalar_param(aio_dict, AIO_USE_GDS, AIO_USE_GDS_DEFAULT)
}
+ if aio_config[AIO_USE_GDS]:
+ assert get_accelerator().device_name() == 'cuda', 'GDS currently only supported for CUDA accelerator'
+
+ return aio_config
+
return AIO_DEFAULT_DICT
diff --git a/deepspeed/runtime/swap_tensor/constants.py b/deepspeed/runtime/swap_tensor/constants.py
index 4c9722bc4e4f2..cee20ac7b78c4 100644
--- a/deepspeed/runtime/swap_tensor/constants.py
+++ b/deepspeed/runtime/swap_tensor/constants.py
@@ -11,7 +11,8 @@
"queue_depth": 8,
"thread_count": 1,
"single_submit": false,
- "overlap_events": true
+ "overlap_events": true,
+ "use_gds": false
}
'''
AIO = "aio"
@@ -25,3 +26,5 @@
AIO_SINGLE_SUBMIT_DEFAULT = False
AIO_OVERLAP_EVENTS = "overlap_events"
AIO_OVERLAP_EVENTS_DEFAULT = True
+AIO_USE_GDS = "use_gds"
+AIO_USE_GDS_DEFAULT = False
diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
index fcc6a272883f2..26fbf6164d541 100644
--- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
+++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py
@@ -13,6 +13,7 @@
from deepspeed import comm as dist
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.op_builder import AsyncIOBuilder
+from deepspeed.ops.op_builder import GDSBuilder
from .constants import *
from .utils import swap_in_tensors, swap_out_tensors, MIN_AIO_BYTES, AIO_ALIGNED_BYTES, print_object, SwapBufferPool
@@ -37,8 +38,6 @@ class AsyncPartitionedParameterSwapper(object):
def __init__(self, ds_config, model_dtype):
- aio_op = AsyncIOBuilder().load(verbose=False)
- self.aio_handle = aio_op.aio_handle
self.dtype = model_dtype
#set swap buffers, create aio handles
@@ -93,6 +92,10 @@ def _configure_aio(self, ds_config):
self.aio_config = ds_config.aio_config
+ self.use_gds = self.aio_config[AIO_USE_GDS]
+ self.aio_handle = GDSBuilder().load(verbose=False).gds_handle if self.use_gds else AsyncIOBuilder().load(
+ verbose=False).aio_handle
+
# Read/Write alignment for each thread during Intra-request parallelism
self.min_aio_bytes = max(MIN_AIO_BYTES, self.aio_config[AIO_BLOCK_SIZE])
self.aligned_bytes = AIO_ALIGNED_BYTES * self.aio_config[AIO_THREAD_COUNT]
@@ -104,11 +107,6 @@ def _configure_aio(self, ds_config):
self.available_buffer_ids = [i for i in range(self.param_buffer_count)]
self.reserved_buffer_ids = []
- self.buffers = get_accelerator().pin_memory(torch.empty(int(self.aligned_elements_per_buffer *
- self.param_buffer_count),
- dtype=self.dtype,
- requires_grad=False),
- align_bytes=0)
self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH],
self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS],
@@ -118,6 +116,19 @@ def _configure_aio(self, ds_config):
self.aio_config[AIO_SINGLE_SUBMIT],
self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT])
+ if self.use_gds:
+ self.buffers = torch.empty(int(self.aligned_elements_per_buffer * self.param_buffer_count),
+ dtype=self.dtype,
+ device=get_accelerator().device_name(),
+ requires_grad=False)
+ self.aio_read_handle.pin_device_tensor(self.buffers)
+ else:
+ self.buffers = get_accelerator().pin_memory(torch.empty(int(self.aligned_elements_per_buffer *
+ self.param_buffer_count),
+ dtype=self.dtype,
+ requires_grad=False),
+ align_bytes=0)
+
self.swap_out_params = []
#Check if partitioned param or numel in a tensor is swappable or not
diff --git a/deepspeed/runtime/zero/__init__.py b/deepspeed/runtime/zero/__init__.py
index 1ccca09a9e698..23fcf9ec13fbb 100644
--- a/deepspeed/runtime/zero/__init__.py
+++ b/deepspeed/runtime/zero/__init__.py
@@ -13,3 +13,5 @@
from .tiling import TiledLinearReturnBias
from .mics import MiCS_Init
+
+from .stage3 import unwrap_model_for_generation
diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py
index e9dd78864cdee..8c8db60768ebc 100644
--- a/deepspeed/runtime/zero/linear.py
+++ b/deepspeed/runtime/zero/linear.py
@@ -16,6 +16,7 @@
#when implemented outside of torch.autograd.Function
import math
+import functools
import torch
from torch import Tensor
@@ -33,8 +34,14 @@ def print_rank_0(message, debug=False, force=False):
try:
- autocast_custom_fwd = get_accelerator().amp().custom_fwd
- autocast_custom_bwd = get_accelerator().amp().custom_bwd
+ # Fix `torch.[device].amp.custom_fwd/bwd` FutureWarning in torch 2.4
+ if hasattr(torch, 'amp') and hasattr(torch.amp, 'custom_fwd') and hasattr(torch.amp, 'custom_bwd'):
+ autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=get_accelerator().device_name())
+ autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=get_accelerator().device_name())
+ else:
+ # original implementation
+ autocast_custom_fwd = get_accelerator().amp().custom_fwd
+ autocast_custom_bwd = get_accelerator().amp().custom_bwd
except (ImportError, AttributeError) as exp:
autocast_custom_fwd = noop_decorator
autocast_custom_bwd = noop_decorator
diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py
index e9e79c2647fbe..92702edb733b1 100644
--- a/deepspeed/runtime/zero/parameter_offload.py
+++ b/deepspeed/runtime/zero/parameter_offload.py
@@ -17,9 +17,6 @@
FWD_MODULE_STACK = list()
-# ensure we only warn once, otherwise every iteration will trigger a warning
-warned = False
-
#for each tensor in outputs run the forward_function and register backward_function as hook
def _apply_forward_and_backward_to_tensors_only(module, forward_function, backward_function, outputs):
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 37b81d42c0d69..796957a4c6e5a 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -7,6 +7,7 @@
import gc
import collections
from typing import Deque, Dict, Tuple
+from contextlib import contextmanager
from deepspeed import comm as dist
from deepspeed.utils import groups
@@ -15,7 +16,7 @@
from deepspeed.utils import logger
from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler
from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce
-from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter, get_only_unique_item
+from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item
from deepspeed.runtime.zero.partition_parameters import *
from deepspeed.runtime.zero.config import ZeroStageEnum
from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum
@@ -69,6 +70,39 @@ def move_to_cpu(tensor_list):
tensor.data = tensor.data.cpu()
+@contextmanager
+def unwrap_model_for_generation(model):
+ """
+ For ZeRO-3 models, we gather the weights once to speed up generation.
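+
+    A minimal usage sketch (``engine`` is assumed to be a ZeRO-3 DeepSpeed
+    engine wrapping a generative model, and ``input_ids`` a prepared batch):
+
+        with unwrap_model_for_generation(engine) as unwrapped:
+            outputs = unwrapped.module.generate(input_ids, max_new_tokens=32)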
+ """
+ with GatheredParameters(model.parameters()):
+ # Removes the optimizer hooks from a DeepSpeed ZeRO-3 model.
+
+ # Remove hooks
+ if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"):
+ optimizer_offload = model.optimizer.parameter_offload
+ elif model.optimizer is not None:
+ optimizer_offload = model.optimizer
+
+ for hook in optimizer_offload.forward_hooks:
+ hook.remove()
+ for hook in optimizer_offload.backward_hooks:
+ hook.remove()
+
+ optimizer_offload.forward_hooks = []
+ optimizer_offload.backward_hooks = []
+
+ yield model
+
+ # Add the optimizer hooks back to the DeepSpeed ZeRO-3 model.
+ if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"):
+ optimizer_offload = model.optimizer.parameter_offload
+ elif model.optimizer is not None:
+ optimizer_offload = model.optimizer
+ optimizer_offload._register_hooks_recursively(optimizer_offload.module)
+ return
+
+
INITIAL_MICRO_STEP_ID = -1
@@ -215,14 +249,12 @@ def __init__(
self.module = module
self.elastic_checkpoint = elastic_checkpoint
- self.inf_or_nan_tracker: Tensor = torch.zeros(1,
- dtype=torch.bool,
- device=get_accelerator().current_device_name(),
- requires_grad=False)
+ self.device = get_accelerator().current_device_name() if not self.offload_optimizer else OffloadDeviceEnum.cpu
+
+ self.inf_or_nan_tracker: Tensor = torch.zeros(1, dtype=torch.bool, device=self.device, requires_grad=False)
self.deepspeed_adam_offload = (self.offload_optimizer and type(init_optimizer) == DeepSpeedCPUAdam)
- self.device = get_accelerator().current_device_name() if not self.offload_optimizer else OffloadDeviceEnum.cpu
### streams used for overlapping computation with communication
self.reduce_and_partition_stream = None if get_accelerator().is_synchronized_device() else get_accelerator(
).Stream() if overlap_comm else get_accelerator().default_stream()
@@ -821,10 +853,14 @@ def _create_fp32_partitions(self):
for i, tensor in enumerate(self.fp16_partitioned_groups_flat):
num_elements = self.fp16_partitioned_groups_flat_numel[i]
+ ds_id_begin = str(self.fp16_partitioned_groups_flat_id[i][0])
+ ds_id_end = str(self.fp16_partitioned_groups_flat_id[i][-1])
+ ds_id = ds_id_begin + '_' + ds_id_end
# a partition of the fp32 master weights that will be updated by this process
if self._swappable_optimizer_subgroup(i):
self.fp32_partitioned_groups_flat.append(torch.Tensor())
+ self.fp32_partitioned_groups_flat[i].ds_id = ds_id
nvme_memory_usage += (fp32_element_size * num_elements)
num_swappable_partitions += 1
@@ -861,11 +897,9 @@ def _create_fp32_partitions(self):
else:
self.fp32_partitioned_groups_flat.append(self.fp16_partitioned_groups_flat[i].to(
self.device).clone().float().detach())
+ self.fp32_partitioned_groups_flat[i].ds_id = ds_id
self.fp32_partitioned_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it
- ds_id_begin = str(self.fp16_partitioned_groups_flat_id[i][0])
- ds_id_end = str(self.fp16_partitioned_groups_flat_id[i][-1])
- self.fp32_partitioned_groups_flat[i].ds_id = ds_id_begin + '_' + ds_id_end
if len(swappable_fp32_tensors) > 0:
self.optimizer_swapper.initialize_parameters(parameters=swappable_fp32_tensors,
@@ -1411,7 +1445,7 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params):
err = torch.tensor(-1.0, device=inf_or_nan.device, dtype=torch.float)
total_norm = inf_or_nan * err + inf_or_nan.logical_not() * total_norm
- return total_norm
+ return total_norm.cpu()
@instrument_w_nvtx
def partition_grads(self, params_to_release: List[Parameter], grad_partitions: List[Tensor]) -> None:
@@ -2026,7 +2060,7 @@ def step(self, closure=None):
return
norm_groups = self._get_norm_groups()
- scaled_global_grad_norm = get_global_norm(norm_list=norm_groups)
+ scaled_global_grad_norm = torch.linalg.norm(torch.stack(norm_groups))
# Stash unscaled gradient norm
self._global_grad_norm = scaled_global_grad_norm / self.loss_scale
@@ -2110,8 +2144,8 @@ def unscale_and_clip_grads(self, sub_group_id, total_norm):
if self.clip_grad > 0.:
# norm is in fact norm*scale
clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad
- if clip > 1:
- combined_scale = clip * self.loss_scale
+ clip = torch.clamp(clip, min=1.0)
+ combined_scale = clip * self.loss_scale
self.fp32_partitioned_groups_flat[sub_group_id].grad.mul_(1. / combined_scale)
@@ -2146,7 +2180,8 @@ def has_overflow(self, partition_gradients=True):
self.inf_or_nan_tracker += torch.isnan(self.grad_partitions_flat_buffer).any()
self.inf_or_nan_tracker = self.inf_or_nan_tracker > 0
- overflow_gpu = self.inf_or_nan_tracker.clone().to(torch.uint8)
+ overflow_gpu = self.inf_or_nan_tracker.clone().to(get_accelerator().current_device_name()).to(
+ torch.uint8)
self.inf_or_nan_tracker.zero_()
if not get_accelerator().resolves_data_dependency():
@@ -2237,7 +2272,7 @@ def get_fp32_grad_partitions(self) -> Dict[int, Dict[int, Tensor]]:
return grad_dict
def _fp32_state_allgather(self, param, fp32_state_partition):
- reduce_buffer = torch.zeros(self.partition_count * fp32_state_partition.numel(),
+ reduce_buffer = torch.empty(self.partition_count * fp32_state_partition.numel(),
dtype=torch.float32,
device=param.device)
my_rank = dist.get_rank(group=self.dp_process_group)
diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 461ad719536ae..57e80911d6451 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -972,6 +972,8 @@ def print_rank_0(self, message):
logger.info(message)
def gradient_reduction_w_predivide(self, tensor):
+ if tensor.size().numel() == 0:
+ return tensor
dp_world_size = dist.get_world_size(group=self.dp_process_group)
diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py
index f61715bd43873..8f913d065934a 100755
--- a/deepspeed/runtime/zero/utils.py
+++ b/deepspeed/runtime/zero/utils.py
@@ -16,6 +16,9 @@
from deepspeed.utils.nvtx import instrument_w_nvtx
from deepspeed.accelerator import get_accelerator
+# ensure we only warn once, otherwise every iteration will trigger a warning
+warned = False
+
def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
data_parallel_size = int(dist.get_world_size())
@@ -65,7 +68,6 @@ def get_lst_from_rank0(lst: List[int]) -> None:
lst_tensor = torch.tensor(
lst if dist.get_rank() == 0 else [-1] * len(lst),
dtype=int,
- # device=get_accelerator().current_device_name(),
device=torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])),
requires_grad=False,
)
diff --git a/deepspeed/sequence/layer.py b/deepspeed/sequence/layer.py
index a715843d8eff6..e809fe1118b51 100644
--- a/deepspeed/sequence/layer.py
+++ b/deepspeed/sequence/layer.py
@@ -2,7 +2,6 @@
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
-
import torch
from typing import Any, Tuple
@@ -10,51 +9,137 @@
from torch.nn import Module
import deepspeed.comm as dist
+from deepspeed.accelerator import get_accelerator
+
+
+def post_all2all(scatter_idx, batch_dim_idx, seq_world_size, bs, seq_len, num_head, head_dim):
+
+ def post_func(input):
+ if batch_dim_idx == 0:
+ # b, s, n, h
+ if scatter_idx < 2:
+ output = input.permute(1, 2, 0, 3, 4).contiguous()
+ output = output.reshape(bs, seq_len // seq_world_size, seq_world_size * num_head,
+ head_dim).contiguous()
+ else:
+ output = input.permute(1, 0, 2, 3, 4).contiguous()
+ output = output.reshape(bs, seq_world_size * seq_len, num_head // seq_world_size,
+ head_dim).contiguous()
+ else:
+ # s, b, n, h
+ if scatter_idx < 2:
+ output = input.permute(1, 2, 0, 3, 4).contiguous()
+ output = output.reshape(seq_len // seq_world_size, bs, seq_world_size * num_head,
+ head_dim).contiguous()
+ else:
+ output = input.reshape(seq_len * seq_world_size, bs, num_head // seq_world_size, head_dim).contiguous()
+ return output
+ return post_func
-def single_all_to_all(input, scatter_idx, gather_idx, group):
+
+def single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, async_op=False, handle=None, type=None):
seq_world_size = dist.get_world_size(group)
- inp_shape = list(input.shape)
- inp_shape[scatter_idx] = inp_shape[scatter_idx] // seq_world_size
+ if batch_dim_idx == 0:
+ # b, s, n, h
+ if scatter_idx < 2:
+ bs, global_seq_len, num_local_head, head_dim = input.shape
+ input_t = input.reshape([bs, seq_world_size, global_seq_len // seq_world_size, num_local_head,
+ head_dim]).contiguous()
+ input_t = input_t.permute(1, 0, 2, 3, 4).contiguous()
+ else:
+ bs, local_seq_len, num_total_head, head_dim = input.shape
+ assert num_total_head % seq_world_size == 0, f"Number of heads ({num_total_head}) must be divisible by the sequence parallel size ({seq_world_size})!"
+ input_t = input.reshape([bs, local_seq_len, seq_world_size, num_total_head // seq_world_size,
+ head_dim]).contiguous()
+ input_t = input_t.permute(2, 0, 1, 3, 4).contiguous()
+ else:
+ # s, b, n, h
+ if scatter_idx < 2:
+ global_seq_len, bs, num_local_head, head_dim = input.shape
+ input_t = input.reshape([seq_world_size, global_seq_len // seq_world_size, bs, num_local_head,
+ head_dim]).contiguous()
+ else:
+ local_seq_len, bs, num_total_head, head_dim = input.shape
+ assert num_total_head % seq_world_size == 0, f"Number of heads ({num_total_head}) must be divisible by the sequence parallel size ({seq_world_size})!"
+ input_t = input.reshape([local_seq_len, bs, seq_world_size, num_total_head // seq_world_size,
+ head_dim]).contiguous()
+ input_t = input_t.permute(2, 0, 1, 3, 4).contiguous()
+
if scatter_idx < 2:
- input_t = input.reshape(
- [seq_world_size, inp_shape[scatter_idx]] + \
- inp_shape[scatter_idx + 1:]
- ).contiguous()
+ post_all2all_fun = post_all2all(scatter_idx, batch_dim_idx, seq_world_size, bs, global_seq_len, num_local_head,
+ head_dim)
else:
- # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
- input_t = input.reshape(
- [-1, seq_world_size, inp_shape[scatter_idx]] + \
- inp_shape[scatter_idx + 1:]
- ).transpose(0, 1).contiguous()
+ post_all2all_fun = post_all2all(scatter_idx, batch_dim_idx, seq_world_size, bs, local_seq_len, num_total_head,
+ head_dim)
output = torch.empty_like(input_t)
- dist.all_to_all_single(output, input_t, group=group)
+ work = dist.all_to_all_single(output, input_t, group=group, async_op=async_op)
- # if scattering the seq-dim, transpose the heads back to the original dimension
- if scatter_idx < 2:
- output = output.transpose(0, 2).contiguous()
+ if async_op:
+ if type in ('dq', 'dk'):
+ handle[type + '_work'] = work
+ handle[type + '_grad'] = output
+ handle[type + '_post_all2all_func'] = post_all2all_fun
+ return output
- return output.reshape(
- inp_shape[: gather_idx] + \
- [inp_shape[gather_idx] * seq_world_size,] + \
- inp_shape[gather_idx + 1:]).contiguous()
+ res = post_all2all_fun(output)
+ return res
class _SeqAllToAll(torch.autograd.Function):
@staticmethod
- def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor, scatter_idx: int, gather_idx: int) -> Tensor:
-
+ def forward(ctx: Any,
+ group: dist.ProcessGroup,
+ input: Tensor,
+ scatter_idx: int,
+ gather_idx: int,
+ batch_dim_idx: int,
+ stream=None,
+ handle=None,
+ type=None,
+ is_fwd=True) -> Tensor:
ctx.group = group
ctx.scatter_idx = scatter_idx
ctx.gather_idx = gather_idx
-
- return single_all_to_all(input, scatter_idx, gather_idx, group)
+ ctx.stream = stream
+ ctx.handle = handle
+ ctx.type = type
+ ctx.batch_dim_idx = batch_dim_idx
+ if ctx.handle is None:
+ res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, False)
+
+ else:
+ # overlap communication path
+ if not is_fwd and type == 'o':
+ assert ctx.stream != None
+ res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, False)
+ get_accelerator().current_stream().wait_stream(ctx.stream)
+ del ctx.stream.activation_buffer_list
+ # The computation of d o_weight can overlap with the communication of d o_input
+
+ elif not is_fwd and type in ('q', 'k'):
+ # Achieve communication overlap by pipelining the matrix computation and communication of dq, dk, and dv
+ type = 'd' + type
+ res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, True, handle, type)
+
+ elif is_fwd and type in ('q', 'k'):
+ # Achieve communication overlap by pipelining the matrix computation and communication of q, k, and v
+ type = 'fwd_' + type
+ res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, False, handle, type)
+
+ else:
+ res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, False)
+
+ return res
@staticmethod
def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]:
- return (None, _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx), None, None)
+
+ return (None,
+ _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx, ctx.batch_dim_idx,
+ ctx.stream, ctx.handle, ctx.type, False), None, None, None, None, None, None, None)
class DistributedAttention(torch.nn.Module):
@@ -73,6 +158,7 @@ def __init__(
sequence_process_group: dist.ProcessGroup,
scatter_idx: int = 2,
gather_idx: int = 0,
+ sp_stream=None,
) -> None:
super(DistributedAttention, self).__init__()
@@ -80,30 +166,78 @@ def __init__(
self.spg = sequence_process_group
self.scatter_idx = scatter_idx
self.gather_idx = gather_idx
-
- def forward(self, query: Tensor, key: Tensor, value: Tensor, *args: Any, **kwargs) -> Tensor:
+ self.sp_overlap_comm = False
+ self.overlap_handles = None
+ self.sp_stream = sp_stream
+ if sp_stream is not None:
+ self.overlap_handles = {}
+ self.sp_overlap_comm = True
+ self.default_stream = get_accelerator().default_stream()
+
+ def layer_sync(self, layer):
+ if self.sp_overlap_comm and hasattr(layer, 'done_event'):
+ self.default_stream.wait_event(layer.done_event)
+
+ def forward(self, query: Tensor, key: Tensor, value: Tensor, batch_dim_idx: int, *args: Any, **kwargs) -> Tensor:
""" forward
Arguments:
query (Tensor): query input to the layer
key (Tensor): key input to the layer
value (Tensor): value input to the layer
+ batch_dim_idx (int): index of the batch dimension in the input tensors
args: other args
Returns:
* output (Tensor): context output
"""
+
# TODO Merge three alltoall calls into one
# TODO (Reza): change the api on the megatron-deepspeed side so that we only receive all data (q,k, and v) together!
#in shape : e.g., [s/p:h:]
- query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx)
- key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx)
- value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx)
+
+ def bwd_hook(layer_type):
+
+ def pre_hook_fun(grad):
+ type = 'd' + layer_type
+ self.overlap_handles[type + '_work'].wait()
+ self.sp_stream.wait_stream(self.default_stream)
+ all2all_output = self.overlap_handles[type + '_grad']
+ grad = list(grad)
+ grad[0] = self.overlap_handles[type + '_post_all2all_func'](all2all_output)
+ grad = tuple(grad)
+
+ return pre_hook_fun
+
+ self.layer_sync(query)
+ query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx, batch_dim_idx, None,
+ self.overlap_handles, 'q')
+ self.layer_sync(key)
+ key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx, batch_dim_idx, None,
+ self.overlap_handles, 'k')
+ if self.sp_overlap_comm:
+ self.default_stream.wait_stream(self.sp_stream)
+
+ value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx, batch_dim_idx, None,
+ self.overlap_handles, 'v')
+
+ if self.sp_overlap_comm:
+ # Register a hook to synchronize dq and dk after the all-to-all
+ # operation when the gradient data is used.
+ # Place this logic after the q, k, v all-to-all operations so the
+ # Python interpreter is not delayed in calling and launching the
+ # forward all-to-all communication.
+ grad_fn_q = query.grad_fn.next_functions[0][0]
+ grad_fn_q.register_prehook(bwd_hook(layer_type='q'))
+ grad_fn_k = key.grad_fn.next_functions[0][0]
+ grad_fn_k.register_prehook(bwd_hook(layer_type='k'))
#out shape : e.g., [s:h/p:]
+
context_layer = self.local_attn(query_layer, key_layer, value_layer, *args, **kwargs)
- output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx)
+ output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx, batch_dim_idx,
+ self.sp_stream, self.overlap_handles, 'o')
#out e.g., [s/p::h]
return output
diff --git a/docs/Gemfile b/docs/Gemfile
index 888e3c8dfd6a7..f40c61e4575fc 100644
--- a/docs/Gemfile
+++ b/docs/Gemfile
@@ -20,3 +20,5 @@ end
# Performance-booster for watching directories on Windows
gem "wdm", "~> 0.1.1", :install_if => Gem.win_platform?
+
+gem "webrick", "~> 1.8"
diff --git a/docs/README.md b/docs/README.md
index 0c3aaaeda6005..7333a119c7be3 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -42,6 +42,16 @@ We now need to install the required Ruby packages for the website.
bundle install
```
+Depending on your environment, you may need to add `webrick` to avoid the following [error](https://talk.jekyllrb.com/t/load-error-cannot-load-such-file-webrick/5417/6):
+
+> gems/gems/jekyll-3.9.5/lib/jekyll/commands/serve/servlet.rb:3:in `require': cannot load such file -- webrick (LoadError)
+
+
+```
+bundle add webrick
+```
+
+
You can now start a local webserver via:
```
bundle exec jekyll serve
diff --git a/docs/_tutorials/accelerator-abstraction-interface.md b/docs/_tutorials/accelerator-abstraction-interface.md
index 88a43236ce9dd..d7c153638c0d8 100644
--- a/docs/_tutorials/accelerator-abstraction-interface.md
+++ b/docs/_tutorials/accelerator-abstraction-interface.md
@@ -12,7 +12,6 @@ tags: getting-started
- [Tensor operations](#tensor-operations)
- [Communication backend](#communication-backend)
- [Run DeepSpeed model on different accelerators](#run-deepspeed-model-on-different-accelerators)
-- [Run DeepSpeed model on CPU](#run-deepspeed-model-on-cpu)
- [Implement new accelerator extension](#implement-new-accelerator-extension)
# Introduction
@@ -79,69 +78,9 @@ torch.distributed.init_process_group(get_accelerator().communication_backend_nam
```
# Run DeepSpeed model on different accelerators
-Once a model is ported with DeepSpeed Accelerator Abstraction Interface, we can run this model on different accelerators using an extension to DeepSpeed. DeepSpeed checks whether a certain extension is installed in the environment to decide whether to use the Accelerator backend in that extension. For example, if we wish to run a model on Intel GPU, we can install _Intel Extension for DeepSpeed_ following the instructions in the following [link](https://github.com/intel/intel-extension-for-deepspeed/)
-
-After the extension is installed, install DeepSpeed and run the model. The model will be running on top of DeepSpeed. Because DeepSpeed installation is also accelerator related, it is recommended to install DeepSpeed accelerator extension before installing DeepSpeed.
-
-`CUDA_Accelerator` is the default accelerator in DeepSpeed. If no other DeepSpeed accelerator extension is installed, `CUDA_Accelerator` will be used.
-
-When running a model on different accelerators in a cloud environment, the recommended practice is to provision an environment for each accelerator in a different env with tools such as _anaconda/miniconda/virtualenv_. When running models on different Accelerator, load the env accordingly.
-
-Note that different accelerator may have different 'flavor' of float16 or bfloat16. So it is recommended to make the model configurable for both float16 and bfloat16, in that way model code does not need to be changed when running on different accelerators.
-
-# Run DeepSpeed model on CPU
-DeepSpeed support using CPU as accelerator. DeepSpeed model using DeepSpeed Accelerator Abstraction Interface could run on CPU without change to model code. DeepSpeed decide whether _Intel Extension for PyTorch_ is installed in the environment. If this packaged is installed, DeepSpeed will use CPU as accelerator. Otherwise CUDA device will be used as accelerator.
-
-To run DeepSpeed model on CPU, use the following steps to prepare environment:
-
-```
-python -m pip install intel_extension_for_pytorch
-python -m pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable-cpu
-git clone https://github.com/oneapi-src/oneCCL
-cd oneCCL
-mkdir build
-cd build
-cmake ..
-make
-make install
-```
-
-Before run CPU workload, we need to source oneCCL environment variables
-```
-source /build/_install/env/setvars.sh
-```
-
-After environment is prepared, we can launch DeepSpeed inference with the following command
-```
-deepspeed --bind_cores_to_rank
-```
-
-This command would launch number of workers equal to number of CPU sockets on the system. Currently DeepSpeed support running inference model with AutoTP on top of CPU. The argument `--bind_cores_to_rank` distribute CPU cores on the system evenly among workers, to allow each worker running on a dedicated set of CPU cores.
-
-On CPU system, there might be daemon process that periodically activate which would increase variance of each worker. One practice is leave a couple of cores for daemon process using `--bind-core-list` argument:
-
-```
-deepspeed --bind_cores_to_rank --bind_core_list 0-51,56-107
-```
-
-The command above leave 4 cores on each socket to daemon process (assume two sockets, each socket has 56 cores).
-
-We can also set an arbitrary number of workers. Unlike GPU, CPU cores on host can be further divided into subgroups. When this number is not set, DeepSpeed would detect number of NUMA nodes on the system and launch one worker for each NUMA node.
-
-```
-deepspeed --num_accelerators 4 --bind_cores_to_rank
-```
-
-Launching DeepSpeed model on multiple CPU nodes is similar to other accelerators. We need to specify `impi` as launcher and specify `--bind_cores_to_rank` for better core binding. Also specify `slots` number according to number of CPU sockets in host file.
-
-```
-# hostfile content should follow the format
-# worker-1-hostname slots=<#sockets>
-# worker-2-hostname slots=<#sockets>
-# ...
-
-deepspeed --hostfile= --bind_cores_to_rank --launcher impi --master_addr
-```
+[Accelerator Setup Guide](accelerator-setup-guide.md) provides a guide on how to set up different accelerators for DeepSpeed. It also comes with simple examples of how to run DeepSpeed on different accelerators. The following guides are provided:
+1. Run DeepSpeed model on CPU
+2. Run DeepSpeed model on XPU
# Implement new accelerator extension
It is possible to implement a new DeepSpeed accelerator extension to support new accelerator in DeepSpeed. An example to follow is _[Intel Extension For DeepSpeed](https://github.com/intel/intel-extension-for-deepspeed/)_. An accelerator extension contains the following components:
diff --git a/docs/_tutorials/accelerator-setup-guide.md b/docs/_tutorials/accelerator-setup-guide.md
new file mode 100644
index 0000000000000..cf2d01d2b25c9
--- /dev/null
+++ b/docs/_tutorials/accelerator-setup-guide.md
@@ -0,0 +1,134 @@
+---
+title: DeepSpeed Accelerator Setup Guides
+tags: getting-started
+---
+
+# Contents
+- [Contents](#contents)
+- [Introduction](#introduction)
+- [Intel Architecture (IA) CPU](#intel-architecture-ia-cpu)
+- [Intel XPU](#intel-xpu)
+
+# Introduction
+DeepSpeed supports different accelerators from different companies. The setup steps to run DeepSpeed on a given accelerator might differ. This guide allows users to look up setup instructions for the accelerator family and hardware they are using.
+
+# Intel Architecture (IA) CPU
+DeepSpeed supports CPUs with the Intel Architecture instruction set. It is recommended that the CPU support at least the AVX2 instruction set; the AMX instruction set is also recommended.
+
+DeepSpeed has been verified on the following CPU processors:
+* 4th Gen Intel® Xeon® Scalable Processors
+* 5th Gen Intel® Xeon® Scalable Processors
+* 6th Gen Intel® Xeon® Scalable Processors
+
+## Installation steps for Intel Architecture CPU
+To install DeepSpeed on an Intel Architecture CPU, use the following steps:
+1. Install gcc compiler
+DeepSpeed requires gcc-9 or above to build kernels on Intel Architecture CPUs; install gcc-9 or newer.
+
+2. Install numactl
+DeepSpeed uses `numactl` for fine-grained CPU core allocation and load balancing; install numactl on your system.
+For example, on an Ubuntu system, use the following command:
+`sudo apt-get install numactl`
+
+3. Install PyTorch
+`pip install torch`
+
+4. Install DeepSpeed
+`pip install deepspeed`
+
+## How to launch DeepSpeed on Intel Architecture CPU
+DeepSpeed can be launched on Intel Architecture CPUs with the default deepspeed command. However, for compute-intensive workloads, Intel Architecture CPUs work best when each worker process runs on a different set of physical CPU cores, so that worker processes do not compete with each other for CPU cores. To bind cores to each worker (rank), use the following command line switch for better performance.
+```
+deepspeed --bind_cores_to_rank
+```
+This switch automatically detects the number of CPU NUMA nodes on the host, launches the same number of workers, and binds each worker to the cores/memory of a different NUMA node. This improves performance by ensuring that workers do not interfere with each other and that all memory is allocated from local memory.
+
+If a user wishes to have more control over the number of workers and the specific cores used by the workload, the following command line switches can be used.
+```
+deepspeed --num_accelerators --bind_cores_to_rank --bind_core_list
+```
+For example:
+```
+deepspeed --num_accelerators 4 --bind_cores_to_rank --bind_core_list <0-27,32-59> inference.py
+```
+This would start 4 workers for the workload. The core list range is divided evenly between the 4 workers: worker 0 takes cores 0-13, worker 1 takes 14-27, worker 2 takes 32-45, and worker 3 takes 46-59. Cores 28-31 and 60-63 are left out because background processes might run on the system; leaving some cores idle reduces performance jitter and straggler effects.
+
+Launching a DeepSpeed model on multiple CPU nodes is similar to other accelerators. Specify `impi` as the launcher and pass `--bind_cores_to_rank` for better core binding. Also set the `slots` number in the hostfile according to the number of CPU sockets.
+
+```
+# hostfile content should follow the format
+# worker-1-hostname slots=<#sockets>
+# worker-2-hostname slots=<#sockets>
+# ...
+
+deepspeed --hostfile= --bind_cores_to_rank --launcher impi --master_addr
+```
+
+## Install with Intel Extension for PyTorch and oneCCL
+Although not mandatory, Intel Extension for PyTorch and Intel oneCCL provide better optimizations for LLM models. Intel oneCCL also provides optimizations when running LLM models on multiple nodes. To use DeepSpeed with Intel Extension for PyTorch and oneCCL, use the following steps:
+1. Install Intel Extension for PyTorch. This is suggested if you want to get better LLM inference performance on CPU.
+`pip install intel-extension-for-pytorch`
+
+The following steps install the oneCCL binding for PyTorch. This is suggested if you are running DeepSpeed on multiple CPU nodes, for better communication performance. On a single node with multiple CPU sockets, these steps are not needed.
+
+2. Install Intel oneCCL binding for PyTorch
+`python -m pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable-cpu`
+
+3. Install Intel oneCCL, which will be used to build the direct oneCCL kernels (CCLBackend kernels)
+```
+pip install oneccl-devel
+pip install impi-devel
+```
+Then set the environment variables for Intel oneCCL (assuming using conda environment).
+```
+export CPATH=${CONDA_PREFIX}/include:$CPATH
+export CCL_ROOT=${CONDA_PREFIX}
+export I_MPI_ROOT=${CONDA_PREFIX}
+export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/ccl/cpu:${CONDA_PREFIX}/lib/libfabric:${CONDA_PREFIX}/lib
+```
+
+## Optimize LLM inference with Intel Extension for PyTorch
+Intel Extension for PyTorch is compatible with DeepSpeed AutoTP tensor parallel inference. It allows CPU inference to benefit from both DeepSpeed Automatic Tensor Parallelism and the LLM optimizations of Intel Extension for PyTorch. To use Intel Extension for PyTorch, after calling deepspeed.init_inference, call
+```
+ipex_model = ipex.llm.optimize(deepspeed_model)
+```
+to get the model optimized by Intel Extension for PyTorch.
+
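+A more complete, hedged sketch of this flow is shown below; the checkpoint name, dtype, tensor-parallel size, prompt, and the choice of passing `ds_model.module` to `ipex.llm.optimize` are illustrative assumptions rather than requirements:
+```
+import torch
+import deepspeed
+import intel_extension_for_pytorch as ipex
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "facebook/opt-1.3b"  # assumed example checkpoint
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+
+# DeepSpeed AutoTP inference; tp_size=2 assumes a two-socket system
+ds_model = deepspeed.init_inference(model, tensor_parallel={"tp_size": 2}, dtype=torch.bfloat16)
+
+# Hand the DeepSpeed-sharded module to Intel Extension for PyTorch for LLM optimizations
+ipex_model = ipex.llm.optimize(ds_model.module)
+
+inputs = tokenizer("DeepSpeed is", return_tensors="pt")
+with torch.no_grad():
+    output = ipex_model.generate(**inputs, max_new_tokens=32)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+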
+## More examples of using DeepSpeed with Intel Extension for PyTorch on Intel Architecture CPU
+Refer to https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm for a more extensive guide.
+
+# Intel XPU
+DeepSpeed XPU accelerator supports Intel® Data Center GPU Max Series.
+
+DeepSpeed has been verified on the following GPU products:
+* Intel® Data Center GPU Max 1100
+* Intel® Data Center GPU Max 1550
+
+## Installation steps for Intel XPU
+To install DeepSpeed on Intel XPU, use the following steps:
+1. Install oneAPI base toolkit \
+The Intel® oneAPI Base Toolkit (Base Kit) is a core set of tools and libraries, including a DPC++/C++ compiler for building DeepSpeed XPU kernels such as FusedAdam and CPUAdam, as well as high-performance computation libraries required by IPEX.
+For download, usage, and more details, see [Intel oneAPI base-toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+2. Install PyTorch, Intel Extension for PyTorch, and Intel oneCCL Bindings for PyTorch. These packages are required by `xpu_accelerator` for torch functionality and performance, as well as for the communication backend on Intel platforms. The recommended installation reference is:
+https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu.
+
+3. Install DeepSpeed \
+`pip install deepspeed`
+
+## How to use DeepSpeed on Intel XPU
+DeepSpeed can be launched on Intel XPU with the deepspeed launch command. Before that, the user needs to activate the oneAPI environment by: \
+`source /setvars.sh`
+
+To validate XPU availability and that the XPU accelerator is correctly chosen, here is an example:
+```
+$ python
+>>> import torch; print('torch:', torch.__version__)
+torch: 2.3.0
+>>> import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())
+XPU available: True
+>>> from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)
+accelerator: xpu
+```
+
+## More examples of using DeepSpeed on Intel XPU
+Refer to https://github.com/intel/intel-extension-for-pytorch/tree/release/xpu/2.1.40/examples/gpu/inference/python/llm for a more extensive guide.
diff --git a/docs/_tutorials/automatic-tensor-parallelism.md b/docs/_tutorials/automatic-tensor-parallelism.md
index aea98ad9b22a4..e1903ed058920 100644
--- a/docs/_tutorials/automatic-tensor-parallelism.md
+++ b/docs/_tutorials/automatic-tensor-parallelism.md
@@ -126,6 +126,8 @@ The following model families have been successfully tested with automatic tensor
- bigbird_pegasus
- bloom
- camembert
+- chatglm2
+- chatglm3
- codegen
- codellama
- deberta_v2
@@ -144,6 +146,7 @@ The following model families have been successfully tested with automatic tensor
- m2m_100
- marian
- mistral
+- mixtral
- mpt
- mvp
- nezha
@@ -151,8 +154,10 @@ The following model families have been successfully tested with automatic tensor
- opt
- pegasus
- perceiver
+- phi
- plbart
- qwen
+- qwen2
- reformer
- roberta
- roformer
@@ -162,6 +167,7 @@ The following model families have been successfully tested with automatic tensor
- xglm
- xlm_roberta
- yoso
+- yuan
# Unsupported Models
diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md
index 932bb355cf26e..b1a8b53697610 100644
--- a/docs/_tutorials/onebit-adam.md
+++ b/docs/_tutorials/onebit-adam.md
@@ -75,6 +75,12 @@ Alternatively, the standard mpirun launcher can also be used as follows:
mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py]
```
+#### 1.2.3 Compressed implementation
+
+This backend abstracts the generic part of the one-bit optimizers and implements the accelerator-dependent part with the DeepSpeed custom op builder. To use this `CompressedBackend`, make sure that your current accelerator supports `PackbitsBuilder`, so that it can be loaded to perform high-performance packing and unpacking between float and byte datatypes, which is used in the one-bit algorithms. An example can be found in `Deepspeed/op_builder/xpu/packbits.py`.
+
+This approach does not require an NCCL or MPI-based communication library. It automatically uses the default communication library selected by your accelerator in `deepspeed/comm`.
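+
+As an illustrative sketch (the learning rate and `freeze_step` values are placeholders), selecting this backend only requires setting `comm_backend_name` to "compressed" in the optimizer section of the DeepSpeed config, shown here as a Python dict:
+```
+ds_config = {
+    "train_batch_size": 8,
+    "optimizer": {
+        "type": "OneBitAdam",
+        "params": {
+            "lr": 1e-4,
+            "freeze_step": 400,
+            "cuda_aware": False,
+            "comm_backend_name": "compressed"
+        }
+    }
+}
+```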
+
### 1.3 1-bit Algorithm
The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
@@ -106,7 +112,7 @@ Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_
`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication.
-(New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" and "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`.
+(New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between the NCCL-based, MPI-based, and compressed implementations by setting `comm_backend_name` to "nccl", "mpi" or "compressed". When using the NCCL-based implementation, there is no need to set `cuda_aware`.
#### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients
Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script.
diff --git a/docs/_tutorials/onebit-lamb.md b/docs/_tutorials/onebit-lamb.md
index 4873f1f35c170..b6c6ef0750367 100644
--- a/docs/_tutorials/onebit-lamb.md
+++ b/docs/_tutorials/onebit-lamb.md
@@ -61,6 +61,10 @@ Alternatively, the standard mpirun launcher can also be used as follows:
mpirun -np [num processes] -ppn [num GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py]
```
+#### 1.2.3 Compressed implementation
+This backend abstracts the generic part of the one-bit optimizers and implements the accelerator-dependent part with the DeepSpeed custom op builder. To use this `CompressedBackend`, make sure that your current accelerator supports `PackbitsBuilder`, so that it can be loaded to perform high-performance packing and unpacking between float and byte datatypes, which is used in the one-bit algorithms. An example can be found in `Deepspeed/op_builder/xpu/packbits.py`.
+This approach does not require an NCCL or MPI-based communication library. It automatically uses the default communication library selected by your accelerator in `deepspeed/comm`.
+
### 1.3 1-bit LAMB Algorithm
The detailed description of the 1-bit LAMB algorithm can be seen from our [paper](https://arxiv.org/abs/2104.06069).
@@ -101,7 +105,7 @@ Please note the new parameters `freeze_step`, `cuda_aware`, `comm_backend_name`,
`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication.
-`comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" or "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`.
+`comm_backend_name` is used to indicate which backend implementation to use. You can choose between the NCCL-based, MPI-based, and compressed implementations by setting `comm_backend_name` to "nccl", "mpi" or "compressed". When using the NCCL-based implementation, there is no need to set `cuda_aware`.
`coeff_beta` is used when calculating a moving average of the LAMB scaling coefficient during the warmup stage. This moving average is then used as the frozen base scaling coefficient during the compression stage.
diff --git a/docs/_tutorials/zero-one-adam.md b/docs/_tutorials/zero-one-adam.md
index 2dd956e802fd7..055c685faf899 100644
--- a/docs/_tutorials/zero-one-adam.md
+++ b/docs/_tutorials/zero-one-adam.md
@@ -62,6 +62,10 @@ Alternatively, the standard mpirun launcher can also be used as follows:
mpirun -np [num processes] -ppn [num GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py]
```
+#### 1.2.3 Compressed implementation
+This backend abstracts the generic part of the one-bit optimizers and implements the accelerator-dependent part with the DeepSpeed custom op builder. To use this `CompressedBackend`, make sure that your current accelerator supports `PackbitsBuilder`, so that it can be loaded to perform high-performance packing and unpacking between float and byte datatypes, which is used in the one-bit algorithms. An example can be found in `Deepspeed/op_builder/xpu/packbits.py`.
+This approach does not require an NCCL or MPI-based communication library. It automatically uses the default communication library selected by your accelerator in `deepspeed/comm`.
+
### 1.3 0/1 Adam Algorithm
The detailed description of the 0/1 Adam algorithm can be seen from our [paper](https://arxiv.org/abs/2202.06009).
@@ -107,7 +111,7 @@ The learning rate policy is the default policy used in 0/1 Adam, and the value o
`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication.
-`comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" or "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`.
+`comm_backend_name` is used to indicate which backend implementation to use. You can choose between the NCCL-based, MPI-based, and compressed implementations by setting `comm_backend_name` to "nccl", "mpi" or "compressed". When using the NCCL-based implementation, there is no need to set `cuda_aware`.
#### 1.4.1 Momentum masks for parameters with constant zero gradients
Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 0/1 Adam we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script.
diff --git a/docs/index.md b/docs/index.md
index 992c8ef6adb5b..1efdcea132d2e 100755
--- a/docs/index.md
+++ b/docs/index.md
@@ -7,26 +7,26 @@ title: "Latest News"
---
DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) .
+* [2024/08] [DeepSpeed on Windows](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/windows/08-2024/chinese/README.md)]
+
+* [2024/08] [DeepNVMe: Improving DL Applications through I/O Optimizations](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/japanese/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-gds/chinese/README.md)]
* [2024/07] [DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/README.md)[[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ucp/japanese/README.md)]
* [2024/03] [DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md)]
* [2024/01] [DeepSpeed-FastGen: Introducting Mixtral, Phi-2, and Falcon support with major performance and feature enhancements.](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/2024-01-19)
-* [2023/11] [Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed](https://github.com/microsoft/DeepSpeed/tree/master/blogs/intel-inference) [[Intel version]](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html)
-* [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp)
-* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)]
-* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)]
-* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)]
-* [2023/08] [DeepSpeed ZeRO-Inference: 20x faster inference through weight quantization and KV cache offloading](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md)
More news
diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index b55c821910b9f..e7f16adbf2a3d 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -3,13 +3,14 @@
# DeepSpeed Team
+import os
import distutils.spawn
import subprocess
-from .builder import OpBuilder
+from .builder import TorchCPUOpBuilder
-class AsyncIOBuilder(OpBuilder):
+class AsyncIOBuilder(TorchCPUOpBuilder):
BUILD_VAR = "DS_BUILD_AIO"
NAME = "async_io"
@@ -19,44 +20,54 @@ def __init__(self):
def absolute_name(self):
return f'deepspeed.ops.aio.{self.NAME}_op'
- def sources(self):
- return [
- 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp',
- 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp',
- 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp',
- 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp',
+ def lib_sources(self):
+ src_list = [
+ 'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', 'csrc/aio/py_lib/deepspeed_py_aio.cpp',
+ 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp',
+ 'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp',
+ 'csrc/aio/common/deepspeed_aio_types.cpp', 'csrc/aio/py_lib/deepspeed_cpu_op.cpp',
+ 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_py_copy.cpp',
'csrc/aio/py_lib/deepspeed_pin_tensor.cpp'
]
+ return src_list
+
+ def sources(self):
+ return self.lib_sources() + ['csrc/aio/py_lib/py_ds_aio.cpp']
def include_paths(self):
- return ['csrc/aio/py_lib', 'csrc/aio/common']
+ import torch
+ if self.build_for_cpu:
+ CUDA_INCLUDE = []
+ elif not self.is_rocm_pytorch():
+ CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
+ else:
+ CUDA_INCLUDE = [
+ os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"),
+ os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"),
+ os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"),
+ ]
+ return ['csrc/aio/py_lib', 'csrc/aio/common'] + CUDA_INCLUDE
def cxx_args(self):
# -O0 for improved debugging, since performance is bound by I/O
- CPU_ARCH = self.cpu_arch()
- SIMD_WIDTH = self.simd_width()
- import torch # Keep this import here to avoid errors when building DeepSpeed wheel without torch installed
+ args = super().cxx_args()
+ import torch
TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2])
- if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1:
- CPP_STD = '-std=c++17'
- else:
- CPP_STD = '-std=c++14'
- return [
- '-g',
- '-Wall',
- '-O0',
- CPP_STD,
- '-shared',
- '-fPIC',
- '-Wno-reorder',
- CPU_ARCH,
- '-fopenmp',
- SIMD_WIDTH,
- '-laio',
- ]
+ if not (TORCH_MAJOR >= 2 and TORCH_MINOR >= 1):
+ args.remove('-std=c++17')
+ args.append('-std=c++14')
+ args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder']
+ return args
def extra_ldflags(self):
- return ['-laio']
+ if self.build_for_cpu:
+ return ['-fopenmp']
+
+ import torch.utils.cpp_extension
+ CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME
+ CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64")
+ ldflags = [f'-L{CUDA_HOME}', f'-L{CUDA_LIB64}', '-laio', '-lcuda', '-lcudart']
+ return ldflags
def check_for_libaio_pkg(self):
libs = dict(
@@ -79,13 +90,13 @@ def check_for_libaio_pkg(self):
break
return found
- def is_compatible(self, verbose=True):
+ def is_compatible(self, verbose=False):
# Check for the existence of libaio by using distutils
# to compile and link a test program that calls io_submit,
# which is a function provided by libaio that is used in the async_io op.
# If needed, one can define -I and -L entries in CFLAGS and LDFLAGS
# respectively to specify the directories for libaio.h and libaio.so.
- aio_compatible = self.has_function('io_pgetevents', ('aio', ))
+ aio_compatible = self.has_function('io_submit', ('aio', ))
if verbose and not aio_compatible:
self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.")
diff --git a/op_builder/builder.py b/op_builder/builder.py
index 03611bf562843..ca4b339e24477 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -4,6 +4,7 @@
# DeepSpeed Team
import os
+import re
import sys
import time
import importlib
@@ -215,19 +216,31 @@ def installed_rocm_version():
ROCM_MAJOR = '0'
ROCM_MINOR = '0'
+ ROCM_VERSION_DEV_RAW = ""
if OpBuilder.is_rocm_pytorch():
from torch.utils.cpp_extension import ROCM_HOME
- rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version-dev")
+ rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version")
if rocm_ver_file.is_file():
with open(rocm_ver_file, 'r') as file:
ROCM_VERSION_DEV_RAW = file.read()
elif "rocm" in torch.__version__:
ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1]
+ if ROCM_VERSION_DEV_RAW != "":
+ ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0]
+ ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1]
else:
+ # Look in /usr/include/rocm-version.h
+ rocm_ver_file = Path("/usr/include/rocm_version.h")
+ if rocm_ver_file.is_file():
+ with open(rocm_ver_file, 'r') as file:
+ for ln in file.readlines():
+ if "#define ROCM_VERSION_MAJOR" in ln:
+ ROCM_MAJOR = re.findall(r'\S+', ln)[2]
+ elif "#define ROCM_VERSION_MINOR" in ln:
+ ROCM_MINOR = re.findall(r'\S+', ln)[2]
+ if ROCM_MAJOR == '0':
assert False, "Could not detect ROCm version"
- assert ROCM_VERSION_DEV_RAW != "", "Could not detect ROCm version"
- ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0]
- ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1]
+
OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR))
return OpBuilder._rocm_version
@@ -235,7 +248,10 @@ def installed_rocm_version():
def get_rocm_gpu_arch():
if OpBuilder._rocm_gpu_arch:
return OpBuilder._rocm_gpu_arch
- rocm_gpu_arch_cmd = "/opt/rocm/bin/rocminfo | grep -o -m 1 'gfx.*'"
+ rocm_info = Path("/opt/rocm/bin/rocminfo")
+ if (not rocm_info.is_file()):
+ rocm_info = Path("rocminfo")
+ rocm_gpu_arch_cmd = str(rocm_info) + " | grep -o -m 1 'gfx.*'"
try:
result = subprocess.check_output(rocm_gpu_arch_cmd, shell=True)
rocm_gpu_arch = result.decode('utf-8').strip()
@@ -248,7 +264,12 @@ def get_rocm_gpu_arch():
def get_rocm_wavefront_size():
if OpBuilder._rocm_wavefront_size:
return OpBuilder._rocm_wavefront_size
- rocm_wavefront_size_cmd = "/opt/rocm/bin/rocminfo | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'"
+
+ rocm_info = Path("/opt/rocm/bin/rocminfo")
+ if (not rocm_info.is_file()):
+ rocm_info = Path("rocminfo")
+ rocm_wavefront_size_cmd = str(
+ rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'"
try:
result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True)
rocm_wavefront_size = result.decode('utf-8').strip()
@@ -284,7 +305,7 @@ def is_compatible(self, verbose=True):
def extra_ldflags(self):
return []
- def has_function(self, funcname, libraries, verbose=False):
+ def has_function(self, funcname, libraries, library_dirs=None, verbose=False):
'''
Test for existence of a function within a tuple of libraries.
@@ -340,7 +361,8 @@ def has_function(self, funcname, libraries, verbose=False):
compiler.link_executable(objs,
os.path.join(tempdir, 'a.out'),
extra_preargs=self.strip_empty_entries(ldflags),
- libraries=libraries)
+ libraries=libraries,
+ library_dirs=library_dirs)
# Compile and link succeeded
return True
@@ -749,11 +771,18 @@ def nvcc_args(self):
except ValueError:
nvcc_threads = min(os.cpu_count(), 8)
- cuda_major, _ = installed_cuda_version()
+ cuda_major, cuda_minor = installed_cuda_version()
+ if cuda_major > 10:
+ if cuda_major == 12 and cuda_minor >= 5:
+ std_lib = '-std=c++20'
+ else:
+ std_lib = '-std=c++17'
+ else:
+ std_lib = '-std=c++14'
args += [
- '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math',
- '-std=c++17' if cuda_major > 10 else '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__',
- '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', f'--threads={nvcc_threads}'
+ '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', std_lib,
+ '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
+ f'--threads={nvcc_threads}'
]
if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1':
args.append('--ptxas-options=-v')
@@ -772,25 +801,32 @@ def libraries_args(self):
class TorchCPUOpBuilder(CUDAOpBuilder):
+ def get_cuda_lib64_path(self):
+ import torch
+ if not self.is_rocm_pytorch():
+ CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
+ if not os.path.exists(CUDA_LIB64):
+ CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib")
+ else:
+ CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")
+ return CUDA_LIB64
+
def extra_ldflags(self):
if self.build_for_cpu:
return ['-fopenmp']
if not self.is_rocm_pytorch():
- return ['-lcurand']
+ ld_flags = ['-lcurand']
+ if not self.build_for_cpu:
+ ld_flags.append(f'-L{self.get_cuda_lib64_path()}')
+ return ld_flags
return []
def cxx_args(self):
- import torch
args = []
if not self.build_for_cpu:
- if not self.is_rocm_pytorch():
- CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
- if not os.path.exists(CUDA_LIB64):
- CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib")
- else:
- CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")
+ CUDA_LIB64 = self.get_cuda_lib64_path()
args += super().cxx_args()
args += [
diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py
index 6e7721f94e012..af3aa74297752 100644
--- a/op_builder/evoformer_attn.py
+++ b/op_builder/evoformer_attn.py
@@ -41,18 +41,21 @@ def nvcc_args(self):
args.append(f"-DGPU_ARCH={major}{minor}")
return args
- def is_compatible(self, verbose=True):
+ def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
- self.warning("Please install torch if trying to pre-compile kernels")
+ if verbose:
+ self.warning("Please install torch if trying to pre-compile kernels")
return False
if self.cutlass_path is None:
- self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH")
+ if verbose:
+ self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH")
return False
with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f:
if '3.1.0' not in f.read():
- self.warning("Please use CUTLASS version >= 3.1.0")
+ if verbose:
+ self.warning("Please use CUTLASS version >= 3.1.0")
return False
cuda_okay = True
if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda
@@ -60,10 +63,12 @@ def is_compatible(self, verbose=True):
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 7:
- self.warning("Please use a GPU with compute capability >= 7.0")
+ if verbose:
+ self.warning("Please use a GPU with compute capability >= 7.0")
cuda_okay = False
if torch_cuda_major < 11 or sys_cuda_major < 11:
- self.warning("Please use CUDA 11+")
+ if verbose:
+ self.warning("Please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay
diff --git a/op_builder/fp_quantizer.py b/op_builder/fp_quantizer.py
index 9f07ec3d1275e..40cf504c2c834 100644
--- a/op_builder/fp_quantizer.py
+++ b/op_builder/fp_quantizer.py
@@ -3,6 +3,11 @@
# DeepSpeed Team
+try:
+ from packaging import version as pkg_version
+except ImportError:
+ pkg_version = None
+
from .builder import CUDAOpBuilder, installed_cuda_version
@@ -17,11 +22,12 @@ def __init__(self, name=None):
def absolute_name(self):
return f'deepspeed.ops.fp_quantizer.{self.NAME}_op'
- def is_compatible(self, verbose=True):
+ def is_compatible(self, verbose=False):
try:
import torch
except ImportError:
- self.warning("Please install torch if trying to pre-compile inference kernels")
+ if verbose:
+ self.warning("Please install torch if trying to pre-compile inference kernels")
return False
cuda_okay = True
@@ -30,12 +36,39 @@ def is_compatible(self, verbose=True):
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 8:
- self.warning("NVIDIA Inference is only supported on Ampere and newer architectures")
+ if verbose:
+ self.warning("NVIDIA Inference is only supported on Ampere and newer architectures")
cuda_okay = False
if cuda_capability >= 8:
if torch_cuda_major < 11 or sys_cuda_major < 11:
- self.warning("On Ampere and higher architectures please use CUDA 11+")
+ if verbose:
+ self.warning("On Ampere and higher architectures please use CUDA 11+")
cuda_okay = False
+
+ try:
+ import triton
+ except ImportError:
+ if verbose:
+ self.warning(f"please install triton==2.3.0 or 2.3.1 if you want to use the FP Quantizer Kernels")
+ return False
+
+ # triton 2.3.0 and 2.3.1 are okay and the only versions released in 2.3.x before 3.x was released
+ if pkg_version:
+ allowed = pkg_version.parse("2.3")
+ installed_triton = pkg_version.parse(triton.__version__)
+ triton_mismatch = installed_triton.major != allowed.major or installed_triton.minor != allowed.minor
+ else:
+ installed_triton = triton.__version__
+ major, minor, _ = installed_triton.split(".")
+ triton_mismatch = major != "2" or minor != "3"
+
+ if triton_mismatch:
+ if verbose:
+ self.warning(
+ f"FP Quantizer is using an untested triton version ({installed_triton}), only 2.3.0 and 2.3.1 are known to be compatible with these kernels"
+ )
+ return False
+
return super().is_compatible(verbose) and cuda_okay
def filter_ccs(self, ccs):
diff --git a/op_builder/gds.py b/op_builder/gds.py
new file mode 100644
index 0000000000000..e024674e01d8e
--- /dev/null
+++ b/op_builder/gds.py
@@ -0,0 +1,50 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import os
+from .async_io import AsyncIOBuilder
+
+
+class GDSBuilder(AsyncIOBuilder):
+ BUILD_VAR = "DS_BUILD_GDS"
+ NAME = "gds"
+
+ def __init__(self):
+ super().__init__()
+
+ def absolute_name(self):
+ return f'deepspeed.ops.gds.{self.NAME}_op'
+
+ def lib_sources(self):
+ src_list = ['csrc/gds/py_lib/deepspeed_py_gds_handle.cpp', 'csrc/gds/py_lib/deepspeed_gds_op.cpp']
+ return super().lib_sources() + src_list
+
+ def sources(self):
+ return self.lib_sources() + ['csrc/gds/py_lib/py_ds_gds.cpp']
+
+ def cxx_args(self):
+ return super().cxx_args() + ['-lcufile']
+
+ def include_paths(self):
+ import torch
+ CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
+ return ['csrc/aio/py_lib', 'csrc/aio/common'] + CUDA_INCLUDE
+
+ def extra_ldflags(self):
+ return super().extra_ldflags() + ['-lcufile']
+
+ def is_compatible(self, verbose=False):
+ import torch.utils.cpp_extension
+ CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME
+ CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64")
+ gds_compatible = self.has_function(funcname="cuFileDriverOpen",
+ libraries=("cufile", ),
+ library_dirs=(
+ CUDA_HOME,
+ CUDA_LIB64,
+ ),
+ verbose=verbose)
+
+ return gds_compatible and super().is_compatible(verbose)
diff --git a/op_builder/inference_core_ops.py b/op_builder/inference_core_ops.py
index d1957f39d9a86..45e8628e669fc 100755
--- a/op_builder/inference_core_ops.py
+++ b/op_builder/inference_core_ops.py
@@ -23,7 +23,8 @@ def is_compatible(self, verbose=True):
try:
import torch
except ImportError:
- self.warning("Please install torch if trying to pre-compile inference kernels")
+ if verbose:
+ self.warning("Please install torch if trying to pre-compile inference kernels")
return False
cuda_okay = True
@@ -32,11 +33,13 @@ def is_compatible(self, verbose=True):
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 6:
- self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
+ if verbose:
+ self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
cuda_okay = False
if cuda_capability >= 8:
if torch_cuda_major < 11 or sys_cuda_major < 11:
- self.warning("On Ampere and higher architectures please use CUDA 11+")
+ if verbose:
+ self.warning("On Ampere and higher architectures please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay
diff --git a/op_builder/inference_cutlass_builder.py b/op_builder/inference_cutlass_builder.py
index 51f7931d94352..fda6e74bbf6a9 100644
--- a/op_builder/inference_cutlass_builder.py
+++ b/op_builder/inference_cutlass_builder.py
@@ -22,7 +22,8 @@ def is_compatible(self, verbose=True):
try:
import torch
except ImportError:
- self.warning("Please install torch if trying to pre-compile inference kernels")
+ if verbose:
+ self.warning("Please install torch if trying to pre-compile inference kernels")
return False
cuda_okay = True
@@ -31,11 +32,13 @@ def is_compatible(self, verbose=True):
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 6:
- self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
+ if verbose:
+ self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
cuda_okay = False
if cuda_capability >= 8:
if torch_cuda_major < 11 or sys_cuda_major < 11:
- self.warning("On Ampere and higher architectures please use CUDA 11+")
+ if verbose:
+ self.warning("On Ampere and higher architectures please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay
diff --git a/op_builder/npu/fused_adam.py b/op_builder/npu/fused_adam.py
index fc1bc83c7cc7c..d32103db70556 100644
--- a/op_builder/npu/fused_adam.py
+++ b/op_builder/npu/fused_adam.py
@@ -16,8 +16,8 @@ class NPUFusedAdam:
@staticmethod
def multi_tensor_adam(chunk_size, noop_flag_buffer, tensor_lists, lr, beta1, beta2, epsilon, step, adam_w_mode,
bias_correction, weight_decay, *args):
- bias_correction1 = beta1**step
- bias_correction2 = beta2**step
+ bias_correction1 = beta1**(step - 1)
+ bias_correction2 = beta2**(step - 1)
# iteration group['params']
for i in range(len(tensor_lists[0])):
diff --git a/op_builder/ragged_ops.py b/op_builder/ragged_ops.py
index ec7cab91885f0..a4e365786a2be 100644
--- a/op_builder/ragged_ops.py
+++ b/op_builder/ragged_ops.py
@@ -23,7 +23,8 @@ def is_compatible(self, verbose=True):
try:
import torch
except ImportError:
- self.warning("Please install torch if trying to pre-compile inference kernels")
+ if verbose:
+ self.warning("Please install torch if trying to pre-compile inference kernels")
return False
cuda_okay = True
@@ -32,11 +33,13 @@ def is_compatible(self, verbose=True):
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 6:
- self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
+ if verbose:
+ self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
cuda_okay = False
if cuda_capability >= 8:
if torch_cuda_major < 11 or sys_cuda_major < 11:
- self.warning("On Ampere and higher architectures please use CUDA 11+")
+ if verbose:
+ self.warning("On Ampere and higher architectures please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay
diff --git a/op_builder/ragged_utils.py b/op_builder/ragged_utils.py
index 89450e1fd30d9..a855f072af8c7 100755
--- a/op_builder/ragged_utils.py
+++ b/op_builder/ragged_utils.py
@@ -23,7 +23,8 @@ def is_compatible(self, verbose=True):
try:
import torch
except ImportError:
- self.warning("Please install torch if trying to pre-compile inference kernels")
+ if verbose:
+ self.warning("Please install torch if trying to pre-compile inference kernels")
return False
cuda_okay = True
@@ -32,11 +33,13 @@ def is_compatible(self, verbose=True):
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda
if cuda_capability < 6:
- self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
+ if verbose:
+ self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
cuda_okay = False
if cuda_capability >= 8:
if torch_cuda_major < 11 or sys_cuda_major < 11:
- self.warning("On Ampere and higher architectures please use CUDA 11+")
+ if verbose:
+ self.warning("On Ampere and higher architectures please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay
diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py
index 188d257ff4ef2..2385adc8fe9cc 100644
--- a/op_builder/sparse_attn.py
+++ b/op_builder/sparse_attn.py
@@ -27,45 +27,51 @@ def sources(self):
def cxx_args(self):
return ['-O2', '-fopenmp']
- def is_compatible(self, verbose=True):
+ def is_compatible(self, verbose=False):
# Check to see if llvm and cmake are installed since they are dependencies
#required_commands = ['llvm-config|llvm-config-9', 'cmake']
#command_status = list(map(self.command_exists, required_commands))
#deps_compatible = all(command_status)
if self.is_rocm_pytorch():
- self.warning(f'{self.NAME} is not compatible with ROCM')
+ if verbose:
+ self.warning(f'{self.NAME} is not compatible with ROCM')
return False
try:
import torch
except ImportError:
- self.warning(f"unable to import torch, please install it first")
+ if verbose:
+ self.warning(f"unable to import torch, please install it first")
return False
# torch-cpu will not have a cuda version
if torch.version.cuda is None:
cuda_compatible = False
- self.warning(f"{self.NAME} cuda is not available from torch")
+ if verbose:
+ self.warning(f"{self.NAME} cuda is not available from torch")
else:
major, minor = torch.version.cuda.split('.')[:2]
cuda_compatible = (int(major) == 10 and int(minor) >= 1) or (int(major) >= 11)
if not cuda_compatible:
- self.warning(f"{self.NAME} requires CUDA version 10.1+")
+ if verbose:
+ self.warning(f"{self.NAME} requires CUDA version 10.1+")
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
torch_compatible = (TORCH_MAJOR == 1 and TORCH_MINOR >= 5)
if not torch_compatible:
- self.warning(
- f'{self.NAME} requires a torch version >= 1.5 and < 2.0 but detected {TORCH_MAJOR}.{TORCH_MINOR}')
+ if verbose:
+ self.warning(
+ f'{self.NAME} requires a torch version >= 1.5 and < 2.0 but detected {TORCH_MAJOR}.{TORCH_MINOR}')
try:
import triton
except ImportError:
# auto-install of triton is broken on some systems, reverting to manual install for now
# see this issue: https://github.com/microsoft/DeepSpeed/issues/1710
- self.warning(f"please install triton==1.0.0 if you want to use sparse attention")
+ if verbose:
+ self.warning(f"please install triton==1.0.0 if you want to use sparse attention")
return False
if pkg_version:
@@ -76,7 +82,9 @@ def is_compatible(self, verbose=True):
triton_mismatch = installed_triton != "1.0.0"
if triton_mismatch:
- self.warning(f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible")
+ if verbose:
+ self.warning(
+ f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible")
return False
return super().is_compatible(verbose) and torch_compatible and cuda_compatible
diff --git a/op_builder/spatial_inference.py b/op_builder/spatial_inference.py
index 59caf57f938db..8a6b36cce0b08 100644
--- a/op_builder/spatial_inference.py
+++ b/op_builder/spatial_inference.py
@@ -21,7 +21,8 @@ def is_compatible(self, verbose=True):
try:
import torch
except ImportError:
- self.warning("Please install torch if trying to pre-compile inference kernels")
+ if verbose:
+ self.warning("Please install torch if trying to pre-compile inference kernels")
return False
cuda_okay = True
@@ -31,7 +32,8 @@ def is_compatible(self, verbose=True):
cuda_capability = torch.cuda.get_device_properties(0).major
if cuda_capability >= 8:
if torch_cuda_major < 11 or sys_cuda_major < 11:
- self.warning("On Ampere and higher architectures please use CUDA 11+")
+ if verbose:
+ self.warning("On Ampere and higher architectures please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay
diff --git a/op_builder/transformer_inference.py b/op_builder/transformer_inference.py
index 5ee902289448c..88b77499cc0e9 100755
--- a/op_builder/transformer_inference.py
+++ b/op_builder/transformer_inference.py
@@ -21,7 +21,8 @@ def is_compatible(self, verbose=True):
try:
import torch
except ImportError:
- self.warning("Please install torch if trying to pre-compile inference kernels")
+ if verbose:
+ self.warning("Please install torch if trying to pre-compile inference kernels")
return False
cuda_okay = True
@@ -30,11 +31,13 @@ def is_compatible(self, verbose=True):
torch_cuda_major = int(torch.version.cuda.split('.')[0])
cuda_capability = torch.cuda.get_device_properties(0).major
if cuda_capability < 6:
- self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
+ if verbose:
+ self.warning("NVIDIA Inference is only supported on Pascal and newer architectures")
cuda_okay = False
if cuda_capability >= 8:
if torch_cuda_major < 11 or sys_cuda_major < 11:
- self.warning("On Ampere and higher architectures please use CUDA 11+")
+ if verbose:
+ self.warning("On Ampere and higher architectures please use CUDA 11+")
cuda_okay = False
return super().is_compatible(verbose) and cuda_okay
diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
index c0fc5dba9d33e..2e2c880fbeb0d 100644
--- a/requirements/requirements-dev.txt
+++ b/requirements/requirements-dev.txt
@@ -1,5 +1,5 @@
accelerate
-clang-format==16.0.2
+clang-format==18.1.3
comet_ml>=3.41.0
deepspeed-kernels ; sys_platform == 'linux'
docutils<0.18
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 05f88337f3a95..6840d6dbcc98c 100755
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,7 +1,6 @@
hjson
ninja
numpy
-nvidia-ml-py
packaging>=20.0
psutil
py-cpuinfo
diff --git a/setup.py b/setup.py
index 183d42907e829..8707209526ad1 100755
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,8 @@
from op_builder.all_ops import ALL_OPS, accelerator_name
from op_builder.builder import installed_cuda_version
+from accelerator import get_accelerator
+
# Fetch rocm state.
is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
rocm_version = OpBuilder.installed_rocm_version()
@@ -90,8 +92,12 @@ def get_env_if_set(key, default: typing.Any = ""):
'triton': fetch_requirements('requirements/requirements-triton.txt'),
}
+# Only install pynvml on nvidia gpus.
+if torch_available and get_accelerator().device_name() == 'cuda' and not is_rocm_pytorch:
+ install_requires.append('nvidia-ml-py')
+
# Add specific cupy version to both onebit extension variants.
-if torch_available and torch.cuda.is_available():
+if torch_available and get_accelerator().device_name() == 'cuda':
cupy = None
if is_rocm_pytorch:
rocm_major, rocm_minor = rocm_version
@@ -120,7 +126,6 @@ def get_env_if_set(key, default: typing.Any = ""):
# For any pre-installed ops force disable ninja.
if torch_available:
- from accelerator import get_accelerator
use_ninja = is_env_set("DS_ENABLE_NINJA")
cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=use_ninja)
@@ -131,7 +136,7 @@ def get_env_if_set(key, default: typing.Any = ""):
TORCH_MAJOR = "0"
TORCH_MINOR = "0"
-if torch_available and not torch.cuda.is_available():
+if torch_available and not get_accelerator().device_name() == 'cuda':
# Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486.
print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
"you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py
index 25256d376eeb5..dfab28aa7477c 100644
--- a/tests/unit/alexnet_model.py
+++ b/tests/unit/alexnet_model.py
@@ -14,6 +14,7 @@
from deepspeed.utils.torch import required_torch_version
from deepspeed.accelerator import get_accelerator
from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec
+from .util import no_child_process_in_deepspeed_io
class AlexNet(nn.Module):
@@ -125,22 +126,11 @@ def train_cifar(model, config, num_steps=400, average_dp_losses=True, fp16=True,
trainset = cifar_trainset(fp16=fp16)
config['local_rank'] = dist.get_rank()
- # deepspeed_io defaults to creating a dataloader that uses a
- # multiprocessing pool. Our tests use pools and we cannot nest pools in
- # python. Therefore we're injecting this kwarg to ensure that no pools
- # are used in the dataloader.
- old_method = deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io
-
- def new_method(*args, **kwargs):
- kwargs["num_local_io_workers"] = 0
- return old_method(*args, **kwargs)
-
- deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io = new_method
-
- engine, _, _, _ = deepspeed.initialize(config=config,
- model=model,
- model_parameters=[p for p in model.parameters()],
- training_data=trainset)
+ with no_child_process_in_deepspeed_io():
+ engine, _, _, _ = deepspeed.initialize(config=config,
+ model=model,
+ model_parameters=[p for p in model.parameters()],
+ training_data=trainset)
losses = []
for step in range(num_steps):
diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index d512266c8a6f6..eadf670d93283 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -77,7 +77,7 @@
@dataclass
class ModelInfo:
- modelId: str
+ id: str
pipeline_tag: str
tags: List[str]
@@ -94,7 +94,10 @@ def _hf_model_list() -> List[ModelInfo]:
model_data = {"cache_time": 0, "model_list": []}
if os.path.isfile(cache_file_path):
with open(cache_file_path, 'rb') as f:
- model_data = pickle.load(f)
+ try:
+ model_data = pickle.load(f)
+ except Exception as e:
+ print(f"Error loading cache file {cache_file_path}: {e}")
current_time = time.time()
@@ -108,7 +111,7 @@ def _hf_model_list() -> List[ModelInfo]:
for model in _test_models:
model_list.extend(api.list_models(model_name=model))
model_data["model_list"] = [
- ModelInfo(modelId=m.modelId, pipeline_tag=m.pipeline_tag, tags=m.tags) for m in model_list
+ ModelInfo(id=m.id, pipeline_tag=m.pipeline_tag, tags=m.tags) for m in model_list
]
break # Exit the loop if the operation is successful
except requests.exceptions.HTTPError as e:
@@ -129,8 +132,8 @@ def _hf_model_list() -> List[ModelInfo]:
# Get a list of all models and mapping from task to supported models
_hf_models = _hf_model_list()
-_hf_model_names = [m.modelId for m in _hf_models]
-_hf_task_to_models = {task: [m.modelId for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks}
+_hf_model_names = [m.id for m in _hf_models]
+_hf_task_to_models = {task: [m.id for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks}
# Get all combinations of task:model to test
_model_w_tasks = [(m, t) for m, t in itertools.product(*[_test_models, _test_tasks]) if m in _hf_task_to_models[t]]
diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py
index 5f1ef930952c3..5a99422ba9ff8 100644
--- a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py
+++ b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py
@@ -13,7 +13,7 @@
@pytest.mark.inference_v2_ops
@pytest.mark.parametrize("n_tokens, history_size", [(1, 0), (17, 0), (33, 8), (63, 1)])
-@pytest.mark.parametrize("head_size", [64, 80, 128])
+@pytest.mark.parametrize("head_size", [64, 80, 96, 128])
def test_single_sequence_single_block(n_tokens: int, history_size: int, head_size: int):
"""
Validate that the copy works correctly
@@ -46,7 +46,7 @@ def test_single_sequence_single_block(n_tokens: int, history_size: int, head_siz
@pytest.mark.inference_v2_ops
@pytest.mark.parametrize("n_tokens, history_size", [(128, 0), (177, 0), (169, 8), (117, 88)])
-@pytest.mark.parametrize("head_size", [64, 80, 128])
+@pytest.mark.parametrize("head_size", [64, 80, 96, 128])
def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int, head_size: int):
"""
Validate that the copy works correctly
@@ -78,7 +78,7 @@ def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int, head_
@pytest.mark.inference_v2_ops
-@pytest.mark.parametrize("head_size", [64, 80, 128])
+@pytest.mark.parametrize("head_size", [64, 80, 96, 128])
def test_multi_sequence(head_size: int) -> None:
n_heads_q = 16
n_heads_kv = 16
diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py
index 156be9929d92e..33dd0a4c27005 100644
--- a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py
+++ b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py
@@ -84,7 +84,7 @@ def rotate_half(x: torch.Tensor) -> torch.Tensor:
@pytest.mark.inference_v2_ops
@pytest.mark.parametrize("n_tokens, history_size", [(1, 0), (17, 0), (33, 15), (1, 63)])
@pytest.mark.parametrize("trained_emb", [False, True])
-@pytest.mark.parametrize("head_size", [64, 80])
+@pytest.mark.parametrize("head_size", [64, 80, 96])
def test_single_sequence_single_block(n_tokens: int, history_size: int, trained_emb: bool, head_size: int):
"""
Validate that the copy works correctly
@@ -128,7 +128,7 @@ def test_single_sequence_single_block(n_tokens: int, history_size: int, trained_
@pytest.mark.inference_v2_ops
@pytest.mark.parametrize("n_tokens, history_size", [(128, 0), (177, 0), (169, 8), (117, 88)])
@pytest.mark.parametrize("trained_emb", [False, True])
-@pytest.mark.parametrize("head_size", [64, 80])
+@pytest.mark.parametrize("head_size", [64, 80, 96])
def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int, trained_emb: bool, head_size: int):
"""
Validate that the copy works correctly
@@ -171,7 +171,7 @@ def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int, train
@pytest.mark.inference_v2_ops
@pytest.mark.parametrize("trained_emb", [False, True])
-@pytest.mark.parametrize("head_size", [64, 80])
+@pytest.mark.parametrize("head_size", [64, 80, 96])
def test_multi_sequences(trained_emb: bool, head_size: int) -> None:
n_heads_q = 16
n_heads_kv = 16
@@ -216,9 +216,9 @@ def test_multi_sequences(trained_emb: bool, head_size: int) -> None:
@pytest.mark.inference_v2_ops
-def test_rotary_dim() -> None:
+@pytest.mark.parametrize("head_size", [80, 96])
+def test_rotary_dim(head_size: int) -> None:
trained_emb = False
- head_size = 80
rotary_dim = 64
n_heads_q = 16
n_heads_kv = 16
diff --git a/tests/unit/linear/test_ctx.py b/tests/unit/linear/test_ctx.py
new file mode 100644
index 0000000000000..e03d13fd6ce29
--- /dev/null
+++ b/tests/unit/linear/test_ctx.py
@@ -0,0 +1,106 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import torch
+import deepspeed
+import pytest
+from unit.common import DistributedTest
+
+import deepspeed.comm as dist
+from deepspeed.linear import LoRAConfig, init_lora
+from deepspeed.linear.optimized_linear import LoRAOptimizedLinear
+from unit.simple_model import random_dataloader, SimpleModel
+
+try:
+ import transformers
+except ImportError:
+ transformers = None
+
+if transformers is None:
+ pytest.skip("transformers is required for this test", allow_module_level=True)
+
+
+def injection_assert(model):
+ # pick out random linear that should have been replaced and initialized
+ q_proj = model.model.layers[1].self_attn.q_proj
+
+ assert isinstance(q_proj, LoRAOptimizedLinear), "injection did not happen"
+ assert q_proj._initialized, "lora was not initialized properly"
+ assert isinstance(q_proj.lora_weight_1, torch.nn.Linear)
+ assert isinstance(q_proj.lora_weight_2, torch.nn.Linear)
+
+
+class TestEngine(DistributedTest):
+ world_size = 2
+
+ def test_model(self):
+ lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=2)
+ quant_config = None
+ hidden_dim = 64
+ nlayers = 4
+
+ with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config):
+ model = SimpleModel(hidden_dim=hidden_dim, nlayers=nlayers)
+
+ init_lora(model)
+
+ model_norms = [model.linears[i].weight.norm().item() for i in range(nlayers)]
+
+ ds_config = {
+ "train_batch_size": 2,
+ "steps_per_print": 1,
+ "bf16": {
+ "enabled": True
+ },
+ "optimizer": {
+ "type": "Adam",
+ "params": {
+ "lr": 0.00015
+ }
+ },
+ "zero_optimization": {
+ "stage": 1
+ }
+ }
+ model, *_ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters())
+
+ engine_norms = [model.module.linears[i].weight.norm().item() for i in range(nlayers)]
+
+ # Ensure that sharded weights are not broadcast during engine init
+ assert engine_norms == model_norms, f"{dist.get_rank()=} base weight norms are not the same after engine init, {engine_norms=} != {model_norms=}"
+
+ data_loader = random_dataloader(model=model,
+ total_samples=50,
+ hidden_dim=hidden_dim,
+ device=model.device,
+ dtype=torch.bfloat16)
+ for n, batch in enumerate(data_loader):
+ loss = model(batch[0], batch[1])
+ model.backward(loss)
+ model.step()
+
+
+class TestInitTransformers(DistributedTest):
+ world_size = 2
+
+ def test_pretrained_init(self):
+ lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=2)
+ quant_config = None
+
+ with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config):
+ model = transformers.AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-Llama-3")
+
+ injection_assert(model)
+
+ def test_config_init(self):
+ lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=2)
+ quant_config = None
+
+ config = transformers.AutoConfig.from_pretrained("llamafactory/tiny-random-Llama-3")
+
+ with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config):
+ model = transformers.AutoModelForCausalLM.from_config(config)
+
+ injection_assert(model)
diff --git a/tests/unit/linear/test_quant_param.py b/tests/unit/linear/test_quant_param.py
index 9479b3cba8a04..84a9f766ef741 100644
--- a/tests/unit/linear/test_quant_param.py
+++ b/tests/unit/linear/test_quant_param.py
@@ -42,7 +42,7 @@ def test_move_to_accelerator(self):
assert qp.device == torch.device('cpu')
qp = qp.to(get_accelerator().current_device_name())
assert qp.device == torch.device(device)
- assert qp.dtype == torch.int8
+ assert qp.dtype == torch.uint8
def test_hf_clone(self):
device = get_accelerator().current_device_name()
diff --git a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py
index fdff9430a4e6a..f65d5e2a03bca 100644
--- a/tests/unit/moe/test_moe.py
+++ b/tests/unit/moe/test_moe.py
@@ -11,7 +11,7 @@
from unit.simple_model import SimplePRMoEModel, SimpleMoEModel, sequence_dataloader
import deepspeed.comm as dist
from deepspeed import get_accelerator
-from deepspeed.moe.sharded_moe import top1gating
+from deepspeed.moe.sharded_moe import top1gating, topkgating
from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer, is_moe_param
from deepspeed.utils.torch import required_torch_version
@@ -191,3 +191,50 @@ def test(self):
drop_tokens=False,
use_rts=True,
use_tutel=False)
+
+
+class TestTopkGate(DistributedTest):
+
+ def test(self):
+
+ def check_equal(logits, cap, sparse_truth, res):
+ m, n = logits.shape
+ dispatch_mask_truth = torch.zeros(m, n, cap)
+ i, j, k = sparse_truth.t()
+ dispatch_mask_truth[i, j, k] = 1
+ assert (torch.equal(dispatch_mask_truth, res))
+
+ #s=4 e=4 topk=2 cap=2(s*topk/e)
+ logits = torch.tensor([[0.11, 0.2, 0.1, 0.3], [0.3, 0.4, 0.11, 0.1], [0.11, 0.1, 0.6, 0.5],
+ [0.1, 0.11, 0.7, 0.8]])
+ logits *= dist.get_rank() + 1
+ probs_dispatch_res = topkgating(logits, 2, 1, min_capacity=1, drop_policy='probs')[2]
+ probs_sec_sparse = torch.tensor([[0, 1, 0], [1, 0, 0], [1, 1, 1], [2, 2, 0], [2, 3, 0], [3, 2, 1], [3, 3, 1]])
+ check_equal(logits, 2, probs_sec_sparse, probs_dispatch_res)
+
+ position_sec_sparse = torch.tensor([[0, 1, 0], [0, 3, 0], [1, 0, 0], [1, 1, 1], [2, 2, 0], [2, 3, 1],
+ [3, 2, 1]])
+ position_dispatch_res = topkgating(logits, 2, 1, min_capacity=1, drop_policy='position')[2]
+ check_equal(logits, 2, position_sec_sparse, position_dispatch_res)
+
+ #s=4 e=6 topk=3 cap=2(s*topk/e)
+ logits2 = torch.tensor([[0.5858, 0.4801, 0.6269, 0.5397, 0.9722, 0.7034],
+ [0.5445, 0.6332, 0.4519, 0.6308, 0.0519, 0.6450],
+ [0.4874, 0.8110, 0.7467, 0.8474, 0.0277, 0.3068],
+ [0.8570, 0.6714, 0.5310, 0.3274, 0.4836, 0.9892]])
+ logits2 *= dist.get_rank() + 1
+
+ #top3 full mask #prob_mask #postion_mask
+ #0 0 1 0 1 1 #0 0 1 0 1 1 #0 0 1 0 1 1
+ #0 1 0 1 0 1 #0 0 0 1 0 0 #0 1 0 1 0 1
+ #0 1 1 1 0 0 #0 1 1 1 0 0 #0 1 1 1 0 0
+ #1 1 0 0 0 1 #1 1 0 0 0 1 #1 0 0 0 0 0
+ probs_dispatch_res = topkgating(logits2, 3, 1, min_capacity=1, drop_policy='probs')[2]
+ probs_sec_sparse = torch.tensor([[0, 2, 0], [0, 4, 0], [0, 5, 0], [1, 3, 0], [2, 1, 0], [2, 2, 1], [2, 3, 1],
+ [3, 0, 0], [3, 1, 1], [3, 5, 1]])
+ check_equal(logits2, 2, probs_sec_sparse, probs_dispatch_res)
+
+ position_sec_sparse = torch.tensor([[0, 2, 0], [0, 4, 0], [0, 5, 0], [1, 1, 0], [1, 3, 0], [1, 5, 1],
+ [2, 1, 1], [2, 2, 1], [2, 3, 1], [3, 0, 0]])
+ position_dispatch_res = topkgating(logits2, 3, 1, min_capacity=1, drop_policy='position')[2]
+ check_equal(logits2, 2, position_sec_sparse, position_dispatch_res)
diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py
index f6d175ce67bca..e6927efc38248 100644
--- a/tests/unit/ops/aio/test_aio.py
+++ b/tests/unit/ops/aio/test_aio.py
@@ -78,7 +78,7 @@ def _validate_handle_state(handle, single_submit, overlap_events):
assert handle.get_queue_depth() == QUEUE_DEPTH
-@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False])
+@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken
@pytest.mark.parametrize("single_submit", [True, False])
@pytest.mark.parametrize("overlap_events", [True, False])
class TestRead(DistributedTest):
@@ -144,7 +144,7 @@ def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap
h.free_cpu_locked_tensor(aio_buffer)
-@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False])
+@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken
@pytest.mark.parametrize("single_submit", [True, False])
@pytest.mark.parametrize("overlap_events", [True, False])
class TestWrite(DistributedTest):
@@ -213,7 +213,7 @@ def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overla
@pytest.mark.sequential
-@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False])
+@pytest.mark.parametrize("use_cuda_pinned_tensor", [True]) # TODO: aio_handle pinned tensor API is broken
@pytest.mark.parametrize("cuda_device", [True, False])
class TestAsyncQueue(DistributedTest):
world_size = 1
diff --git a/tests/unit/ops/aio/test_gds.py b/tests/unit/ops/aio/test_gds.py
new file mode 100644
index 0000000000000..53655994b5601
--- /dev/null
+++ b/tests/unit/ops/aio/test_gds.py
@@ -0,0 +1,270 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import os
+import filecmp
+import torch
+import deepspeed
+import deepspeed.comm as dist
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder import GDSBuilder
+from unit.common import DistributedTest
+
+KILO_BYTE = 1024 * 256
+BLOCK_SIZE = KILO_BYTE
+QUEUE_DEPTH = 2
+IO_SIZE = 4 * BLOCK_SIZE
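+# note: despite its name, KILO_BYTE here is 256 KiB, so IO_SIZE is 4 * 256 KiB = 1 MiB per transfer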
+IO_PARALLEL = 2
+
+if not deepspeed.ops.__compatible_ops__[GDSBuilder.NAME]:
+ pytest.skip('Skip tests since gds is not compatible', allow_module_level=True)
+
+
+def _get_local_rank():
+ if get_accelerator().is_available():
+ return dist.get_rank()
+ return 0
+
+
+def _do_ref_write(tmpdir, index=0):
+ file_suffix = f'{_get_local_rank()}_{index}'
+ ref_file = os.path.join(tmpdir, f'_py_random_{file_suffix}.pt')
+ ref_buffer = os.urandom(IO_SIZE)
+ with open(ref_file, 'wb') as f:
+ f.write(ref_buffer)
+
+ return ref_file, ref_buffer
+
+
+def _get_test_write_file(tmpdir, index):
+ file_suffix = f'{_get_local_rank()}_{index}'
+ return os.path.join(tmpdir, f'_gds_write_random_{file_suffix}.pt')
+
+
+def _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, gds_handle, index=0):
+ test_file = _get_test_write_file(tmpdir, index)
+ test_buffer = get_accelerator().ByteTensor(list(ref_buffer))
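+ # pin (register) the device buffer with the GDS handle before using it for file I/O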
+ gds_handle.pin_device_tensor(test_buffer)
+ return test_file, test_buffer
+
+
+def _validate_handle_state(handle, single_submit, overlap_events):
+ assert handle.get_single_submit() == single_submit
+ assert handle.get_overlap_events() == overlap_events
+ assert handle.get_thread_count() == IO_PARALLEL
+ assert handle.get_block_size() == BLOCK_SIZE
+ assert handle.get_queue_depth() == QUEUE_DEPTH
+
+
+@pytest.mark.parametrize("single_submit", [True, False])
+@pytest.mark.parametrize("overlap_events", [True, False])
+class TestRead(DistributedTest):
+ world_size = 1
+ reuse_dist_env = True
+ if not get_accelerator().is_available():
+ init_distributed = False
+ set_dist_env = False
+
+ def test_parallel_read(self, tmpdir, single_submit, overlap_events):
+
+ h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+ gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
+ h.pin_device_tensor(gds_buffer)
+
+ _validate_handle_state(h, single_submit, overlap_events)
+
+ ref_file, _ = _do_ref_write(tmpdir)
+ read_status = h.sync_pread(gds_buffer, ref_file)
+ assert read_status == 1
+
+ with open(ref_file, 'rb') as f:
+ ref_buffer = list(f.read())
+ assert ref_buffer == gds_buffer.tolist()
+
+ h.unpin_device_tensor(gds_buffer)
+
+ def test_async_read(self, tmpdir, single_submit, overlap_events):
+
+ h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+ gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
+ h.pin_device_tensor(gds_buffer)
+
+ _validate_handle_state(h, single_submit, overlap_events)
+
+ ref_file, _ = _do_ref_write(tmpdir)
+ read_status = h.async_pread(gds_buffer, ref_file)
+ assert read_status == 0
+
+ wait_status = h.wait()
+ assert wait_status == 1
+
+ with open(ref_file, 'rb') as f:
+ ref_buffer = list(f.read())
+ assert ref_buffer == gds_buffer.tolist()
+
+ h.unpin_device_tensor(gds_buffer)
+
+
+@pytest.mark.parametrize("single_submit", [True, False])
+@pytest.mark.parametrize("overlap_events", [True, False])
+class TestWrite(DistributedTest):
+ world_size = 1
+ reuse_dist_env = True
+ if not get_accelerator().is_available():
+ init_distributed = False
+ set_dist_env = False
+
+ def test_parallel_write(self, tmpdir, single_submit, overlap_events):
+
+ ref_file, ref_buffer = _do_ref_write(tmpdir)
+ h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+ gds_file, gds_buffer = _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, h)
+
+ _validate_handle_state(h, single_submit, overlap_events)
+
+ write_status = h.sync_pwrite(gds_buffer, gds_file)
+ assert write_status == 1
+
+ h.unpin_device_tensor(gds_buffer)
+
+ assert os.path.isfile(gds_file)
+
+ filecmp.clear_cache()
+ assert filecmp.cmp(ref_file, gds_file, shallow=False)
+
+ def test_async_write(self, tmpdir, single_submit, overlap_events):
+ ref_file, ref_buffer = _do_ref_write(tmpdir)
+
+ h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+ gds_file, gds_buffer = _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, h)
+
+ _validate_handle_state(h, single_submit, overlap_events)
+
+ write_status = h.async_pwrite(gds_buffer, gds_file)
+ assert write_status == 0
+
+ wait_status = h.wait()
+ assert wait_status == 1
+
+ h.unpin_device_tensor(gds_buffer)
+
+ assert os.path.isfile(gds_file)
+
+ filecmp.clear_cache()
+ assert filecmp.cmp(ref_file, gds_file, shallow=False)
+
+
+@pytest.mark.sequential
+class TestAsyncQueue(DistributedTest):
+ world_size = 1
+ if not get_accelerator().is_available():
+ init_distributed = False
+ set_dist_env = False
+
+ @pytest.mark.parametrize("async_queue", [2, 3])
+ def test_read(self, tmpdir, async_queue):
+
+ ref_files = []
+ for i in range(async_queue):
+ f, _ = _do_ref_write(tmpdir, i)
+ ref_files.append(f)
+
+ single_submit = True
+ overlap_events = True
+ h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+ gds_buffers = [
+ torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue)
+ ]
+ for buf in gds_buffers:
+ h.pin_device_tensor(buf)
+
+ _validate_handle_state(h, single_submit, overlap_events)
+
+ for i in range(async_queue):
+ read_status = h.async_pread(gds_buffers[i], ref_files[i])
+ assert read_status == 0
+
+ wait_status = h.wait()
+ assert wait_status == async_queue
+
+ for i in range(async_queue):
+ with open(ref_files[i], 'rb') as f:
+ ref_buffer = list(f.read())
+ assert ref_buffer == gds_buffers[i].tolist()
+
+ for t in gds_buffers:
+ h.unpin_device_tensor(t)
+
+ @pytest.mark.parametrize("async_queue", [2, 3])
+ def test_write(self, tmpdir, async_queue):
+ ref_files = []
+ ref_buffers = []
+ for i in range(async_queue):
+ f, buf = _do_ref_write(tmpdir, i)
+ ref_files.append(f)
+ ref_buffers.append(buf)
+
+ single_submit = True
+ overlap_events = True
+ h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL)
+
+ gds_files = []
+ gds_buffers = []
+ for i in range(async_queue):
+ f, buf = _get_test_write_file_and_device_buffer(tmpdir, ref_buffers[i], h, i)
+ gds_files.append(f)
+ gds_buffers.append(buf)
+
+ _validate_handle_state(h, single_submit, overlap_events)
+
+ for i in range(async_queue):
+ write_status = h.async_pwrite(gds_buffers[i], gds_files[i])
+ assert write_status == 0
+
+ wait_status = h.wait()
+ assert wait_status == async_queue
+
+ for t in gds_buffers:
+ h.unpin_device_tensor(t)
+
+ for i in range(async_queue):
+ assert os.path.isfile(gds_files[i])
+
+ filecmp.clear_cache()
+ assert filecmp.cmp(ref_files[i], gds_files[i], shallow=False)
+
+
+@pytest.mark.parametrize("use_new_api", [True, False])
+class TestLockDeviceTensor(DistributedTest):
+ world_size = 2
+ reuse_dist_env = True
+ if not get_accelerator().is_available():
+ init_distributed = False
+ set_dist_env = False
+
+ def test_pin_device_tensor(self, use_new_api):
+
+ h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL)
+
+ unpinned_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name())
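+ # new_pinned_device_tensor allocates and pins a fresh device tensor in one call;
+ # pin_device_tensor pins an already-allocated tensor in place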
+ if use_new_api:
+ pinned_buffer = h.new_pinned_device_tensor(unpinned_buffer.numel(), unpinned_buffer)
+ else:
+ pinned_buffer = torch.empty_like(unpinned_buffer)
+ h.pin_device_tensor(pinned_buffer)
+
+ assert unpinned_buffer.device == pinned_buffer.device
+ assert unpinned_buffer.dtype == pinned_buffer.dtype
+ assert unpinned_buffer.numel() == pinned_buffer.numel()
+
+ if use_new_api:
+ h.free_pinned_device_tensor(pinned_buffer)
+ else:
+ h.unpin_device_tensor(pinned_buffer)
diff --git a/tests/unit/ops/fp_quantizer/test_fp8_gemm.py b/tests/unit/ops/fp_quantizer/test_fp8_gemm.py
new file mode 100644
index 0000000000000..d66f7c8cb4cc4
--- /dev/null
+++ b/tests/unit/ops/fp_quantizer/test_fp8_gemm.py
@@ -0,0 +1,45 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import torch
+import deepspeed
+
+from deepspeed.ops.op_builder import FPQuantizerBuilder
+
+if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]:
+ pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True)
+
+from deepspeed.ops.fp_quantizer import FP_Quantize, matmul_fp8
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
+@pytest.mark.parametrize("q_bits", [8], ids=[
+ "qbits8",
+])
+@pytest.mark.parametrize("M", [1, 2, 4, 8, 32, 64, 128, 256, 512, 1024, 2048])
+def test_fp_quant(dtype, q_bits, M):
+ quantization_group_size = 128
+ fpq = FP_Quantize(group_size=quantization_group_size)
+
+ N = 8192
+ H = 4096
+
+ x = torch.randn(M, H, dtype=dtype, device='cuda')
+ weight_bf16 = torch.randn(H, N, dtype=dtype, device='cuda')
+
+ weight, _ = fpq.quantize(weight_bf16.data, q_bits=8, return_meta_tensor=True)
+ scale = fpq.get_scales()
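+ # matmul_fp8 consumes the fp8-quantized weight together with its per-group scales;
+ # the result is compared against a reference that dequantizes the weight and uses torch.matmul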
+ out = matmul_fp8(
+ x,
+ weight,
+ scale,
+ quantization_group_size,
+ )
+
+ out_q = torch.matmul(x, fpq.dequantize(weight, scale=fpq.scale))
+
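+ # require the mean relative error over all output elements to stay below 0.4%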
+ error = ((out - out_q).abs() / (out.abs() + 1e-5)).sum() / out.numel()
+ assert error < 0.004, f"failed on batch-size {M} with error {error}"
diff --git a/tests/unit/runtime/pipe/test_pipe.py b/tests/unit/runtime/pipe/test_pipe.py
index 88e26290b6505..f198762c5fcc7 100644
--- a/tests/unit/runtime/pipe/test_pipe.py
+++ b/tests/unit/runtime/pipe/test_pipe.py
@@ -7,12 +7,15 @@
import torch.nn as nn
import pytest
+import torch
+
+import deepspeed
import deepspeed.comm as dist
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology
from deepspeed.runtime.pipe.module import PipelineModule
from unit.alexnet_model import AlexNetPipe, train_cifar
from unit.common import DistributedTest
-from unit.util import skip_on_arch
+from unit.util import skip_on_arch, no_child_process_in_deepspeed_io
PipeTopo = PipeDataParallelTopology
@@ -155,3 +158,95 @@ def test_pipe_use_reentrant(self, topo_config):
# the following check could passed on higher version docker: nvcr.io/nvidia/pytorch:23.07-py3(torch2.1.0 cuda12.1)
# Check if models have same weights after training
# self._check_model_params_equal(base_model, test_model)
+
+
+class DynamicShapeTestLayer(nn.Module):
+
+ def __init__(self, hidden_size):
+ super().__init__()
+ self.fc = nn.Linear(hidden_size, hidden_size)
+ self.shapes = set()
+
+ def forward(self, x):
+ self.shapes.add(x.shape)
+ y = self.fc(x)
+ return y
+
+
+class DynamicShapeTestModel(nn.Module):
+
+ def __init__(self, n_layers, hidden_size):
+ super().__init__()
+ self.layers = nn.ModuleList([DynamicShapeTestLayer(hidden_size) for _ in range(n_layers)])
+
+
+@pytest.mark.parametrize('topo_config', [
+ {
+ "num_pp": 1,
+ "num_dp": 4
+ },
+ {
+ "num_pp": 2,
+ "num_dp": 2
+ },
+ {
+ "num_pp": 4,
+ "num_dp": 1
+ },
+])
+class TestPipeDynamicShape(DistributedTest):
+ world_size = 4
+
+ def test_pipe_base(self, topo_config):
+ """This test checks if the pipeline engine can handle dynamic shapes correctly.
+ We pass inputs of different shapes to the pipeline engine.
+ """
+
+ n_iter = 10
+ n_layers = 4
+ n_samples = 1024
+ batch_size = 4
+ channel_dims = [8, 16, 32, 64]
+ hidden_size = 16
+
+ topo = PipeTopo(**topo_config)
+
+ model = DynamicShapeTestModel(n_layers, hidden_size)
+ model = PipelineModule(layers=model.layers, topology=topo, loss_fn=nn.MSELoss(), dynamic_shape=True)
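+ # dynamic_shape=True allows activation shapes to differ across micro-batches,
+ # which is exactly what this test feeds the engine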
+
+ # Each batch has a different channel dim, but all samples within a batch share the same channel dim
+ xs = [
+ torch.randn(channel_dims[(i // batch_size) % len(channel_dims)], hidden_size, dtype=torch.float32)
+ for i in range(n_samples)
+ ]
+ ys = [torch.randn_like(x) for x in xs]
+
+ class CustomDataset(torch.utils.data.Dataset):
+
+ def __init__(self, xs, ys):
+ self.xs = xs
+ self.ys = ys
+
+ def __len__(self):
+ return len(self.xs)
+
+ def __getitem__(self, idx):
+ return self.xs[idx], self.ys[idx]
+
+ dataset = CustomDataset(xs, ys)
+
+ config_dict["train_batch_size"] = batch_size
+
+ with no_child_process_in_deepspeed_io():
+ engine, _, _, _ = deepspeed.initialize(config=config_dict,
+ model=model,
+ model_parameters=[p for p in model.parameters()],
+ training_data=dataset)
+
+ for _ in range(n_iter):
+ _ = engine.train_batch()
+
+ # Check if all layers have seen different shapes
+ for layer in model.modules():
+ if isinstance(layer, DynamicShapeTestLayer):
+ assert len(layer.shapes) > 1
diff --git a/tests/unit/runtime/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py
index 9ff99f169f7a3..a30f81cedde9f 100644
--- a/tests/unit/runtime/test_ds_initialize.py
+++ b/tests/unit/runtime/test_ds_initialize.py
@@ -305,3 +305,132 @@ def _lr_scheduler_callable(optimizer) -> _LRScheduler:
assert ds_lr_scheduler == client_scheduler
else:
assert isinstance(ds_lr_scheduler, LambdaLR)
+
+
+@pytest.mark.parametrize("scheduler_type", [None, _LRScheduler, Callable])
+class TestClientLrSchedulerInit(DistributedTest):
+ world_size = 1
+
+ def test_same_lrscheduler_and_callable(self, scheduler_type):
+ """
+ Expected behavior:
+
+ If an lr scheduler is defined in code and passed to deepspeed.initialize as an arg,
+ it is used even when an lr scheduler is also defined in the config.
+
+ When no lr scheduler is passed in code, the lr scheduler is initialized from the config.
+ """
+
+ def _my_lambda(epoch):
+ return epoch // 10
+
+ def _lr_scheduler_callable(optimizer) -> _LRScheduler:
+ return LambdaLR(optimizer, _my_lambda)
+
+ config_dict = {'train_batch_size': 1}
+
+ hidden_dim = 10
+ model = SimpleModel(hidden_dim)
+
+ client_optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+
+ if scheduler_type is None:
+ config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}}
+ client_scheduler = None
+ elif scheduler_type == _LRScheduler:
+ client_scheduler = LambdaLR(client_optimizer, _my_lambda)
+ else:
+ client_scheduler = _lr_scheduler_callable
+
+ _, _, _, ds_lr_scheduler = deepspeed.initialize(config=config_dict,
+ model=model,
+ model_parameters=list(model.parameters()),
+ optimizer=client_optimizer,
+ lr_scheduler=client_scheduler)
+ if scheduler_type is None:
+ # in this case, we initialize from config
+ assert not isinstance(ds_lr_scheduler, LambdaLR)
+ assert isinstance(ds_lr_scheduler, WarmupLR)
+ else:
+ # in this case, we initialize from passed-in scheduler
+ assert isinstance(ds_lr_scheduler, LambdaLR)
+ assert not isinstance(ds_lr_scheduler, WarmupLR)
+
+ def test_diff_lrscheduler_and_callable(self, scheduler_type):
+ """
+ In this test, LambdaLR is used for the _LRScheduler type
+ and StepLR is used for the callable type.
+ """
+
+ from torch.optim.lr_scheduler import StepLR
+
+ def _my_lambda(epoch):
+ return epoch // 10
+
+ def _lr_scheduler_callable(optimizer) -> _LRScheduler:
+ return StepLR(optimizer, step_size=30)
+
+ config_dict = {'train_batch_size': 1}
+
+ hidden_dim = 10
+ model = SimpleModel(hidden_dim)
+
+ client_optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+
+ if scheduler_type is None:
+ config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}}
+ client_scheduler = None
+ elif scheduler_type == _LRScheduler:
+ client_scheduler = LambdaLR(client_optimizer, _my_lambda)
+ else:
+ client_scheduler = _lr_scheduler_callable
+
+ _, _, _, ds_lr_scheduler = deepspeed.initialize(config=config_dict,
+ model=model,
+ model_parameters=list(model.parameters()),
+ optimizer=client_optimizer,
+ lr_scheduler=client_scheduler)
+ if scheduler_type is None:
+ assert isinstance(ds_lr_scheduler, WarmupLR)
+ elif scheduler_type == _LRScheduler:
+ assert isinstance(ds_lr_scheduler, LambdaLR)
+ else:
+ # callable
+ assert isinstance(ds_lr_scheduler, StepLR)
+
+ def test_diff_lrscheduler_and_callable_onecyclelr_steplr(self, scheduler_type):
+
+ from deepspeed.runtime.lr_schedules import OneCycle, ONE_CYCLE, CYCLE_MIN_LR, CYCLE_MAX_LR
+ from torch.optim.lr_scheduler import OneCycleLR, StepLR
+
+ def _lr_scheduler_callable(optimizer) -> _LRScheduler:
+ return OneCycleLR(optimizer, max_lr=0.01, total_steps=200)
+
+ config_dict = {'train_batch_size': 1}
+
+ hidden_dim = 10
+ model = SimpleModel(hidden_dim)
+
+ client_optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+
+ if scheduler_type is None:
+ config_dict['scheduler'] = {'type': ONE_CYCLE, 'params': {CYCLE_MIN_LR: 0, CYCLE_MAX_LR: 0.1}}
+ client_scheduler = None
+ elif scheduler_type == _LRScheduler:
+ client_scheduler = StepLR(client_optimizer, step_size=30)
+ else:
+ client_scheduler = _lr_scheduler_callable
+
+ _, _, _, ds_lr_scheduler = deepspeed.initialize(config=config_dict,
+ model=model,
+ model_parameters=list(model.parameters()),
+ optimizer=client_optimizer,
+ lr_scheduler=client_scheduler)
+ if scheduler_type is None:
+ assert isinstance(ds_lr_scheduler, OneCycle)
+ elif scheduler_type == _LRScheduler:
+ assert isinstance(ds_lr_scheduler, StepLR)
+ else:
+ # callable
+ assert isinstance(ds_lr_scheduler, OneCycleLR)
diff --git a/tests/unit/runtime/zero/test_unwrap_model.py b/tests/unit/runtime/zero/test_unwrap_model.py
new file mode 100644
index 0000000000000..d75519b67f683
--- /dev/null
+++ b/tests/unit/runtime/zero/test_unwrap_model.py
@@ -0,0 +1,67 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import deepspeed
+from deepspeed.runtime.zero import unwrap_model_for_generation
+from deepspeed.accelerator import get_accelerator
+
+from unit.common import DistributedTest
+from unit.simple_model import SimpleModel
+
+config = {
+ "train_batch_size": 2,
+ "steps_per_print": 1,
+ "optimizer": {
+ "type": "Adam",
+ "params": {
+ "lr": 0.00015
+ }
+ },
+ "zero_optimization": {
+ "stage": 3,
+ "stage3_param_persistence_threshold": 1,
+ "offload_param": {
+ "device": "cpu",
+ "pin_memory": True
+ }
+ }
+}
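+# stage3_param_persistence_threshold=1 keeps parameters partitioned (0-sized locally)
+# unless explicitly gathered, which the assertions in TestUnwrapModel rely on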
+
+if get_accelerator().is_fp16_supported():
+ config["fp16"] = {"enabled": True, "loss_scale": 138.}
+elif get_accelerator().is_bf16_supported():
+ config["bf16"] = {"enabled": True}
+
+
+class TestUnwrapModel(DistributedTest):
+ # gather across more than 1 gpu
+ world_size = 2
+
+ def test(self):
+
+ def hooks_exist(engine):
+ if engine.optimizer is not None and hasattr(engine.optimizer, "parameter_offload"):
+ optimizer_offload = engine.optimizer.parameter_offload
+ elif engine.optimizer is not None:
+ optimizer_offload = engine.optimizer
+
+ return len(optimizer_offload.forward_hooks) > 0
+
+ model = SimpleModel(hidden_dim=100)
+ engine, _, _, _ = deepspeed.initialize(args=None, model=model, config=config)
+
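+ # unwrap_model_for_generation should drop the ZeRO-3 forward hooks and gather the full
+ # parameters for the duration of the context, then restore both on exit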
+ with unwrap_model_for_generation(engine):
+ # assert no hooks
+ assert not hooks_exist(engine)
+ # assert parameters gathered
+ assert model.linears[0].weight.numel() != 0, "GatheredParameters should give a non-0-sized tensor"
+
+ # assert hooks are restored after the context exits
+ assert hooks_exist(engine)
diff --git a/tests/unit/runtime/zero/test_zero_offloadpp.py b/tests/unit/runtime/zero/test_zero_offloadpp.py
index 5bfec399e19f1..8ae99e2237e20 100644
--- a/tests/unit/runtime/zero/test_zero_offloadpp.py
+++ b/tests/unit/runtime/zero/test_zero_offloadpp.py
@@ -43,6 +43,7 @@ def test(self, h_dim: int, n_layers: int) -> None:
config_dict = {
"train_batch_size": 256,
"steps_per_print": 1,
+ "gradient_clipping": 1.0,
"optimizer": {
"type": "Adam",
"params": {
diff --git a/tests/unit/util.py b/tests/unit/util.py
index feec326ede6c0..dba29ed27a4cb 100644
--- a/tests/unit/util.py
+++ b/tests/unit/util.py
@@ -5,6 +5,8 @@
import pytest
import torch
+
+import deepspeed
from deepspeed.accelerator import get_accelerator, is_current_accelerator_supported
from deepspeed.git_version_info import torch_info
@@ -67,3 +69,22 @@ def required_amp_check():
return False
else:
return True
+
+
+class no_child_process_in_deepspeed_io:
+
+ def __enter__(self):
+ # deepspeed_io defaults to creating a dataloader that uses a
+ # multiprocessing pool. Our tests use pools and we cannot nest pools in
+ # python. Therefore we're injecting this kwarg to ensure that no pools
+ # are used in the dataloader.
+ self.old_method = deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io
+
+ def new_method(*args, **kwargs):
+ kwargs["num_local_io_workers"] = 0
+ return self.old_method(*args, **kwargs)
+
+ deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io = new_method
+
+ def __exit__(self, *_):
+ deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io = self.old_method
diff --git a/version.txt b/version.txt
index 436d0ce0df76b..226468ee5b2ef 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.14.5
+0.14.6